import os
import subprocess

# Pre-install flash-attn on the Space. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
# skips the long CUDA compile; the existing environment is passed through so
# pip keeps PATH and other variables it needs.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL = "Daemontatox/AetherDrake"
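# MODEL is a Hugging Face Hub repo id; HF_TOKEN is only required if that
# repo is gated or private, and is passed to from_pretrained below.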

TITLE = "<h1><center>Sphinx Reasoner</center></h1>"

PLACEHOLDER = """
<center>
<p>Ask me anything!</p>
</center>
"""

CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
    white-space: pre-wrap !important;
}
.message-wrap p {
    margin-bottom: 1em;
    white-space: pre-wrap !important;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""

device = "cuda"  # for GPU usage or "cpu" for CPU usage

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4")

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config)

# Ensure `pad_token_id` is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def format_text(text):
    """Format streamed text with paragraph spacing for the chat display."""
    # Double every newline so single line breaks render as paragraph gaps
    formatted = text.replace('\n', '\n\n')
    # Strip stray leading/trailing whitespace from each line
    formatted = '\n'.join(line.strip() for line in formatted.split('\n'))
    return formatted
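# Example (hypothetical input): format_text("a\nb") returns "a\n\nb",
# i.e. a blank line is inserted between the two lines.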

@spaces.GPU()
def stream_chat(
    message: str, 
    history: list,
    system_prompt: str,
    temperature: float = 1.0, 
    max_new_tokens: int = 8192, 
    top_p: float = 1.0, 
    top_k: int = 20, 
    penalty: float = 1.2,
):
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [
        {"role": "system", "content": system_prompt}
    ]
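    # With the default tuple-format Chatbot, history arrives as a list of
    # [user, assistant] pairs, unpacked below.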
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt}, 
            {"role": "assistant", "content": answer},
        ])

    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, 
        add_generation_prompt=True, 
        return_tensors="pt"
    ).to(model.device)
    
    streamer = TextIteratorStreamer(
        tokenizer, 
        timeout=60.0, 
        skip_prompt=True, 
        skip_special_tokens=True
    )
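    # The streamer yields decoded text fragments as the background thread
    # generates; timeout=60.0 makes iteration raise rather than hang
    # indefinitely if generation stalls.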
    
    generate_kwargs = dict(
        input_ids=input_ids, 
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,  # greedy decoding when temperature is zero
        top_p=top_p,
        top_k=top_k,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        temperature=temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )

    buffer = ""
    current_line = ""
    
    with torch.no_grad():
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()
        
    for new_text in streamer:
        # Add the new text to both buffers
        buffer += new_text
        current_line += new_text
        
        # Check if we have complete lines to process
        if '\n' in current_line:
            lines = current_line.split('\n')
            # The last element might be incomplete, so keep it in current_line
            current_line = lines[-1]
            # Format the complete text
            formatted_buffer = format_text(buffer)
            yield formatted_buffer
        else:
            yield buffer

            
chatbot = gr.Chatbot(
    height=600,
    placeholder=PLACEHOLDER,
    bubble_full_width=False,
    show_copy_button=True
)

DEFAULT_SYSTEM_PROMPT = """You are an AI expert at providing high-quality answers. Your process involves these steps:
1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
Example:
<Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>
2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:
Accuracy: Is it factually correct and verifiable?
Clarity: Is it easy to understand and free of ambiguity?
Completeness: Does it fully address the user's request?
Improvement: What specific aspects could be better?
Example:
<Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>
3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
Example:
<Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>
4. Final Response: Present your revised answer clearly within <Final> tags.
Example:
<Final> This is the improved response. </Final>
5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
Example:
<Definition> This tag defines a new term introduced in the response. </Definition>
Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization."""
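
# The prompt above asks the model to wrap its reasoning in <Thinking>,
# <Critique>, <Revising>, and <Final> tags; the pre-wrap CSS keeps those
# sections readable in the chat window.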

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button"
    )
    
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(
            label="⚙️ Parameters",
            open=False,
            render=False
        ),
        additional_inputs=[
            gr.Textbox(
                value=DEFAULT_SYSTEM_PROMPT,
                label="System Prompt",
                lines=5,
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=32000,
                step=1,
                value=8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=[
            ["What is meant by a Singularity?"],
            ["Explain the theory of Relativity"],
            ["Explain your thought process in details"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers?"],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()