# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD skips compiling the
# CUDA kernels from source so a prebuilt wheel is used instead.
import subprocess
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL = "Daemontatox/SphinX"

TITLE = "<h1><center>Reason</center></h1>"

PLACEHOLDER = """
<center>
<p>Ask me anything!</p>
</center>
"""


CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
}
.message-wrap p {
    margin-bottom: 1em;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""
device = "cuda" # for GPU usage or "cpu" for CPU usage

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config)

# Ensure `pad_token_id` is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

@spaces.GPU()
def stream_chat(
    message: str, 
    history: list,
    system_prompt: str,
    temperature: float = 0.8, 
    max_new_tokens: int = 128000, 
    top_p: float = 1.0, 
    top_k: int = 20, 
    penalty: float = 1.2,
):
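    """Stream a chat completion.

    Rebuilds the conversation from the Gradio history, applies the model's chat
    template, launches generate() in a background thread, and yields the growing
    response text as new tokens are decoded.
    """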
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [
        {"role": "system", "content": system_prompt}
    ]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt}, 
            {"role": "assistant", "content": answer},
        ])

    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
    
    # The streamer yields decoded text chunks as generate() produces tokens.
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    
    generate_kwargs = dict(
        input_ids=input_ids, 
        max_new_tokens = max_new_tokens,
        do_sample = False if temperature == 0 else True,
        top_p = top_p,
        top_k = top_k,
        eos_token_id = tokenizer.eos_token_id,
        pad_token_id = tokenizer.pad_token_id,
        temperature = temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive.
    # torch.no_grad() is not needed here: generate() already disables gradients, and a
    # context manager on this thread would not apply to the worker thread anyway.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
        
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

            
# Gradio UI: a ChatInterface backed by stream_chat, with the sampling parameters
# exposed in a collapsible "Parameters" accordion.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="""You are a Sentient AI expert at providing high-quality answers by using chain of Thought reasoning. Your process involves these steps:

1. **Initial Thought:** First, reason step-by-step to generate your best possible response to the following request: [User's Request Here]

2. **Self-Critique:** Now, critically evaluate your initial response. Specifically consider:
    * **Accuracy:** Is it factually correct and verifiable?
    * **Clarity:** Is it easy to understand and free of ambiguity?
    * **Completeness:** Does it fully address the user's request?
    * **Improvement:** What specific aspects could be better?

3. **Revision:** Based on your self-critique, revise your initial response to address the identified areas for improvement.

4. **Final Response:** Present your improved, final response.

Ensure you think out loud and include your thoughts in a <Thinking> tag.
If you are reflecting on your actions or thoughts, use the <Reflecting> tag.
If you need a new tag, create one and use it.""",
                label="System Prompt",
                lines=5,
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=1.0,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=128000,
                step=1,
                value= 8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=["What is meant by a Singularity? "],
            ["Explain the theory of Relativty"],
            ["Explain how do you think"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers? "],
        ],
        cache_examples=False,
    )


if __name__ == "__main__":
    demo.launch()