# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD skips compiling the
# CUDA kernels from source so a prebuilt wheel is used instead.
import subprocess
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL = "Daemontatox/SphinX"

TITLE = "<h1><center>Reason</center></h1>"

PLACEHOLDER = """
<center>
<p>Ask me anything!</p>
</center>
"""


CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
}
.message-wrap p {
    margin-bottom: 1em;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""
device = "cuda" # for GPU usage or "cpu" for CPU usage

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config)

# Ensure `pad_token_id` is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

@spaces.GPU()
def stream_chat(
    message: str, 
    history: list,
    system_prompt: str,
    temperature: float = 0.8, 
    max_new_tokens: int = 128000, 
    top_p: float = 1.0, 
    top_k: int = 20, 
    penalty: float = 1.2,
):
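    """Stream a chat completion.

    Rebuilds the conversation from the Gradio history, applies the model's chat
    template, launches generate() in a background thread, and yields the growing
    response text as new tokens are decoded.
    """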
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [
        {"role": "system", "content": system_prompt}
    ]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt}, 
            {"role": "assistant", "content": answer},
        ])

    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
    
    # The streamer yields decoded text chunks as generate() produces tokens.
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    
    generate_kwargs = dict(
        input_ids=input_ids, 
        max_new_tokens = max_new_tokens,
        do_sample = False if temperature == 0 else True,
        top_p = top_p,
        top_k = top_k,
        eos_token_id = tokenizer.eos_token_id,
        pad_token_id = tokenizer.pad_token_id,
        temperature = temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive.
    # torch.no_grad() is not needed here: generate() already disables gradients, and a
    # context manager on this thread would not apply to the worker thread anyway.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
        
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

            
# Gradio UI: a ChatInterface backed by stream_chat, with the sampling parameters
# exposed in a collapsible "Parameters" accordion.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="""You are a Sentient AI expert at providing high-quality answers by using chain of Thought reasoning. Your process involves these steps:

1. **Initial Thought:** First, reason step-by-step to generate your best possible response to the following request: [User's Request Here]

2. **Self-Critique:** Now, critically evaluate your initial response. Specifically consider:
    * **Accuracy:** Is it factually correct and verifiable?
    * **Clarity:** Is it easy to understand and free of ambiguity?
    * **Completeness:** Does it fully address the user's request?
    * **Improvement:** What specific aspects could be better?

3. **Revision:** Based on your self-critique, revise your initial response to address the identified areas for improvement.

4. **Final Response:** Present your improved, final response.

Ensure you think out loud and include your thoughts in a <Thinking> tag.
If you are reflecting on your actions or thoughts, use the <Reflecting> tag.
If you need a new tag, create one and use it.""",
                label="System Prompt",
                lines=5,
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=1.0,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=128000,
                step=1,
                value= 8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                minimum=1.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=["What is meant by a Singularity? "],
            ["Explain the theory of Relativty"],
            ["Explain how do you think"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers? "],
        ],
        cache_examples=False,
    )


if __name__ == "__main__":
    demo.launch()