Smart_LLM

Running on Zero

File size: 7,937 Bytes

830eeaa
 
 
 
 
 
 
3738ef6
 
 
51a7d9e
d8a8bf1
51a7d9e
edb9e8a
51a7d9e
 
c00b625
51a7d9e
c44cbfe
3738ef6
 
 
b443e28
3738ef6
 
51a7d9e
 
 
3738ef6
 
 
 
 
 
 
51a7d9e
4b74382
 
1e18916
4b74382
 
 
1e18916
4b74382
 
 
 
 
 
 
 
 
 
 
 
 
51a7d9e
1e18916
 
3738ef6
d8a8bf1
 
 
 
1e18916
d8a8bf1
3738ef6
3bc2ef0
3738ef6
03e8281
3738ef6
bccdc56
d8a8bf1
3738ef6
659ca36
 
 
 
1e18916
 
 
 
 
 
 
 
85dc104
3738ef6
 
 
 
c44cbfe
 
3738ef6
 
 
 
 
 
 
 
 
 
51a7d9e
3738ef6
 
 
 
51a7d9e
3738ef6
99a7a45
1e18916
 
 
 
 
3738ef6
1e18916
 
 
 
 
 
3738ef6
edb9e8a
3738ef6
1e18916
 
 
 
 
 
 
3738ef6
030c23d
51a7d9e
3738ef6
1e18916
 
 
3738ef6
 
 
 
 
1e18916
3738ef6
1e18916
 
 
 
 
 
 
 
 
 
 
 
9eefdf9
3738ef6
1e18916
 
 
 
 
 
bc05e4d
1e18916
c44cbfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e18916
c44cbfe
1e18916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3738ef6
 
 
 
51a7d9e
 
 
 
5f09b8a
51a7d9e
 
 
 
3738ef6
c44cbfe
51a7d9e
1e18916
3738ef6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c44cbfe
3738ef6
51a7d9e
 
 
4f82bbf
1e18916
 
 
 
51a7d9e
 
 
3738ef6
51a7d9e
3738ef6

import subprocess
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL = "Daemontatox/AetherDrake"

TITLE = "<h1><center>Sphinx Reasoner</center></h1>"

PLACEHOLDER = """
<center>
<p>Ask me Anything !!</p>
</center>
"""

CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
    white-space: pre-wrap !important;
}
.message-wrap p {
    margin-bottom: 1em;
    white-space: pre-wrap !important;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""

device = "cuda"  # for GPU usage or "cpu" for CPU usage

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4")

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config)

# Ensure `pad_token_id` is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def format_text(text):
    """Helper function to format text with proper line breaks and spacing"""
    # Replace single newlines with double newlines for paragraph spacing
    formatted = text.replace('\n', '\n\n')
    # Remove extra spaces between paragraphs
    formatted = '\n'.join(line.strip() for line in formatted.split('\n'))
    return formatted

@spaces.GPU()
def stream_chat(
    message: str, 
    history: list,
    system_prompt: str,
    temperature: float = 1.0, 
    max_new_tokens: int = 8192, 
    top_p: float = 1.0, 
    top_k: int = 20, 
    penalty: float = 1.2,
):
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [
        {"role": "system", "content": system_prompt}
    ]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt}, 
            {"role": "assistant", "content": answer},
        ])

    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, 
        add_generation_prompt=True, 
        return_tensors="pt"
    ).to(model.device)
    
    streamer = TextIteratorStreamer(
        tokenizer, 
        timeout=60.0, 
        skip_prompt=True, 
        skip_special_tokens=True
    )
    
    generate_kwargs = dict(
        input_ids=input_ids, 
        max_new_tokens=max_new_tokens,
        do_sample=False if temperature == 0 else True,
        top_p=top_p,
        top_k=top_k,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        temperature=temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )

    buffer = ""
    current_line = ""
    
    with torch.no_grad():
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()
        
    for new_text in streamer:
        # Add the new text to both buffers
        buffer += new_text
        current_line += new_text
        
        # Check if we have complete lines to process
        if '\n' in current_line:
            lines = current_line.split('\n')
            # The last element might be incomplete, so keep it in current_line
            current_line = lines[-1]
            # Format the complete text
            formatted_buffer = format_text(buffer)
            yield formatted_buffer
        else:
            yield buffer

            
chatbot = gr.Chatbot(
    height=600,
    placeholder=PLACEHOLDER,
    bubble_full_width=False,
    show_copy_button=True
)

DEFAULT_SYSTEM_PROMPT = """You are an AI expert at providing high-quality answers. Your process involves these steps:
1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
Example:
<Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>
2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:
Accuracy: Is it factually correct and verifiable?
Clarity: Is it easy to understand and free of ambiguity?
Completeness: Does it fully address the user's request?
Improvement: What specific aspects could be better?
Example:
<Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>
3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
Example:
<Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>
4. Final Response: Present your revised answer clearly within <Final> tags.
Example:
<Final> This is the improved response. </Final>
5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
Example:
<Definition> This tag defines a new term introduced in the response. </Definition>
Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization."""

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button"
    )
    
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(
            label="⚙️ Parameters",
            open=False,
            render=False
        ),
        additional_inputs=[
            gr.Textbox(
                value=DEFAULT_SYSTEM_PROMPT,
                label="System Prompt",
                lines=5,
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=32000,
                step=1,
                value=8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=[
            ["What is meant by a Singularity?"],
            ["Explain the theory of Relativity"],
            ["Explain your thought process in details"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers?"],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()