"""Sphinx Reasoner: Gradio chat demo streaming replies from a 4-bit quantized causal language model."""

import os
import subprocess
# Install flash-attn with pip at startup, before the model is loaded with
# attn_implementation="flash_attention_2".
# FLASH_ATTENTION_SKIP_CUDA_BUILD is handed to the pip process through its
# environment (presumably it makes the flash-attn build skip compiling the
# CUDA kernels -- confirm against the flash-attn setup script).
# `env=` replaces the child's entire environment, so the flag is merged into a
# copy of the current environment; passing only the one key would strip PATH,
# HOME, proxy settings, etc. from the pip process.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

# Value of the HF_TOKEN environment variable, or None when it is not set
# (presumably a Hugging Face access token -- confirm where it is consumed).
HF_TOKEN = os.getenv("HF_TOKEN")

# Repository id handed to the tokenizer and model `from_pretrained` calls.
MODEL = "Daemontatox/AetherDrake"

# HTML heading shown at the top of the page.
TITLE = "<h1><center>Sphinx Reasoner</center></h1>"

# HTML placeholder passed to the chatbot component.
PLACEHOLDER = """
<center>
<p>Ask me Anything !!</p>
</center>
"""

# Stylesheet passed to `gr.Blocks`: styles the duplicate button, centers h3
# headings, and formats paragraphs, <pre> blocks and inline code in messages.
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
}
.message-wrap p {
    margin-bottom: 1em;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""

# Device name: "cuda" selects the GPU, "cpu" would select the CPU.
device = "cuda"

# 4-bit NF4 quantization with double quantization; quantized matmuls are
# computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Generation is given an explicit pad token id, so fall back to the EOS token
# when the tokenizer does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Options for loading the model weights.
# NOTE(review): torch_dtype is float16 while the quantization compute dtype is
# bfloat16 -- confirm the mismatch is intentional.
_model_load_kwargs = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
)
model = AutoModelForCausalLM.from_pretrained(MODEL, **_model_load_kwargs)

def _build_conversation(message, history, system_prompt):
    """Return the chat-template message list for one request.

    The list starts with the system prompt, replays ``history`` and ends with
    the new user ``message``. Each history entry may be a ``(user, assistant)``
    pair or a dict carrying ``"role"`` and ``"content"`` keys.
    """
    conversation = [{"role": "system", "content": system_prompt}]
    for turn in history:
        if isinstance(turn, dict):
            # Message-style entry: keep only the keys the chat template needs.
            conversation.append(
                {"role": turn["role"], "content": turn["content"]}
            )
        else:
            prompt, answer = turn
            conversation.append({"role": "user", "content": prompt})
            conversation.append({"role": "assistant", "content": answer})
    conversation.append({"role": "user", "content": message})
    return conversation


@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 1.0,
    max_new_tokens: int = 8192,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    """Generate a reply to ``message`` and stream it incrementally.

    Args:
        message: The new user message.
        history: Earlier turns of the conversation, either as
            ``(user, assistant)`` pairs or as role/content dicts.
        system_prompt: Text used as the system message.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k sampling cutoff (used only when sampling).
        penalty: Repetition penalty passed to ``generate``.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = _build_conversation(message, history, system_prompt)
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    do_sample = temperature != 0
    # Values are cast to the types transformers validates (for example the
    # repetition penalty must be a float).
    # NOTE(review): UI sliders may hand back ints for whole-number values --
    # confirm; the casts are harmless either way.
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=float(penalty),
        streamer=streamer,
    )
    if do_sample:
        # Sampling-only knobs are forwarded only when sampling is enabled, so
        # a temperature of 0 is never passed to `generate`.
        generate_kwargs.update(
            temperature=float(temperature),
            top_p=float(top_p),
            top_k=int(top_k),
        )

    # Generation runs in a worker thread so this generator can consume the
    # streamer concurrently. No `torch.no_grad()` wrapper here: grad mode is
    # thread-local, so a context entered in this thread would not apply to the
    # worker thread.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    # Wait for the worker to finish before returning.
    thread.join()

            
# Default text for the "System Prompt" box; it is sent as the system message
# of every conversation unless the user edits it.
DEFAULT_SYSTEM_PROMPT = """You are an AI expert at providing high-quality answers. Your process involves these steps:

1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
Example:
<Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>


2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:

Accuracy: Is it factually correct and verifiable?

Clarity: Is it easy to understand and free of ambiguity?

Completeness: Does it fully address the user's request?

Improvement: What specific aspects could be better?
Example:
<Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>


3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
Example:
<Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>


4. Final Response: Present your revised answer clearly within <Final> tags.
Example:
<Final> This is the improved response. </Final>


5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
Example:
<Definition> This tag defines a new term introduced in the response. </Definition>


Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization.

"""

# Chat display component; it is handed to the ChatInterface below.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

# Page layout: heading, duplicate button, then the chat interface. The
# additional inputs are passed to `stream_chat` after (message, history), in
# this order: system prompt, temperature, max new tokens, top_p, top_k,
# repetition penalty.
with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value=DEFAULT_SYSTEM_PROMPT,
                label="System Prompt",
                lines=5,
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=32000,
                step=1,
                value=8192,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                # The repetition penalty must be strictly positive, so the
                # slider starts above zero instead of at 0.0.
                minimum=0.1,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        examples=[
            ["What is meant by a Singularity? "],
            ["Explain the theory of Relativity"],
            ["Explain your thought process"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers? "],
        ],
        cache_examples=False,
    )


# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()