import os
from typing import Generator, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Keep original template and descriptions
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by SimpleBerry, focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

LICENSE = """
--- MIT License ---
"""

# Prompt format expected by the model; {content} is replaced with the user message.
template = "-10{content}\n01"


class OptimizedLLMInterface:
    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
        context_size: int = 32768,
        num_threads: int = 8,
    ):
        """Initialize the optimized LLM interface."""
        self.model = Llama(
            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
            n_ctx=context_size,
            n_threads=num_threads,
            n_batch=512,  # Larger batch size for better CPU utilization
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Generate a response with optimized streaming."""
        input_text = template.format(content=message)
        input_tokens = self.model.tokenize(input_text.encode('utf-8'))

        output = ""
        pending = b""  # Byte buffer so multi-byte UTF-8 characters split across tokens decode cleanly
        for n, token in enumerate(self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=1.1,
        )):
            # Stop at end-of-sequence or once the requested token budget is spent
            if token == self.model.token_eos() or n >= max_tokens:
                break
            pending += self.model.detokenize([token])
            try:
                output += pending.decode('utf-8')
                pending = b""
            except UnicodeDecodeError:
                continue  # Partial character; wait for the next token
            yield output


def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface."""
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                ["Find the least odd prime factor of $2019^8+1$."],
            ],
            cache_examples=False,
            fill_height=True,
            # Wire the sliders to generate_response's extra arguments
            # (they were previously rendered but not connected to anything)
            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
            additional_inputs=[
                gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)"),
            ],
        )

        gr.Markdown(LICENSE)

    return demo


def main():
    # Initialize the optimized LLM interface
    llm = OptimizedLLMInterface(
        num_threads=os.cpu_count() or 8  # Automatically use available CPU cores
    )

    # Create and launch the demo
    demo = create_demo(llm)
    demo.queue(max_size=10)  # Limit queue size to prevent overload
    demo.launch()


if __name__ == "__main__":
    main()
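
# Quick smoke test without the web UI (a minimal sketch: the prompt and
# thread count below are placeholders, and the GGUF file is downloaded on
# first use):
#
#   llm = OptimizedLLMInterface(num_threads=4)
#   final = ""
#   for partial in llm.generate_response("What is 2 + 2?", max_tokens=64):
#       final = partial  # generate_response yields the accumulated text so far
#   print(final)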