import os
from typing import Generator, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Keep original template and descriptions
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by SimpleBerry, focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

LICENSE = """
--- MIT License ---
"""

# Prompt format expected by the model; {content} is replaced with the user message.
template = "-10{content}\n01"


class OptimizedLLMInterface:
    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
        context_size: int = 32768,
        num_threads: int = 8,
    ):
        """Initialize the optimized LLM interface."""
        self.model = Llama(
            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
            n_ctx=context_size,
            n_threads=num_threads,
            n_batch=512,  # Larger batch size for better CPU utilization
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Generate a response with optimized streaming."""
        input_text = template.format(content=message)
        input_tokens = self.model.tokenize(input_text.encode('utf-8'))

        output = ""
        pending = b""  # Byte buffer so multi-byte UTF-8 characters split across tokens decode cleanly
        for n, token in enumerate(self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=1.1,
        )):
            # Stop at end-of-sequence or once the requested token budget is spent
            if token == self.model.token_eos() or n >= max_tokens:
                break
            pending += self.model.detokenize([token])
            try:
                output += pending.decode('utf-8')
                pending = b""
            except UnicodeDecodeError:
                continue  # Partial character; wait for the next token
            yield output


def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface."""
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                ["Find the least odd prime factor of $2019^8+1$."],
            ],
            cache_examples=False,
            fill_height=True,
            # Wire the sliders to generate_response's extra arguments
            # (they were previously rendered but not connected to anything)
            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
            additional_inputs=[
                gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)"),
            ],
        )

        gr.Markdown(LICENSE)

    return demo


def main():
    # Initialize the optimized LLM interface
    llm = OptimizedLLMInterface(
        num_threads=os.cpu_count() or 8  # Automatically use available CPU cores
    )

    # Create and launch the demo
    demo = create_demo(llm)
    demo.queue(max_size=10)  # Limit queue size to prevent overload
    demo.launch()


if __name__ == "__main__":
    main()
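
# Quick smoke test without the web UI (a minimal sketch: the prompt and
# thread count below are placeholders, and the GGUF file is downloaded on
# first use):
#
#   llm = OptimizedLLMInterface(num_threads=4)
#   final = ""
#   for partial in llm.generate_response("What is 2 + 2?", max_tokens=64):
#       final = partial  # generate_response yields the accumulated text so far
#   print(final)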