jwu323 commited on
Commit
4d7e82f
·
verified ·
1 Parent(s): 238ddf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -118
app.py CHANGED
@@ -1,135 +1,102 @@
1
- from typing import List, Tuple, Union
2
  import os
 
3
  import gradio as gr
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
 
7
- class LlamaAssistant:
8
- def __init__(self, model_config: dict):
9
- self.model = Llama(
10
- model_path=hf_hub_download(
11
- repo_id=model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
12
- filename=model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf"),
13
- )
14
- )
15
- self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
16
- self.generate_cfg = model_config.get("generate_cfg", {
17
- "max_tokens": 512,
18
- "temperature": 0.7,
19
- "top_p": 0.95,
20
- })
21
 
22
- def _format_prompt(self, message: str) -> str:
23
- return self.template.format(content=message)
24
 
25
- def generate(self, message: str, history: List[Tuple[str, str]] = None) -> str:
26
- input_text = self._format_prompt(message)
27
- inputs = self.model.tokenize(input_text.encode('utf-8'))
28
-
29
- response = ""
30
- for token in self.model.generate(
31
- inputs,
32
- top_p=self.generate_cfg["top_p"],
33
- temp=self.generate_cfg["temperature"]
34
- ):
35
- text = self.model.detokenize([token])
36
- response += text.decode('utf-8')
37
- yield response
38
 
39
- class WebUI:
40
- def __init__(self, assistant: LlamaAssistant, config: dict = None):
41
- self.assistant = assistant
42
- self.config = config or {}
43
-
44
- def create_interface(self):
45
- with gr.Blocks() as demo:
46
- gr.Markdown(self.config.get("description", """
47
- # LLaMA-O1-Supervised-1129 Demo
48
- An experimental research model focused on advancing AI reasoning capabilities.
49
-
50
- **To start a new chat**, click "clear" and start a new dialog.
51
- """))
52
 
53
- chatbot = gr.ChatInterface(
54
- self.assistant.generate,
55
- title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
56
- description=self.config.get("description", "Edit Settings below if needed."),
57
- examples=self.config.get("examples", [
58
- ["How many r's are in the word strawberry?"],
59
- ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
60
- ['Find the least odd prime factor of $2019^8+1$.'],
61
- ]),
62
- cache_examples=False,
63
- fill_height=True
64
- )
65
 
66
- with gr.Accordion("Adjust Parameters", open=False):
67
- gr.Slider(
68
- minimum=128,
69
- maximum=8192,
70
- value=self.assistant.generate_cfg["max_tokens"],
71
- step=1,
72
- label="Max Tokens"
73
- )
74
- gr.Slider(
75
- minimum=0.1,
76
- maximum=1.5,
77
- value=self.assistant.generate_cfg["temperature"],
78
- step=0.1,
79
- label="Temperature"
80
- )
81
- gr.Slider(
82
- minimum=0.05,
83
- maximum=1.0,
84
- value=self.assistant.generate_cfg["top_p"],
85
- step=0.01,
86
- label="Top-p (nucleus sampling)"
87
- )
88
 
89
- gr.Markdown(self.config.get("license", "--- MIT License ---"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- return demo
 
 
 
92
 
93
- def run(self, **kwargs):
94
- demo = self.create_interface()
95
- demo.launch(**kwargs)
 
 
 
 
 
 
 
 
 
96
 
97
- def app_gui():
98
- # Define model configuration
99
- model_config = {
100
- "repo_id": os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
101
- "model_file": os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
102
- "generate_cfg": {
103
- "max_tokens": 512,
104
- "temperature": float(os.environ.get("T", 0.7)),
105
- "top_p": float(os.environ.get("P", 0.95)),
106
- }
107
- }
108
 
109
- # UI configuration
110
- ui_config = {
111
- "title": "LLaMA-O1-Supervised-1129 | Demo",
112
- "description":
113
- '''
114
- # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
115
- SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
116
- Focused on advancing AI reasoning capabilities.
117
-
118
- ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
119
-
120
- **To start a new chat**, click "clear" and start a new dialog.
121
- ''',
122
- "examples": [
123
- ["How many r's are in the word strawberry?"],
124
- ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
125
- ['Find the least odd prime factor of $2019^8+1$.'],
126
- ],
127
- "license": "--- MIT License ---"
128
- }
129
 
130
- # Create and run the web interface
131
- assistant = LlamaAssistant(model_config)
132
- WebUI(assistant, ui_config).run()
 
 
 
 
 
 
 
133
 
134
- if __name__ == '__main__':
135
- app_gui()
 
 
1
  import os
2
+ from typing import Generator, Optional
3
  import gradio as gr
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
 
7
# Keep original template and descriptions

# Markdown rendered at the top of the Gradio page.
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
Focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

# Markdown rendered at the bottom of the page.
LICENSE = """
--- MIT License ---
"""

# Single-turn prompt template for the O1-style reasoning model.
# {content} is replaced with the raw user message; the surrounding
# <start_of_*>/<end_of_*> tags are the model's expected chat markup.
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 
 
 
 
 
 
 
 
 
 
23
 
24
class OptimizedLLMInterface:
    """CPU-optimized llama.cpp wrapper with streaming text generation."""

    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
        context_size: int = 32768,
        num_threads: int = 8,
    ):
        """Download the GGUF weights from the Hub and load them with llama.cpp.

        Args:
            model_repo_id: Hugging Face repo hosting the GGUF file.
            model_filename: Name of the GGUF file inside that repo.
            context_size: Context window (``n_ctx``) in tokens.
            num_threads: CPU threads used for inference.
        """
        self.model = Llama(
            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
            n_ctx=context_size,
            n_threads=num_threads,
            n_batch=512,  # larger batch size for better CPU utilization
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream the model's answer, yielding the accumulated text each step.

        Args:
            message: User message, inserted into the module-level ``template``.
            history: Chat history from Gradio (unused; the template is single-turn).
            max_tokens: Hard cap on the number of generated tokens.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling cutoff.

        Yields:
            The full response text generated so far.
        """
        input_text = template.format(content=message)
        input_tokens = self.model.tokenize(input_text.encode('utf-8'))

        # BUG FIX: decoding each token individually raises UnicodeDecodeError
        # when a multi-byte UTF-8 character spans two tokens. Accumulate raw
        # bytes and decode the whole buffer; errors='ignore' drops only a
        # trailing partial sequence until its remaining bytes arrive.
        buffer = b""
        eos_token = self.model.token_eos()
        produced = 0
        for token in self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=1.1
        ):
            if token == eos_token:
                # Stop cleanly at end-of-sequence instead of sampling past it.
                break
            buffer += self.model.detokenize([token])
            produced += 1
            yield buffer.decode('utf-8', errors='ignore')
            if produced >= max_tokens:
                # BUG FIX: max_tokens was previously accepted but ignored.
                break
62
 
63
def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface.

    Args:
        llm_interface: Loaded model wrapper whose ``generate_response``
            streams chat replies.

    Returns:
        The assembled ``gr.Blocks`` demo (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        # BUG FIX: these sliders were previously rendered in a standalone
        # accordion but never connected to anything, so moving them had no
        # effect. Passing them as additional_inputs wires their values into
        # generate_response; their order matches the (max_tokens,
        # temperature, top_p) parameters after (message, history).
        max_tokens_slider = gr.Slider(
            minimum=128, maximum=8192, value=512, step=1,
            label="Max Tokens", render=False,
        )
        temperature_slider = gr.Slider(
            minimum=0.1, maximum=1.5, value=0.7, step=0.1,
            label="Temperature", render=False,
        )
        top_p_slider = gr.Slider(
            minimum=0.05, maximum=1.0, value=0.95, step=0.01,
            label="Top-p (nucleus sampling)", render=False,
        )

        chatbot = gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=False,
            fill_height=True,
            additional_inputs=[max_tokens_slider, temperature_slider, top_p_slider],
            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
        )

        gr.Markdown(LICENSE)

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
def main():
    """Build the LLM backend and serve the Gradio demo."""
    # Use every available CPU core for llama.cpp inference; fall back to 8
    # when the core count cannot be determined.
    worker_threads = os.cpu_count() or 8
    llm = OptimizedLLMInterface(num_threads=worker_threads)

    demo = create_demo(llm)
    # Bound the request queue so the Space doesn't get overloaded.
    demo.queue(max_size=10)
    demo.launch()


if __name__ == "__main__":
    main()