jwu323 commited on
Commit
4d7e82f
·
verified ·
1 Parent(s): 238ddf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -118
app.py CHANGED
@@ -1,135 +1,102 @@
1
- from typing import List, Tuple, Union
2
  import os
 
3
  import gradio as gr
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
 
7
- class LlamaAssistant:
8
- def __init__(self, model_config: dict):
9
- self.model = Llama(
10
- model_path=hf_hub_download(
11
- repo_id=model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
12
- filename=model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf"),
13
- )
14
- )
15
- self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
16
- self.generate_cfg = model_config.get("generate_cfg", {
17
- "max_tokens": 512,
18
- "temperature": 0.7,
19
- "top_p": 0.95,
20
- })
21
 
22
- def _format_prompt(self, message: str) -> str:
23
- return self.template.format(content=message)
24
 
25
- def generate(self, message: str, history: List[Tuple[str, str]] = None) -> str:
26
- input_text = self._format_prompt(message)
27
- inputs = self.model.tokenize(input_text.encode('utf-8'))
28
-
29
- response = ""
30
- for token in self.model.generate(
31
- inputs,
32
- top_p=self.generate_cfg["top_p"],
33
- temp=self.generate_cfg["temperature"]
34
- ):
35
- text = self.model.detokenize([token])
36
- response += text.decode('utf-8')
37
- yield response
38
 
39
- class WebUI:
40
- def __init__(self, assistant: LlamaAssistant, config: dict = None):
41
- self.assistant = assistant
42
- self.config = config or {}
43
-
44
- def create_interface(self):
45
- with gr.Blocks() as demo:
46
- gr.Markdown(self.config.get("description", """
47
- # LLaMA-O1-Supervised-1129 Demo
48
- An experimental research model focused on advancing AI reasoning capabilities.
49
-
50
- **To start a new chat**, click "clear" and start a new dialog.
51
- """))
52
 
53
- chatbot = gr.ChatInterface(
54
- self.assistant.generate,
55
- title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
56
- description=self.config.get("description", "Edit Settings below if needed."),
57
- examples=self.config.get("examples", [
58
- ["How many r's are in the word strawberry?"],
59
- ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
60
- ['Find the least odd prime factor of $2019^8+1$.'],
61
- ]),
62
- cache_examples=False,
63
- fill_height=True
64
- )
65
 
66
- with gr.Accordion("Adjust Parameters", open=False):
67
- gr.Slider(
68
- minimum=128,
69
- maximum=8192,
70
- value=self.assistant.generate_cfg["max_tokens"],
71
- step=1,
72
- label="Max Tokens"
73
- )
74
- gr.Slider(
75
- minimum=0.1,
76
- maximum=1.5,
77
- value=self.assistant.generate_cfg["temperature"],
78
- step=0.1,
79
- label="Temperature"
80
- )
81
- gr.Slider(
82
- minimum=0.05,
83
- maximum=1.0,
84
- value=self.assistant.generate_cfg["top_p"],
85
- step=0.01,
86
- label="Top-p (nucleus sampling)"
87
- )
88
 
89
- gr.Markdown(self.config.get("license", "--- MIT License ---"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- return demo
 
 
 
92
 
93
- def run(self, **kwargs):
94
- demo = self.create_interface()
95
- demo.launch(**kwargs)
 
 
 
 
 
 
 
 
 
96
 
97
- def app_gui():
98
- # Define model configuration
99
- model_config = {
100
- "repo_id": os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
101
- "model_file": os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
102
- "generate_cfg": {
103
- "max_tokens": 512,
104
- "temperature": float(os.environ.get("T", 0.7)),
105
- "top_p": float(os.environ.get("P", 0.95)),
106
- }
107
- }
108
 
109
- # UI configuration
110
- ui_config = {
111
- "title": "LLaMA-O1-Supervised-1129 | Demo",
112
- "description":
113
- '''
114
- # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
115
- SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
116
- Focused on advancing AI reasoning capabilities.
117
-
118
- ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
119
-
120
- **To start a new chat**, click "clear" and start a new dialog.
121
- ''',
122
- "examples": [
123
- ["How many r's are in the word strawberry?"],
124
- ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
125
- ['Find the least odd prime factor of $2019^8+1$.'],
126
- ],
127
- "license": "--- MIT License ---"
128
- }
129
 
130
- # Create and run the web interface
131
- assistant = LlamaAssistant(model_config)
132
- WebUI(assistant, ui_config).run()
 
 
 
 
 
 
 
133
 
134
- if __name__ == '__main__':
135
- app_gui()
 
 
1
  import os
2
+ from typing import Generator, Optional
3
  import gradio as gr
4
  from llama_cpp import Llama
5
  from huggingface_hub import hf_hub_download
6
 
7
# Keep original template and descriptions

# Markdown rendered at the top of the Gradio page.
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
Focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

# Markdown rendered at the bottom of the page.
LICENSE = """
--- MIT License ---
"""

# Single-turn prompt template for the O1-style reasoning model.
# {content} is replaced with the raw user message; the surrounding
# <start_of_*>/<end_of_*> tags are the model's expected chat markup.
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 
 
 
 
 
 
 
 
 
 
23
 
24
class OptimizedLLMInterface:
    """CPU-optimized llama.cpp wrapper with streaming text generation."""

    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
        context_size: int = 32768,
        num_threads: int = 8,
    ):
        """Download the GGUF weights from the Hub and load them with llama.cpp.

        Args:
            model_repo_id: Hugging Face repo hosting the GGUF file.
            model_filename: Name of the GGUF file inside that repo.
            context_size: Context window (``n_ctx``) in tokens.
            num_threads: CPU threads used for inference.
        """
        self.model = Llama(
            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
            n_ctx=context_size,
            n_threads=num_threads,
            n_batch=512,  # larger batch size for better CPU utilization
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream the model's answer, yielding the accumulated text each step.

        Args:
            message: User message, inserted into the module-level ``template``.
            history: Chat history from Gradio (unused; the template is single-turn).
            max_tokens: Hard cap on the number of generated tokens.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling cutoff.

        Yields:
            The full response text generated so far.
        """
        input_text = template.format(content=message)
        input_tokens = self.model.tokenize(input_text.encode('utf-8'))

        # BUG FIX: decoding each token individually raises UnicodeDecodeError
        # when a multi-byte UTF-8 character spans two tokens. Accumulate raw
        # bytes and decode the whole buffer; errors='ignore' drops only a
        # trailing partial sequence until its remaining bytes arrive.
        buffer = b""
        eos_token = self.model.token_eos()
        produced = 0
        for token in self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=1.1
        ):
            if token == eos_token:
                # Stop cleanly at end-of-sequence instead of sampling past it.
                break
            buffer += self.model.detokenize([token])
            produced += 1
            yield buffer.decode('utf-8', errors='ignore')
            if produced >= max_tokens:
                # BUG FIX: max_tokens was previously accepted but ignored.
                break
62
 
63
def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface.

    Args:
        llm_interface: Loaded model wrapper whose ``generate_response``
            streams chat replies.

    Returns:
        The assembled ``gr.Blocks`` demo (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        # BUG FIX: these sliders were previously rendered in a standalone
        # accordion but never connected to anything, so moving them had no
        # effect. Passing them as additional_inputs wires their values into
        # generate_response; their order matches the (max_tokens,
        # temperature, top_p) parameters after (message, history).
        max_tokens_slider = gr.Slider(
            minimum=128, maximum=8192, value=512, step=1,
            label="Max Tokens", render=False,
        )
        temperature_slider = gr.Slider(
            minimum=0.1, maximum=1.5, value=0.7, step=0.1,
            label="Temperature", render=False,
        )
        top_p_slider = gr.Slider(
            minimum=0.05, maximum=1.0, value=0.95, step=0.01,
            label="Top-p (nucleus sampling)", render=False,
        )

        chatbot = gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=False,
            fill_height=True,
            additional_inputs=[max_tokens_slider, temperature_slider, top_p_slider],
            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
        )

        gr.Markdown(LICENSE)

    return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
def main():
    """Build the LLM backend and serve the Gradio demo."""
    # Use every available CPU core for llama.cpp inference; fall back to 8
    # when the core count cannot be determined.
    worker_threads = os.cpu_count() or 8
    llm = OptimizedLLMInterface(num_threads=worker_threads)

    demo = create_demo(llm)
    # Bound the request queue so the Space doesn't get overloaded.
    demo.queue(max_size=10)
    demo.launch()


if __name__ == "__main__":
    main()