jwu323 committed on
Commit b377b1e · verified · 1 Parent(s): 09dbd6e

Update app.py

Files changed (1)
  1. app.py +115 -150

app.py CHANGED
@@ -1,161 +1,126 @@
 
 import os
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download

-model = Llama(
-    model_path=hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
-    )
-)
-
-DESCRIPTION = '''
-# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
-SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
-Focused on advancing AI reasoning capabilities.
-
-## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
-
-**To start a new chat**, click "clear" and start a new dialog.
-'''
-
-LICENSE = """
---- MIT License ---
-"""
-
-template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
-
-def llama_o1_template(data):
-    #query = data['query']
-    text = template.format(content=data)
-    return text
-
-def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
-    temp = ""
-    input_texts = [llama_o1_template(message)]
-    input_texts = [input_text.replace('<|end_of_text|>','') for input_text in input_texts]
-    #print(f"input_texts[0]: {input_texts[0]}")
-    inputs = model.tokenize(input_texts[0].encode('utf-8'))
-    for token in model.generate(inputs, top_p=top_p, temp=temperature):
-        #print(f"token: {token}")
-        text = model.detokenize([token])
-        #print(f"text detok: {text}")
-        temp += text.decode('utf-8')
-        yield temp
-
-with gr.Blocks() as demo:
-    gr.Markdown(DESCRIPTION)
-
-    chatbot = gr.ChatInterface(
-        generate_text,
-        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
-        description="Edit Settings below if needed.",
-        examples=[
             ["How many r's are in the word strawberry?"],
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False,
-        fill_height=True
-    )
-
-    with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens")
-        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
-
-    gr.Markdown(LICENSE)
-
-if __name__ == "__main__":
-    demo.launch()
-# # import spaces
-
-# import os
-# import gradio as gr
-# from transformers import AutoTokenizer, AutoModelForCausalLM
-# from huggingface_hub import hf_hub_download, snapshot_download
-# import accelerate
-
-# accelerator = accelerate.Accelerator()
-
-# # Load the model and tokenizer from Hugging Face
-# model_path = snapshot_download(
-#     repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
-# )
-
-# tokenizer = AutoTokenizer.from_pretrained(model_path)
-# model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
-
-# DESCRIPTION = '''
-# # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
-# SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
-# Focused on advancing AI reasoning capabilities.
-
-# ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
-
-# **To start a new chat**, click "clear" and start a new dialogue.
-# '''
-
-# LICENSE = """
-# --- MIT License ---
-# """
-
-# template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
-
-# def llama_o1_template(data):
-#     #query = data['query']
-#     text = template.format(content=data)
-#     return text
-
-# def format_response(response):
-#     response = response.replace('<start_of_father_id>','')
-#     response = response.replace('<end_of_father_id><start_of_local_id>','👉')
-#     response = response.replace('<end_of_local_id><start_of_thought>',', ')
-#     response = response.replace('<end_of_thought><start_of_rating>','')
-#     response = response.replace('<end_of_rating>','')
-#     response = response.replace('<positive_rating>','👍')
-#     response = response.replace('<negative_rating>','👎')
-
-# # @spaces.GPU
-# def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
-#     input_text = llama_o1_template(message)
-#     inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
-
-#     # Generate the text with the model
-#     output = model.generate(
-#         **inputs,
-#         max_length=max_tokens,
-#         temperature=temperature,
-#         top_p=top_p,
-#         do_sample=True,
-#     )
-
-#     response = tokenizer.decode(output[0], skip_special_tokens=False)
-#     yield response
-
-# with gr.Blocks() as demo:
-#     gr.Markdown(DESCRIPTION)
-
-#     chatbot = gr.ChatInterface(
-#         generate_text,
-#         title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
-#         description="Edit Settings below if needed.",
-#         examples=[
-#             ["How many r's are in the word strawberry?"],
-#             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
-#             ['Find the least odd prime factor of $2019^8+1$.'],
-#         ],
-#         cache_examples=True,
-#         fill_height=True,
-#     )
-
-#     with gr.Accordion("Adjust Parameters", open=False):
-#         gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
-#         gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-#         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")

-#     gr.Markdown(LICENSE)

-# if __name__ == "__main__":
-#     demo.launch()

+from typing import Iterator, List, Optional, Tuple
 import os
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download

+class LlamaAssistant:
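+    """Wraps the GGUF checkpoint behind prompt formatting and streaming generation."""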
+    def __init__(self, model_config: dict):
+        self.model = Llama(
+            model_path=hf_hub_download(
+                repo_id=model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
+                filename=model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf"),
+            )
+        )
+        self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
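+        # The template serializes one node of a reasoning tree: a root
+        # <problem> node (father_id -1, local_id 0) marked with a positive
+        # rating, followed by the opening tags of a child <expansion> node
+        # that the model is expected to complete.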
+        self.generate_cfg = model_config.get("generate_cfg", {
+            "max_tokens": 512,
+            "temperature": 0.7,
+            "top_p": 0.95,
+        })
+
+    def _format_prompt(self, message: str) -> str:
+        return self.template.format(content=message)
+
+    def generate(self, message: str, history: Optional[List[Tuple[str, str]]] = None) -> Iterator[str]:
+        input_text = self._format_prompt(message)
+        inputs = self.model.tokenize(input_text.encode('utf-8'))
+
+        response = b""
+        for i, token in enumerate(self.model.generate(
+            inputs,
+            top_p=self.generate_cfg["top_p"],
+            temp=self.generate_cfg["temperature"]
+        )):
+            # stop once the configured token budget is spent; llama_cpp
+            # otherwise keeps generating until end-of-sequence
+            if i >= self.generate_cfg["max_tokens"]:
+                break
+            response += self.model.detokenize([token])
+            # decode the accumulated bytes each step so a multi-byte UTF-8
+            # character split across tokens cannot raise mid-stream
+            yield response.decode('utf-8', errors='replace')
+
+class WebUI:
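+    """Builds and launches the Gradio chat interface around a LlamaAssistant."""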
+    def __init__(self, assistant: LlamaAssistant, config: Optional[dict] = None):
+        self.assistant = assistant
+        self.config = config or {}
+
+    def create_interface(self):
+        with gr.Blocks() as demo:
+            gr.Markdown(self.config.get("description", """
+# LLaMA-O1-Supervised-1129 Demo
+An experimental research model focused on advancing AI reasoning capabilities.
+
+**To start a new chat**, click "clear" and start a new dialog.
+"""))
+
+            chatbot = gr.ChatInterface(
+                self.assistant.generate,
+                title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
+                description=self.config.get("description", "Edit Settings below if needed."),
+                examples=self.config.get("examples", [
+                    ["How many r's are in the word strawberry?"],
+                    ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
+                    ['Find the least odd prime factor of $2019^8+1$.'],
+                ]),
+                cache_examples=False,
+                fill_height=True
+            )
+
+            with gr.Accordion("Adjust Parameters", open=False):
+                gr.Slider(
+                    minimum=128,
+                    maximum=8192,
+                    value=self.assistant.generate_cfg["max_tokens"],
+                    step=1,
+                    label="Max Tokens"
+                )
+                gr.Slider(
+                    minimum=0.1,
+                    maximum=1.5,
+                    value=self.assistant.generate_cfg["temperature"],
+                    step=0.1,
+                    label="Temperature"
+                )
+                gr.Slider(
+                    minimum=0.05,
+                    maximum=1.0,
+                    value=self.assistant.generate_cfg["top_p"],
+                    step=0.01,
+                    label="Top-p (nucleus sampling)"
+                )
+
+            gr.Markdown(self.config.get("license", "--- MIT License ---"))
+
+        return demo
+
+    def run(self, concurrency_limit: Optional[int] = None, **kwargs):
+        demo = self.create_interface()
+        # Blocks.launch() does not accept concurrency_limit; route the cap
+        # through the request queue (Gradio 4.x) instead, then launch
+        if concurrency_limit is not None:
+            demo.queue(default_concurrency_limit=concurrency_limit)
+        demo.launch(**kwargs)
+
+def app_gui():
+    # Define model configuration
+    model_config = {
+        "repo_id": os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
+        "model_file": os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
+        "generate_cfg": {
+            "max_tokens": 512,
+            "temperature": float(os.environ.get("T", 0.7)),
+            "top_p": float(os.environ.get("P", 0.95)),
+        }
+    }
+
+    # UI configuration
+    ui_config = {
+        "title": "LLaMA-O1-Supervised-1129 | Demo",
+        "description": "LLaMA-O1-Supervised-1129 is an experimental research model focused on advancing AI reasoning capabilities.",
+        "examples": [
             ["How many r's are in the word strawberry?"],
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
+        "license": "--- MIT License ---"
+    }

+    # Create and run the web interface
+    assistant = LlamaAssistant(model_config)
+    WebUI(assistant, ui_config).run(concurrency_limit=80)

+if __name__ == '__main__':
+    app_gui()
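
For reference, a minimal local-run sketch (assuming llama-cpp-python and gradio are installed and the new file is saved as app.py; REPO_ID, MODEL_FILE, T, and P are the environment variables app_gui() reads):

    import os

    os.environ["REPO_ID"] = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"  # GGUF repo to download from
    os.environ["MODEL_FILE"] = "llama-o1-supervised-1129-q4_k_m.gguf"    # quantized weights file
    os.environ["T"] = "0.7"   # sampling temperature
    os.environ["P"] = "0.95"  # nucleus-sampling top-p

    from app import app_gui
    app_gui()  # builds LlamaAssistant + WebUI and launches the demo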