Di Zhang committed on
Commit
ee950e1
·
verified ·
1 Parent(s): bb848de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -39
app.py CHANGED
@@ -1,21 +1,15 @@
1
- # import spaces
2
-
3
  import os
4
  import gradio as gr
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
- from huggingface_hub import hf_hub_download, snapshot_download
7
- import accelerate
8
-
9
- accelerator = accelerate.Accelerator()
10
 
11
- # Load the model and tokenizer from Hugging Face
12
- model_path = snapshot_download(
13
- repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
 
 
14
  )
15
 
16
- tokenizer = AutoTokenizer.from_pretrained(model_path)
17
- model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
18
-
19
  DESCRIPTION = '''
20
  # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
21
  SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -23,7 +17,7 @@ Focused on advancing AI reasoning capabilities.
23
 
24
  ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
25
 
26
- **To start a new chat**, click "clear" and start a new dialogue.
27
  '''
28
 
29
  LICENSE = """
@@ -37,31 +31,19 @@ def llama_o1_template(data):
37
  text = template.format(content=data)
38
  return text
39
 
40
- def format_response(response):
41
- response = response.replace('<start_of_father_id>','')
42
- response = response.replace('<end_of_father_id><start_of_local_id>','πŸ‘‰')
43
- response = response.replace('<end_of_local_id><start_of_thought>',', ')
44
- response = response.replace('<end_of_thought><start_of_rating>','')
45
- response = response.replace('<end_of_rating>','')
46
- response = response.replace('<positive_rating>','πŸ‘')
47
- response = response.replace('<negative_rating>','πŸ‘Ž')
48
-
49
- # @spaces.GPU
50
  def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
51
- input_text = llama_o1_template(message)
52
- inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
53
-
54
- # Generate the text with the model
55
- output = model.generate(
56
- **inputs,
57
- max_length=max_tokens,
58
- temperature=temperature,
59
- top_p=top_p,
60
- do_sample=True,
61
- )
62
 
63
- response = tokenizer.decode(output[0], skip_special_tokens=False)
64
- yield response
65
 
66
  with gr.Blocks() as demo:
67
  gr.Markdown(DESCRIPTION)
@@ -75,8 +57,8 @@ with gr.Blocks() as demo:
75
  ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
76
  ['Find the least odd prime factor of $2019^8+1$.'],
77
  ],
78
- cache_examples=True,
79
- fill_height=True,
80
  )
81
 
82
  with gr.Accordion("Adjust Parameters", open=False):
@@ -88,3 +70,93 @@ with gr.Blocks() as demo:
88
 
89
  if __name__ == "__main__":
90
  demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the GGUF weights once at startup. REPO_ID / MODEL_FILE environment
# variables let a duplicated Space point at different weights without code
# changes; hf_hub_download returns a local cache path.
_gguf_path = hf_hub_download(
    repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
    filename=os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
)
model = Llama(model_path=_gguf_path)
12
 
 
 
 
13
  DESCRIPTION = '''
14
  # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
15
  SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
 
17
 
18
  ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
19
 
20
+ **To start a new chat**, click "clear" and start a new dialog.
21
  '''
22
 
23
  LICENSE = """
 
31
  text = template.format(content=data)
32
  return text
33
 
 
 
 
 
 
 
 
 
 
 
34
def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    """Stream a completion for *message* from the llama.cpp model.

    Yields the accumulated response text after each generated token so
    Gradio's ChatInterface can render the reply incrementally.

    Args:
        message: The user's prompt; wrapped by ``llama_o1_template``.
        history: Chat history supplied by Gradio (not fed to the model here).
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature passed to ``Llama.generate``.
        top_p: Nucleus-sampling threshold passed to ``Llama.generate``.
    """
    prompt = llama_o1_template(message).replace('<|end_of_text|>', '')
    prompt_tokens = model.tokenize(prompt.encode('utf-8'))

    out_bytes = b""
    produced = 0
    for token in model.generate(prompt_tokens, top_p=top_p, temp=temperature):
        # Fix: the original loop ignored max_tokens and never checked for the
        # end-of-sequence token, so generation only stopped when the client
        # disconnected.
        if token == model.token_eos():
            break
        out_bytes += model.detokenize([token])
        produced += 1
        # Fix: decode the cumulative byte buffer instead of each token
        # individually — a multi-byte UTF-8 character split across token
        # boundaries would make a per-token strict decode raise. 'ignore'
        # hides at most a trailing incomplete sequence in intermediate yields.
        yield out_bytes.decode('utf-8', errors='ignore')
        if produced >= max_tokens:
            break
46
 
 
 
47
 
48
  with gr.Blocks() as demo:
49
  gr.Markdown(DESCRIPTION)
 
57
  ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
58
  ['Find the least odd prime factor of $2019^8+1$.'],
59
  ],
60
+ cache_examples=False,
61
+ fill_height=True
62
  )
63
 
64
  with gr.Accordion("Adjust Parameters", open=False):
 
70
 
71
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
73
+ # # import spaces
74
+
75
+ # import os
76
+ # import gradio as gr
77
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
78
+ # from huggingface_hub import hf_hub_download, snapshot_download
79
+ # import accelerate
80
+
81
+ # accelerator = accelerate.Accelerator()
82
+
83
+ # # Load the model and tokenizer from Hugging Face
84
+ # model_path = snapshot_download(
85
+ # repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
86
+ # )
87
+
88
+ # tokenizer = AutoTokenizer.from_pretrained(model_path)
89
+ # model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
90
+
91
+ # DESCRIPTION = '''
92
+ # # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
93
+ # SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
94
+ # Focused on advancing AI reasoning capabilities.
95
+
96
+ # ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
97
+
98
+ # **To start a new chat**, click "clear" and start a new dialogue.
99
+ # '''
100
+
101
+ # LICENSE = """
102
+ # --- MIT License ---
103
+ # """
104
+
105
+ # template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
106
+
107
+ # def llama_o1_template(data):
108
+ # #query = data['query']
109
+ # text = template.format(content=data)
110
+ # return text
111
+
112
+ # def format_response(response):
113
+ # response = response.replace('<start_of_father_id>','')
114
+ # response = response.replace('<end_of_father_id><start_of_local_id>','πŸ‘‰')
115
+ # response = response.replace('<end_of_local_id><start_of_thought>',', ')
116
+ # response = response.replace('<end_of_thought><start_of_rating>','')
117
+ # response = response.replace('<end_of_rating>','')
118
+ # response = response.replace('<positive_rating>','πŸ‘')
119
+ # response = response.replace('<negative_rating>','πŸ‘Ž')
120
+
121
+ # # @spaces.GPU
122
+ # def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
123
+ # input_text = llama_o1_template(message)
124
+ # inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
125
+
126
+ # # Generate the text with the model
127
+ # output = model.generate(
128
+ # **inputs,
129
+ # max_length=max_tokens,
130
+ # temperature=temperature,
131
+ # top_p=top_p,
132
+ # do_sample=True,
133
+ # )
134
+
135
+ # response = tokenizer.decode(output[0], skip_special_tokens=False)
136
+ # yield response
137
+
138
+ # with gr.Blocks() as demo:
139
+ # gr.Markdown(DESCRIPTION)
140
+
141
+ # chatbot = gr.ChatInterface(
142
+ # generate_text,
143
+ # title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
144
+ # description="Edit Settings below if needed.",
145
+ # examples=[
146
+ # ["How many r's are in the word strawberry?"],
147
+ # ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
148
+ # ['Find the least odd prime factor of $2019^8+1$.'],
149
+ # ],
150
+ # cache_examples=True,
151
+ # fill_height=True,
152
+ # )
153
+
154
+ # with gr.Accordion("Adjust Parameters", open=False):
155
+ # gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
156
+ # gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
157
+ # gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
158
+
159
+ # gr.Markdown(LICENSE)
160
+
161
+ # if __name__ == "__main__":
162
+ # demo.launch()