import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the quantized GGUF build from the Hub (overridable via the REPO_ID
# and MODEL_FILE env vars) and load it with llama.cpp. n_ctx raises the
# library's 512-token default context window so it matches the UI's
# "Max Tokens" ceiling; lower it if memory is tight.
model = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF"),
        filename=os.environ.get("MODEL_FILE", "LLaMA-O1-Supervised-1129-q2_k.gguf"),
    ),
    n_ctx=8192,
)
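# Tip (optional, assumes compatible hardware): pass n_gpu_layers=-1 to Llama
# above to offload all layers to GPU when llama-cpp-python is built with CUDA
# or Metal support.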

DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference at no cost.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,  
focused on advancing AI reasoning capabilities.  

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, many thanks!

**To start a new chat**, click "Clear" and begin a new dialogue.
'''

LICENSE = """
--- MIT License ---
"""

# Prompt template built from the model's tree-of-thought control tokens; the
# {content} slot receives the user's problem statement, and the trailing
# <expansion> tag cues the model to produce its next reasoning step.
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

def llama_o1_template(data):
    """Wrap the raw user message in the model's expected prompt markup."""
    return template.format(content=data)
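
# For example, llama_o1_template("1+1=?") returns the full markup above with
# "1+1=?" in place of {content}.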

def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    """Stream the model's reply token by token for the chat interface."""
    prompt = llama_o1_template(message).replace('<|end_of_text|>', '')
    # special=True lets the template's control tokens map to their dedicated
    # ids instead of being split into ordinary text pieces.
    tokens = model.tokenize(prompt.encode('utf-8'), special=True)

    # Accumulate raw bytes so multi-byte UTF-8 characters split across tokens
    # decode cleanly; stop at end-of-sequence or the max_tokens cap (the
    # original loop ignored max_tokens entirely).
    buffer = b""
    for count, token in enumerate(model.generate(tokens, top_p=top_p, temp=temperature), start=1):
        if token == model.token_eos():
            break
        buffer += model.detokenize([token])
        yield buffer.decode('utf-8', errors='ignore')
        if count >= max_tokens:
            break
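
# Minimal smoke test without the UI (hypothetical prompt, run locally):
#   last = ""
#   for last in generate_text("How many r's are in strawberry?", history=[]):
#       pass
#   print(last)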

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    # Create the sliders unrendered and hand them to ChatInterface as
    # additional_inputs so their values actually reach generate_text; in the
    # original they were displayed but never connected to the function.
    max_tokens = gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens", render=False)
    temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.9, step=0.1, label="Temperature", render=False)
    top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)", render=False)

    chatbot = gr.ChatInterface(
        generate_text,
        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
        description="Edit settings below if needed.",
        additional_inputs=[max_tokens, temperature, top_p],
        additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
        examples=[
            ["How many r's are in the word strawberry?"],
            ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
            ['Find the least odd prime factor of $2019^8+1$.'],
        ],
        cache_examples=False,
        fill_height=True,
    )

    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.launch()
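
# --- Alternative implementation, kept for reference ---
# The commented-out block below loads the full-precision model with
# transformers + accelerate instead of the quantized GGUF build used above.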
# # import spaces

# import os
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import hf_hub_download, snapshot_download
# import accelerate

# accelerator = accelerate.Accelerator()

# # Load the model and tokenizer from Hugging Face
# model_path = snapshot_download(
#     repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
# )

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto')

# DESCRIPTION = '''
# # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference at no cost.
# SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,  
# focused on advancing AI reasoning capabilities.  

# ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, many thanks!

# **To start a new chat**, click "Clear" and begin a new dialogue.
# '''

# LICENSE = """
# --- MIT License ---
# """

# template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

# def llama_o1_template(data):
#     """Wrap the raw user message in the model's expected prompt markup."""
#     return template.format(content=data)

# def format_response(response):
#     """Prettify the model's control tokens for display."""
#     response = response.replace('<start_of_father_id>','')
#     response = response.replace('<end_of_father_id><start_of_local_id>','👉')
#     response = response.replace('<end_of_local_id><start_of_thought>',', ')
#     response = response.replace('<end_of_thought><start_of_rating>','')
#     response = response.replace('<end_of_rating>','')
#     response = response.replace('<positive_rating>','👍')
#     response = response.replace('<negative_rating>','👎')
#     return response  # the original built this string but never returned it

# # @spaces.GPU
# def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
#     input_text = llama_o1_template(message)
#     inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)

#     # Generate with the model; max_new_tokens counts only newly generated
#     # tokens, whereas max_length also counts the prompt and can cut the
#     # reply short.
#     output = model.generate(
#         **inputs,
#         max_new_tokens=max_tokens,
#         temperature=temperature,
#         top_p=top_p,
#         do_sample=True,
#     )

#     response = tokenizer.decode(output[0], skip_special_tokens=False)
#     yield format_response(response)  # prettify the control tokens for display

# with gr.Blocks() as demo:
#     gr.Markdown(DESCRIPTION)

#     chatbot = gr.ChatInterface(
#         generate_text,
#         title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
#         description="Edit Settings below if needed.",
#         examples=[
#             ["How many r's are in the word strawberry?"],
#             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
#             ['Find the least odd prime factor of $2019^8+1$.'],
#         ],
#         cache_examples=True,
#         fill_height=True,
#     )

#     with gr.Accordion("Adjust Parameters", open=False):
#         gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
#         gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
#         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")

#     gr.Markdown(LICENSE)

# if __name__ == "__main__":
#     demo.launch()