File size: 6,535 Bytes
fc46f2c c344902 ee950e1 fc46f2c c344902 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e c344902 ee950e1 c344902 43b7c77 ee950e1 c344902 4d7e82f c344902 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Which GGUF checkpoint to serve; both values can be overridden through the
# REPO_ID / MODEL_FILE environment variables.
_repo_id = os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF")
_model_file = os.environ.get("MODEL_FILE", "LLaMA-O1-Supervised-1129-q2_k.gguf")

# Fetch the file from the Hugging Face Hub and hand the result to llama.cpp
# as the model path.
_model_path = hf_hub_download(repo_id=_repo_id, filename=_model_file)
model = Llama(model_path=_model_path)
# Markdown text passed to gr.Markdown(DESCRIPTION) as the first element of the page.
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
Focused on advancing AI reasoning capabilities.
## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
**To start a new chat**, click "clear" and start a new dialog.
'''
# License notice passed to gr.Markdown(LICENSE) as the last element of the page.
LICENSE = """
--- MIT License ---
"""
# Prompt skeleton: a first node (father id -1, local id 0) holding the problem
# text with a positive rating, followed by the opening tags of a second node
# (father id 0, local id 1) that ends on an open <expansion> tag for the model
# to continue. ``{content}`` is the only placeholder.
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"


def llama_o1_template(data: str) -> str:
    """Wrap a problem statement in the LLaMA-O1 prompt template.

    Args:
        data: Text substituted for the ``{content}`` placeholder of
            ``template``. Braces inside ``data`` are inserted literally;
            only the template itself is interpreted as a format string.

    Returns:
        The complete prompt string, ending with ``<expansion>``.
    """
    return template.format(content=data)
def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    """Stream the model's answer to ``message`` as a progressively longer string.

    Args:
        message: The latest user message; it is wrapped with
            ``llama_o1_template`` to form the prompt.
        history: Earlier chat turns. Accepted because the chat UI passes it,
            but unused: the prompt is built from ``message`` alone.
        max_tokens: Upper bound on the number of tokens sampled for the reply.
        temperature: Sampling temperature, forwarded to ``model.generate`` as
            ``temp``.
        top_p: Nucleus-sampling threshold, forwarded to ``model.generate``.

    Yields:
        The text generated so far, once per sampled token.
    """
    # Imported locally so this function does not depend on file-level imports.
    import codecs
    from itertools import islice

    prompt = llama_o1_template(message).replace('<|end_of_text|>', '')
    prompt_tokens = model.tokenize(prompt.encode('utf-8'))

    # A single token may hold only part of a multi-byte UTF-8 character.
    # Decoding each token on its own would raise UnicodeDecodeError, so an
    # incremental decoder buffers incomplete sequences until they are whole.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')

    eos_token = model.token_eos()
    token_stream = model.generate(prompt_tokens, top_p=top_p, temp=temperature)

    reply = ""
    # islice enforces the max_tokens budget; without it the sampler never stops.
    for token in islice(token_stream, max(int(max_tokens), 0)):
        if token == eos_token:
            # The model signalled the end of its answer.
            break
        reply += decoder.decode(model.detokenize([token]))
        yield reply

    # Flush bytes still buffered by the decoder (e.g. a character cut off by
    # the token budget) so the final output reflects everything generated.
    tail = decoder.decode(b"", final=True)
    if tail:
        reply += tail
        yield reply
# Page layout: description, chat interface (with its sampling controls), license.
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    # The sliders are created unrendered and handed to ChatInterface through
    # ``additional_inputs``. Their values are then passed to generate_text
    # after (message, history), in this order: max_tokens, temperature, top_p.
    # Sliders that are merely placed on the page are never read by the chat
    # function.
    max_tokens_slider = gr.Slider(
        minimum=128, maximum=8192, value=512, step=1,
        label="Max Tokens", render=False,
    )
    temperature_slider = gr.Slider(
        minimum=0.1, maximum=1.5, value=0.7, step=0.1,
        label="Temperature", render=False,
    )
    top_p_slider = gr.Slider(
        minimum=0.05, maximum=1.0, value=0.95, step=0.01,
        label="Top-p (nucleus sampling)", render=False,
    )

    chatbot = gr.ChatInterface(
        generate_text,
        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
        description="Edit Settings below if needed.",
        examples=[
            ["How many r's are in the word strawberry?"],
            ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
            ['Find the least odd prime factor of $2019^8+1$.'],
        ],
        cache_examples=False,
        fill_height=True,
        additional_inputs=[max_tokens_slider, temperature_slider, top_p_slider],
        # render=False: this object only configures the accordion that
        # ChatInterface builds for the additional inputs.
        additional_inputs_accordion=gr.Accordion(
            "Adjust Parameters", open=False, render=False,
        ),
    )

    gr.Markdown(LICENSE)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
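# --- Commented-out variant of this app that loads the model with transformers instead of llama_cpp (not executed) ---
# # import spaces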
# import os
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import hf_hub_download, snapshot_download
# import accelerate
# accelerator = accelerate.Accelerator()
# # Load the model and tokenizer from Hugging Face
# model_path = snapshot_download(
# repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
# )
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
# DESCRIPTION = '''
# # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
# SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
# Focused on advancing AI reasoning capabilities.
# ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
# **To start a new chat**, click "clear" and start a new dialogue.
# '''
# LICENSE = """
# --- MIT License ---
# """
# template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
# def llama_o1_template(data):
# #query = data['query']
# text = template.format(content=data)
# return text
# def format_response(response):
# response = response.replace('<start_of_father_id>','')
# response = response.replace('<end_of_father_id><start_of_local_id>','👉')
# response = response.replace('<end_of_local_id><start_of_thought>',', ')
# response = response.replace('<end_of_thought><start_of_rating>','')
# response = response.replace('<end_of_rating>','')
# response = response.replace('<positive_rating>','👍')
# response = response.replace('<negative_rating>','👎')
# # @spaces.GPU
# def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
# input_text = llama_o1_template(message)
# inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
# # Generate the text with the model
# output = model.generate(
# **inputs,
# max_length=max_tokens,
# temperature=temperature,
# top_p=top_p,
# do_sample=True,
# )
# response = tokenizer.decode(output[0], skip_special_tokens=False)
# yield response
# with gr.Blocks() as demo:
# gr.Markdown(DESCRIPTION)
# chatbot = gr.ChatInterface(
# generate_text,
# title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
# description="Edit Settings below if needed.",
# examples=[
# ["How many r's are in the word strawberry?"],
# ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
# ['Find the least odd prime factor of $2019^8+1$.'],
# ],
# cache_examples=True,
# fill_height=True,
# )
# with gr.Accordion("Adjust Parameters", open=False):
# gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
# gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
# gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
# gr.Markdown(LICENSE)
# if __name__ == "__main__":
# demo.launch()