LLaMA-O1-Supervised-1129-Demo

Running

Di Zhang committed on Dec 2, 2024

Commit

7666411

verified ·

1 Parent(s): ddd8f10

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,16 +1,19 @@
 import os
 import gradio as gr
-from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
-import spaces
-model = Llama(
-    model_path=hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
-    )
 )
 DESCRIPTION = '''
 # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
 SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -32,20 +35,22 @@ def llama_o1_template(data):
     text = template.format(content=data)
     return text
-@spaces.GPU
 def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
-    temp = ""
-    input_texts = [llama_o1_template(message)]
-    input_texts = [input_text.replace('<|end_of_text|>','') for input_text in input_texts]
-    #print(f"input_texts[0]: {input_texts[0]}")
-    inputs = model.tokenize(input_texts[0].encode('utf-8'))
-    for token in model.generate(inputs, top_p=top_p, temp=temperature):
-        #print(f"token: {token}")
-        text = model.detokenize([token])
-        #print(f"text detok: {text}")
-        temp += text.decode('utf-8')
-        yield temp
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
@@ -72,4 +77,3 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     demo.launch()

+import spaces
 import os
 import gradio as gr
+from transformers import LlamaForCausalLM, LlamaTokenizer
 from huggingface_hub import hf_hub_download
+# Load the model and tokenizer from Hugging Face
+model_path = snapshot_download(
+    repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF")
 )
+tokenizer = LlamaTokenizer.from_pretrained(model_path)
+model = LlamaForCausalLM.from_pretrained(model_path)
 DESCRIPTION = '''
 # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
 SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
     text = template.format(content=data)
     return text
 def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
+    input_text = llama_o1_template(message)
+    inputs = tokenizer(input_text, return_tensors="pt")
+    # Generate the text with the model
+    output = model.generate(
+        **inputs,
+        max_length=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    response = tokenizer.decode(output[0], skip_special_tokens=True)
+    yield response
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
 if __name__ == "__main__":
     demo.launch()