Di Zhang committed on
Commit
7666411
·
verified ·
1 Parent(s): ddd8f10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -20
app.py CHANGED
@@ -1,16 +1,19 @@
 
 
 
1
  import os
2
  import gradio as gr
3
- from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
- import spaces
6
 
7
- model = Llama(
8
- model_path=hf_hub_download(
9
- repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
10
- filename=os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
11
- )
12
  )
13
 
 
 
 
14
  DESCRIPTION = '''
15
  # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
16
  SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -32,20 +35,22 @@ def llama_o1_template(data):
32
  text = template.format(content=data)
33
  return text
34
 
35
- @spaces.GPU
36
  def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
37
- temp = ""
38
- input_texts = [llama_o1_template(message)]
39
- input_texts = [input_text.replace('<|end_of_text|>','') for input_text in input_texts]
40
- #print(f"input_texts[0]: {input_texts[0]}")
41
- inputs = model.tokenize(input_texts[0].encode('utf-8'))
42
- for token in model.generate(inputs, top_p=top_p, temp=temperature):
43
- #print(f"token: {token}")
44
- text = model.detokenize([token])
45
- #print(f"text detok: {text}")
46
- temp += text.decode('utf-8')
47
- yield temp
 
48
 
 
 
49
 
50
  with gr.Blocks() as demo:
51
  gr.Markdown(DESCRIPTION)
@@ -72,4 +77,3 @@ with gr.Blocks() as demo:
72
 
73
  if __name__ == "__main__":
74
  demo.launch()
75
-
 
1
+
2
+ import spaces
3
+
4
  import os
5
  import gradio as gr
6
+ from transformers import LlamaForCausalLM, LlamaTokenizer
7
  from huggingface_hub import hf_hub_download
 
8
 
9
# Load the model and tokenizer from Hugging Face.
# `snapshot_download` is imported here because the file-level imports only
# bring in `hf_hub_download`; without this the call below raises NameError
# as soon as the module is loaded.
from huggingface_hub import snapshot_download

# REPO_ID may be overridden through the environment; the default is the
# repository this app was previously configured with.
model_path = snapshot_download(
    repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF")
)

# NOTE(review): the default repo name suggests it ships a GGUF file rather
# than transformers weights; `from_pretrained` on that snapshot may need a
# `gguf_file=` argument or a different REPO_ID - confirm before deploying.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path)
16
+
17
  DESCRIPTION = '''
18
  # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
19
  SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
 
35
  text = template.format(content=data)
36
  return text
37
 
 
38
  def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
39
+ input_text = llama_o1_template(message)
40
+ inputs = tokenizer(input_text, return_tensors="pt")
41
+
42
+ # Generate the text with the model
43
+ output = model.generate(
44
+ **inputs,
45
+ max_length=max_tokens,
46
+ temperature=temperature,
47
+ top_p=top_p,
48
+ do_sample=True,
49
+ pad_token_id=tokenizer.eos_token_id,
50
+ )
51
 
52
+ response = tokenizer.decode(output[0], skip_special_tokens=True)
53
+ yield response
54
 
55
  with gr.Blocks() as demo:
56
  gr.Markdown(DESCRIPTION)
 
77
 
78
  if __name__ == "__main__":
79
  demo.launch()