Update app.py
app.py CHANGED

@@ -1,6 +1,7 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import torch
+import threading
 
 # Load model and tokenizer
 model_name = "Spestly/AwA-1.5B"
@@ -18,23 +19,30 @@ def generate_response_stream(message, history):
         f"### Instruction:\n{message}\n\n### Response:"
     )
 
+    # Tokenize the input instruction
     inputs = tokenizer(instruction, return_tensors="pt")
 
-
-
-    generated_ids = model.generate(
-        **inputs,
-        max_new_tokens=1000,
-        num_return_sequences=1,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True,
-        streamer=None, # Replace this if the Transformers version supports streaming
-    )
+    # Create a TextIteratorStreamer for real-time output
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
 
-
-
-
+    # Generate tokens in a separate thread
+    generation_thread = threading.Thread(
+        target=model.generate,
+        kwargs={
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "max_new_tokens": 1000,
+            "temperature": 0.7,
+            "top_p": 0.9,
+            "do_sample": True,
+            "streamer": streamer,
+        }
+    )
+    generation_thread.start()
+
+    # Stream tokens as they are generated
+    for token in streamer:
+        yield token
 
 iface = gr.ChatInterface(
     fn=generate_response_stream,
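
A caveat on the streaming loop this commit adds: gr.ChatInterface treats each value yielded by its fn as the full reply so far, replacing what is currently displayed, so yielding raw chunks from the streamer will show only the newest fragment. The streamer is also created with skip_prompt left at its default of False, so the echoed prompt is streamed back along with the reply. Below is a minimal sketch of an accumulating variant, assuming the model and tokenizer loaded at the top of app.py; stream_reply is a hypothetical helper name, not part of this commit.

import threading

from transformers import TextIteratorStreamer

def stream_reply(instruction):
    # Sketch only: assumes `tokenizer` and `model` are the globals
    # loaded earlier in app.py from "Spestly/AwA-1.5B".
    inputs = tokenizer(instruction, return_tensors="pt")

    # skip_prompt=True keeps the echoed prompt out of the stream;
    # skip_special_tokens is forwarded to tokenizer.decode().
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks until completion, so run it in a background
    # thread and consume decoded chunks from the streamer here.
    thread = threading.Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": 1000, "streamer": streamer},
    )
    thread.start()

    # Yield the running text, not the delta: gr.ChatInterface replaces
    # the displayed message with each yielded value.
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
    thread.join()

Joining the thread after the streamer is exhausted simply ensures generation has fully finished before the generator returns; the loop itself already ends when the streamer signals end of generation.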