VanYsa committed
Commit 3cad491 · Parent: e56ba3c

added meta llm

Files changed (1):
  1. app.py  (+59 -36)
app.py CHANGED
@@ -11,6 +11,14 @@ import time
 
 from nemo.collections.asr.models import ASRModel
 
+from transformers import GemmaTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+
+# Set an environment variable
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_SECONDS = 40 # wont try to transcribe if longer than this
 DESCRIPTION = '''
@@ -42,12 +50,13 @@ decoding_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decoding_cfg)
 
 ### LLM model
-pipeline = transformers.pipeline(
-    "text-generation",
-    model="meta-llama/Meta-Llama-3-8B-Instruct",
-    model_kwargs={"torch_dtype": torch.bfloat16},
-    device=device
-)
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+llama3_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto")  # to("cuda:0")
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
 
 def convert_audio(audio_filepath, tmpdir, utt_id):
     """
@@ -133,36 +142,50 @@ def bot(history,message):
         time.sleep(0.05)
         yield history
 
-def bot_response(message):
-    """
-    Generates a response from the LLM model.
-    max_new_tokens, temperature and top_p are set to 256, 0.6 and 0.9 respectively.
-    """
-    messages = [
-        {"role": "system", "content": "You are a helpful AI assistant."},
-        {"role": "user", "content": "What is an apple"},
-    ]
-
-    prompt = pipeline.tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    terminators = [
-        pipeline.tokenizer.eos_token_id,
-        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    ]
-
-    outputs = pipeline(
-        prompt,
-        max_new_tokens=256,
-        eos_token_id=terminators,
-        do_sample=True,
-        temperature=0.6,
-        top_p=0.9,
-    )
-    return outputs[0]["generated_text"][len(prompt):]
+def bot_response(message: str,
+                 history: list,
+                 temperature: float,
+                 max_new_tokens: int
+                 ) -> str:  # type: ignore
+    """
+    Generate a streaming response using the llama3-8b model.
+    Args:
+        message (str): The input message.
+        history (list): The conversation history used by ChatInterface.
+        temperature (float): The temperature for generating the response.
+        max_new_tokens (int): The maximum number of new tokens to generate.
+    Returns:
+        str: The generated response.
+    """
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": message})
+
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(llama3_model.device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators,
+    )
+    # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
+    if temperature == 0:
+        generate_kwargs['do_sample'] = False
+
+    t = Thread(target=llama3_model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        #print(outputs)
+        yield "".join(outputs)
 
 with gr.Blocks(
     title="MyAlexa",
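
Note on the gated checkpoint: the new code reads HF_TOKEN from the environment but never passes it on to from_pretrained; recent huggingface_hub versions typically pick the HF_TOKEN variable up implicitly, but passing it explicitly makes the dependency on the gated Meta-Llama-3-8B-Instruct repository visible. A minimal sketch under that assumption, also carrying over the bfloat16 dtype from the removed pipeline call (the explicit token kwarg and torch_dtype choice are assumptions about how one might adapt this commit, not part of it):

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

HF_TOKEN = os.environ.get("HF_TOKEN", None)
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Pass the token explicitly so the gated repo resolves even without an implicit login,
# and load bfloat16 weights to roughly halve memory versus the default float32.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
llama3_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)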
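
The commit adds the streaming bot_response generator but the diff does not show its call site inside the Gradio UI. A minimal sketch of how such a generator is typically wired into a gr.Blocks chat layout; the chatbot/msg component names, the respond helper, and the fixed temperature and max_new_tokens values are illustrative assumptions, not part of this commit:

import gradio as gr

with gr.Blocks(title="MyAlexa") as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")

    def respond(message, chat_history):
        # Append an empty assistant turn, then stream bot_response's
        # progressively longer outputs into it.
        chat_history = chat_history + [[message, ""]]
        for partial in bot_response(message, chat_history[:-1],
                                    temperature=0.6, max_new_tokens=256):
            chat_history[-1][1] = partial
            yield "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()

Because respond is a generator, Gradio pushes each yielded (textbox, chatbot) pair to the browser as bot_response produces new text, which is what makes the reply appear token by token.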