camanalo1 committed on
Commit
35f8a26
·
verified ·
1 Parent(s): ec92ed3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import pipeline
4
+ from nemo.collections.asr.models import EncDecMultiTaskModel
5
+ from transformers import VitsTokenizer, VitsModel
6
+
7
# Speech-to-text: NVIDIA Canary, decoding with a beam width of 1.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Text generation: Phi-3 Mini-128K-Instruct wrapped in a transformers pipeline.
phi_3_model_id = "microsoft/Phi-3-mini-128k-instruct"
phi_3_pipeline = pipeline(
    task="text-generation",
    model=phi_3_model_id,
    # NOTE(review): this permits code shipped in the model repository to run
    # locally; keep it only while the checkpoint is trusted.
    trust_remote_code=True,
)

# Text-to-speech: VITS tokenizer and model from the same MMS English checkpoint.
vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
20
+
21
def transcribe_audio(audio):
    """Transcribe speech to text with the Canary ASR model.

    Args:
        audio: Path to an audio file, or a list of such paths. ``None`` or
            an empty value (for example, nothing was recorded) is accepted.

    Returns:
        The transcript of the first (or only) audio file as a string, or an
        empty string when there is no audio or no result.
    """
    if not audio:
        return ""

    # The model transcribes a batch of files. A single path is wrapped in a
    # list because some NeMo versions treat a bare string as a manifest path.
    audio_paths = [audio] if isinstance(audio, str) else list(audio)
    hypotheses = canary_model.transcribe(audio_paths, batch_size=16)
    if not hypotheses:
        return ""

    first = hypotheses[0]
    # Results may be plain strings or objects exposing a ``text`` attribute,
    # depending on the NeMo version; handle both.
    return str(getattr(first, "text", first))
24
+
25
def generate_response(prompt):
    """Generate text for ``prompt`` with the Phi-3 text-generation pipeline.

    Args:
        prompt: Input text handed to the pipeline.

    Returns:
        The ``generated_text`` field of the pipeline's first result.
    """
    candidates = phi_3_pipeline(prompt)
    first_candidate = candidates[0]
    return first_candidate['generated_text']
28
+
29
def synthesize_speech(text):
    """Convert text to speech with the VITS model.

    Args:
        text: The text to speak.

    Returns:
        A ``(sample_rate, samples)`` tuple, with ``samples`` as a NumPy
        array, which is a form Gradio's Audio output accepts. Returns
        ``None`` when ``text`` is empty or only whitespace.
    """
    # Nothing to speak: skip the model rather than feed it an empty sequence.
    if not text or not text.strip():
        return None

    inputs = vits_tokenizer(text=text, return_tensors="pt")
    with torch.no_grad():
        outputs = vits_model(**inputs)

    # First item of the batch; move to CPU before converting to NumPy.
    waveform = outputs.waveform[0]
    return vits_model.config.sampling_rate, waveform.cpu().numpy()
35
+
36
# Define Gradio interface
def run_voice_assistant(audio):
    """Run one turn: transcribe the audio, generate a reply, and voice it.

    Args:
        audio: Path to the recorded or uploaded audio file, or ``None`` when
            the user submitted nothing.

    Returns:
        A ``(transcript, reply, speech)`` tuple matching the three outputs of
        the interface. ``speech`` is ``(sample_rate, samples)`` or ``None``.
    """
    if audio is None:
        return "", "", None

    # The ASR step is given a list of paths; its result may come back as a
    # list of hypotheses or a single value, so reduce it to one string.
    transcript = transcribe_audio([audio])
    if isinstance(transcript, (list, tuple)):
        transcript = transcript[0] if transcript else ""
    transcript = str(getattr(transcript, "text", transcript))

    reply = generate_response(transcript)

    # Gradio's Audio output needs (sample_rate, numpy_array); convert if the
    # synthesis step handed back a raw tensor.
    speech = synthesize_speech(reply)
    if speech is not None and not isinstance(speech, tuple):
        speech = (vits_model.config.sampling_rate, speech.cpu().numpy())

    return transcript, reply, speech


demo = gr.Interface(
    fn=run_voice_assistant,
    inputs=gr.Audio(type="filepath", label="Speech Input"),
    outputs=[
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Generated Response"),
        gr.Audio(label="Synthesized Speech"),
    ],
)
demo.launch()