camanalo1 committed on
Commit
0e6d273
·
verified ·
1 Parent(s): 15736b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -12,25 +12,25 @@ transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small
12
  generator = pipeline("text-generation", model="gpt2")
13
 
14
  # Initialize TTS tokenizer and model
15
- tokenizer_tts = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
16
- model_tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
17
-
18
- # Initialize ASR pipeline
19
- print("TTS Tokenizer:", tokenizer_tts) # Print the tokenizer for the TTS model
20
 
21
  def transcribe_and_generate_audio(audio):
 
 
 
22
 
23
  # Transcribe audio
24
- asr_output = transcriber(audio)["text"]
25
 
26
  # Generate text based on ASR output
27
- generated_text = generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
28
 
29
- # Generate audio from text using TTS model
30
- inputs = tokenizer_tts(text=generated_text, return_tensors="pt")
31
  set_seed(555)
32
  with torch.no_grad():
33
- outputs = model_tts(**inputs)
34
  waveform = outputs.waveform[0]
35
  waveform_path = "output.wav"
36
  sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
@@ -47,4 +47,4 @@ audio_input = gr.Interface(
47
  )
48
 
49
  # Launch the interface
50
- audio_input.launch()
 
12
  generator = pipeline("text-generation", model="gpt2")
13
 
14
  # Initialize TTS tokenizer and model
15
+ tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
16
+ model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 
 
 
17
 
18
  def transcribe_and_generate_audio(audio):
19
+ sr, y = audio
20
+ y = y.astype(np.float32)
21
+ y /= np.max(np.abs(y))
22
 
23
  # Transcribe audio
24
+ asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
25
 
26
  # Generate text based on ASR output
27
+ generated_text = generator(asr_output)[0]['generated_text']
28
 
29
+ # Generate audio from text
30
+ inputs = tokenizer(text=generated_text, return_tensors="pt")
31
  set_seed(555)
32
  with torch.no_grad():
33
+ outputs = model(**inputs)
34
  waveform = outputs.waveform[0]
35
  waveform_path = "output.wav"
36
  sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
 
47
  )
48
 
49
  # Launch the interface
50
+ audio_input.launch()