elizabetvaganova committed on
Commit
4dc6c4f
·
1 Parent(s): 1196030

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -1,13 +1,20 @@
1
  import gradio as gr
2
  import numpy as np
3
  import torch
 
4
  from datasets import load_dataset
5
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
 
7
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
8
 
9
- # Load a lightweight automatic speech recognition model (vosk)
10
- asr_pipe = pipeline("automatic-speech-recognition", model="alphacep/kaldi-ru", device=device)
 
 
 
 
 
 
11
 
12
  # Load a lightweight text-to-speech checkpoint and speaker embeddings
13
  processor = SpeechT5Processor.from_pretrained("ttskit/ttskit-tts-ljspeech")
@@ -19,8 +26,10 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validat
19
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
20
 
21
  def translate(audio):
22
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
23
- return outputs["text"]
 
 
24
 
25
  def synthesise(text):
26
  inputs = processor(text=text, return_tensors="pt")
@@ -60,4 +69,4 @@ file_translate = gr.Interface(
60
  with demo:
61
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
62
 
63
- demo.launch()
 
1
  import gradio as gr
2
  import numpy as np
3
  import torch
4
+ from vosk import KaldiRecognizer, Model
5
  from datasets import load_dataset
6
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
+ # Load Vosk automatic speech recognition model
11
+ vosk_model = Model("elizabetvaganova/speech-to-speech-translation-vaganova")
12
+
13
def recognize_speech(audio):
    """Transcribe an audio clip with the module-level Vosk model.

    Args:
        audio: Object whose ``data`` attribute is fed to the recognizer.
            NOTE(review): presumably raw 16 kHz mono PCM bytes, to match
            the sample rate passed to ``KaldiRecognizer`` -- confirm
            against the caller.

    Returns:
        The recognised text from the recognizer's final result, or an
        empty string when the result carries no ``"text"`` field.
    """
    import json  # local import: only needed to decode the Vosk result

    recognizer = KaldiRecognizer(vosk_model, 16000)
    recognizer.AcceptWaveform(audio.data)
    # FinalResult() returns a JSON *string* (e.g. '{"text": "..."}'), so it
    # must be decoded before the "text" field can be read; indexing the raw
    # string with a str key raises TypeError.
    result = json.loads(recognizer.FinalResult())
    return result.get("text", "")
18
 
19
  # Load a lightweight text-to-speech checkpoint and speaker embeddings
20
  processor = SpeechT5Processor.from_pretrained("ttskit/ttskit-tts-ljspeech")
 
26
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
27
 
28
  def translate(audio):
29
+ recognizer = KaldiRecognizer(vosk_model, 16000)
30
+ recognizer.AcceptWaveform(audio.data)
31
+ result = recognizer.FinalResult()
32
+ return result["text"]
33
 
34
  def synthesise(text):
35
  inputs = processor(text=text, return_tensors="pt")
 
69
  with demo:
70
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
71
 
72
+ demo.launch()