elizabetvaganova committed
Commit 4dc6c4f · 1 Parent(s): 1196030
Update app.py
app.py CHANGED
@@ -1,13 +1,20 @@
 import gradio as gr
 import numpy as np
 import torch
+from vosk import KaldiRecognizer, Model
 from datasets import load_dataset
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-# Load …
-…
+# Load Vosk automatic speech recognition model
+vosk_model = Model("elizabetvaganova/speech-to-speech-translation-vaganova")
+
+def recognize_speech(audio):
+    recognizer = KaldiRecognizer(vosk_model, 16000)
+    recognizer.AcceptWaveform(audio.data)
+    result = recognizer.FinalResult()
+    return result["text"]
 
 # Load a lightweight text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("ttskit/ttskit-tts-ljspeech")
@@ -19,8 +26,10 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 def translate(audio):
-    …
-    …
+    recognizer = KaldiRecognizer(vosk_model, 16000)
+    recognizer.AcceptWaveform(audio.data)
+    result = recognizer.FinalResult()
+    return result["text"]
 
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
@@ -60,4 +69,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
 
-demo.launch()
+demo.launch()
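As committed, recognize_speech (and the identical body of translate) will fail at runtime: vosk's FinalResult() returns a JSON string rather than a dict, vosk.Model expects a local model directory rather than a Hugging Face repo id, and a Gradio numpy audio input arrives as a (sample_rate, array) tuple with no .data attribute. A minimal corrected sketch, assuming a gr.Audio(type="numpy") input and a Vosk model unpacked to a local "model" directory (that path is hypothetical):

import json

import numpy as np
from vosk import KaldiRecognizer, Model

# Assumption: a Vosk model has been downloaded and unpacked into ./model;
# vosk.Model takes a local path, not a Hugging Face repo id.
vosk_model = Model("model")

def recognize_speech(audio):
    # Assumption: `audio` is the (sample_rate, ndarray) tuple passed by
    # a gr.Audio(type="numpy") component.
    sample_rate, data = audio
    # Vosk expects 16-bit mono PCM bytes.
    if data.ndim > 1:
        data = data[:, 0]
    if data.dtype != np.int16:
        data = (data * 32767).astype(np.int16)
    recognizer = KaldiRecognizer(vosk_model, sample_rate)
    recognizer.AcceptWaveform(data.tobytes())
    # FinalResult() returns a JSON string, so parse it before indexing.
    return json.loads(recognizer.FinalResult())["text"]

Note also that Vosk only transcribes: the committed translate() returns an untranslated transcript, so an actual translation step would have to be added separately for the app to live up to its name.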
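The synthesis half of the file is truncated in this diff, but given the SpeechT5 imports and the speaker embeddings loaded above, synthesise() would normally end in a generate_speech call. A sketch under that assumption; the model and vocoder checkpoints below are illustrative, since the diff only shows the processor checkpoint:

model = SpeechT5ForTextToSpeech.from_pretrained("ttskit/ttskit-tts-ljspeech").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    # generate_speech returns a 16 kHz waveform tensor.
    speech = model.generate_speech(
        inputs["input_ids"].to(device),
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return speech.cpu()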
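The interface wiring is likewise mostly outside the hunks shown. From the names in the last hunk (mic_translate, file_translate, demo), a plausible reconstruction using Gradio 3.x-style component arguments; the speech_to_speech_translation helper and all component parameters are assumptions, not shown in the diff:

def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Convert the float waveform to 16-bit PCM for Gradio's numpy audio output.
    audio_out = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, audio_out

mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="numpy"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
)

demo = gr.Blocks()
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

demo.launch()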