juliuserictuliao committed
Commit af6c335 · verified · 1 Parent(s): 94dfae8
Files changed (1)
  1. app.py +7 -18
app.py CHANGED
@@ -5,39 +5,28 @@ from datasets import load_dataset
 
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
-
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-# load speech translation checkpoint
+# Load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
-# load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-
-
+# Load text-to-speech checkpoint and speaker embeddings
+processor = SpeechT5Processor.from_pretrained("facebook/mms-tts-spa")
+model = SpeechT5ForTextToSpeech.from_pretrained("facebook/mms-tts-spa").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-
-model = SpeechT5ForTextToSpeech.from_pretrained("facebook/mms-tts-tgl")
-
-
-
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "tagalog"})
-    print(outputs["text"])
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
     return outputs["text"]
 
-
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings, vocoder=vocoder)
     return speech.cpu()
 
-
-
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
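One caveat with the new code: facebook/mms-tts-spa is a VITS checkpoint, not a SpeechT5 one, so SpeechT5Processor.from_pretrained and SpeechT5ForTextToSpeech.from_pretrained would be expected to fail on it, and VITS needs neither the x-vector speaker embeddings nor the external HiFi-GAN vocoder. A minimal sketch of the usual VITS loading path, assuming Spanish TTS output is the goal (the tts_tokenizer/tts_model names and this synthesise rewrite are illustrative, not from the commit):

import torch
from transformers import VitsModel, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# MMS-TTS checkpoints are VITS models; load them with the VITS classes.
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)

def synthesise(text):
    # VITS generates the waveform directly: no speaker embedding, no vocoder.
    inputs = tts_tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = tts_model(**inputs)
    return output.waveform[0].cpu()  # sampled at tts_model.config.sampling_rate (16 kHz)

If SpeechT5 with the CMU Arctic x-vectors is the intended stack instead, the checkpoint to keep is microsoft/speecht5_tts, as in the removed lines.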