elizabetvaganova committed
Commit 16bbc4e · 1 Parent(s): 5a46f1e

Update app.py

Files changed (1)
  1. app.py +14 -12
app.py CHANGED
@@ -3,48 +3,50 @@ import numpy as np
 import torch
 from datasets import load_dataset
 
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, pipeline
-from transformers import SpeechT5Processor
-
-token = "<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>"
-model_identifier = "tugstugi/mongolian-tts-ljspeech"
-
-processor = SpeechT5Processor.from_pretrained(model_identifier, revision="main", token=token)
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("tugstugi/mongolian-tts-ljspeech")
-
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
+# Using a more lightweight text-to-speech model
 model = SpeechT5ForTextToSpeech.from_pretrained("ttskit/ttskit-tts-ljspeech").to(device)
+
+# Using a more lightweight vocoder
 vocoder = SpeechT5HifiGan.from_pretrained("ljspeech/vocoder-cryptron").to(device)
 
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
+
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
 
+
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
+
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 
+
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses Facebook's [Wav2Vec 2.0](https://huggingface.co/facebook/wav2vec2-base-960h) model for speech recognition, and a lightweight text-to-speech model ([ttskit/ttskit-tts-ljspeech](https://huggingface.co/ttskit/ttskit-tts-ljspeech)) along with a lightweight vocoder ([ljspeech/vocoder-cryptron](https://huggingface.co/ljspeech/vocoder-cryptron)).
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
+[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
+![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 
 demo = gr.Blocks()
@@ -69,4 +71,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
 
-demo.launch()
+demo.launch()
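For reference, the cascade that app.py wires into Gradio can also be exercised as a standalone script. The sketch below is illustrative only: it assumes the checkpoints referenced in app.py (ttskit/ttskit-tts-ljspeech and ljspeech/vocoder-cryptron) actually load as SpeechT5-compatible weights, and the paths "sample.wav" (16 kHz source speech) and "translated.wav" are hypothetical.

```python
# Illustrative sketch of the cascaded STST pipeline set up in app.py, run outside Gradio.
# Paths "sample.wav" and "translated.wav" are hypothetical placeholders.
import numpy as np
import scipy.io.wavfile as wavfile
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Step 1: Whisper transcribes the source speech and translates it to English text.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
text = asr_pipe("sample.wav", max_new_tokens=256, generate_kwargs={"task": "translate"})["text"]

# Step 2: text-to-speech with the processor and checkpoints used in app.py
# (assumes the ttskit/ljspeech repos hold SpeechT5-compatible weights).
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("ttskit/ttskit-tts-ljspeech").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("ljspeech/vocoder-cryptron").to(device)

# Speaker identity comes from a single CMU ARCTIC x-vector, as in app.py.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

inputs = processor(text=text, return_tensors="pt")
speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)

# Scale the float waveform in [-1, 1] to 16-bit PCM and write a 16 kHz WAV file.
wavfile.write("translated.wav", 16000, (speech.cpu().numpy() * 32767).astype(np.int16))
```

The int16 scaling mirrors speech_to_speech_translation in app.py, which converts the model's float waveform into the (rate, samples) tuple that Gradio's Audio component expects.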