vineetsharma committed on
Commit
460cdbb
·
1 Parent(s): 0b0e130

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -6
app.py CHANGED
@@ -6,7 +6,7 @@ from datasets import load_dataset
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
 
8
  ## Imports for MMS
9
- # from transformers import VitsModel, VitsTokenizer
10
 
11
 
12
 
@@ -25,9 +25,16 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
25
  # For Dutch
26
 
27
  ##### speecht5 #####
28
- model_id = 'sanchit-gandhi/speecht5_tts_vox_nl'
29
- processor = SpeechT5Processor.from_pretrained(model_id)
30
- model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
 
 
 
 
 
 
 
31
 
32
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
33
 
@@ -35,6 +42,8 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(devic
35
 
36
 
37
 
 
 
38
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
39
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
40
 
@@ -48,11 +57,21 @@ def translate(audio):
48
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
49
  return outputs["text"]
50
 
 
 
 
 
 
 
51
 
52
  def synthesise(text):
53
- inputs = processor(text=text, return_tensors="pt")
54
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
 
 
 
55
  return speech.cpu()
 
56
 
57
 
58
  def speech_to_speech_translation(audio):
 
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
 
8
  ## Imports for MMS
9
+ from transformers import VitsModel, VitsTokenizer
10
 
11
 
12
 
 
25
  # For Dutch
26
 
27
  ##### speecht5 #####
28
+ # model_id = 'sanchit-gandhi/speecht5_tts_vox_nl'
29
+ # processor = SpeechT5Processor.from_pretrained(model_id)
30
+ # model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
31
+
32
+ # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
33
+
34
+
35
+ ##### mms #####
36
+ model = VitsModel.from_pretrained("Matthijs/mms-tts-nld")
37
+ tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-nld")
38
 
39
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
40
 
 
42
 
43
 
44
 
45
+
46
+
47
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
48
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
49
 
 
57
  outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
58
  return outputs["text"]
59
 
60
+ # Original
61
+ # def synthesise(text):
62
+ # inputs = processor(text=text, return_tensors="pt")
63
+ # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
64
+ # return speech.cpu()
65
+
66
 
67
  def synthesise(text):
68
+ inputs = tokenizer(text, return_tensors="pt")
69
+ with torch.no_grad():
70
+ outputs = model(inputs["input_ids"])
71
+ speech = outputs.audio[0]
72
+
73
  return speech.cpu()
74
+
75
 
76
 
77
  def speech_to_speech_translation(audio):