DHEIVER committed on
Commit
5ae02e5
·
verified ·
1 Parent(s): 96fa0f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -73
app.py CHANGED
@@ -1,60 +1,46 @@
 
1
  import torch
2
  import torchaudio
3
  import scipy.io.wavfile
 
4
  from transformers import AutoProcessor, SeamlessM4Tv2Model
5
  from pathlib import Path
6
  from typing import Optional, Union
7
 
8
  class SeamlessTranslator:
9
- """
10
- A wrapper class for Facebook's SeamlessM4T translation model.
11
- Handles both text-to-speech and speech-to-speech translation.
12
- """
13
-
14
  def __init__(self, model_name: str = "facebook/seamless-m4t-v2-large"):
15
- """
16
- Initialize the translator with the specified model.
17
-
18
- Args:
19
- model_name (str): Name of the model to use
20
- """
21
  try:
22
  self.processor = AutoProcessor.from_pretrained(model_name)
23
  self.model = SeamlessM4Tv2Model.from_pretrained(model_name)
24
  self.sample_rate = self.model.config.sampling_rate
25
  except Exception as e:
26
  raise RuntimeError(f"Failed to initialize model: {str(e)}")
27
-
28
- def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> numpy.ndarray:
29
- """
30
- Translate text to speech in the target language.
31
 
32
- Args:
33
- text (str): Input text to translate
34
- src_lang (str): Source language code (e.g., 'eng')
35
- tgt_lang (str): Target language code (e.g., 'rus')
36
-
37
- Returns:
38
- numpy.ndarray: Audio waveform array
39
- """
 
 
 
 
 
 
 
 
 
40
  try:
41
  inputs = self.processor(text=text, src_lang=src_lang, return_tensors="pt")
42
  audio_array = self.model.generate(**inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
43
- return audio_array
44
  except Exception as e:
45
  raise RuntimeError(f"Text translation failed: {str(e)}")
46
 
47
- def translate_audio(self, audio_path: Union[str, Path], tgt_lang: str) -> numpy.ndarray:
48
- """
49
- Translate audio to speech in the target language.
50
-
51
- Args:
52
- audio_path (str or Path): Path to input audio file
53
- tgt_lang (str): Target language code (e.g., 'rus')
54
-
55
- Returns:
56
- numpy.ndarray: Audio waveform array
57
- """
58
  try:
59
  # Load and resample audio
60
  audio, orig_freq = torchaudio.load(audio_path)
@@ -67,50 +53,103 @@ class SeamlessTranslator:
67
  # Process and generate translation
68
  inputs = self.processor(audios=audio, return_tensors="pt")
69
  audio_array = self.model.generate(**inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
70
- return audio_array
71
  except Exception as e:
72
  raise RuntimeError(f"Audio translation failed: {str(e)}")
73
 
74
- def save_audio(self, audio_array: numpy.ndarray, output_path: Union[str, Path]) -> None:
75
- """
76
- Save an audio array to a WAV file.
77
-
78
- Args:
79
- audio_array (numpy.ndarray): Audio data to save
80
- output_path (str or Path): Path where to save the WAV file
81
- """
82
- try:
83
- scipy.io.wavfile.write(
84
- output_path,
85
- rate=self.sample_rate,
86
- data=audio_array
87
- )
88
- except Exception as e:
89
- raise RuntimeError(f"Failed to save audio: {str(e)}")
90
 
91
- def main():
92
- """Example usage of the SeamlessTranslator class."""
93
- try:
94
- # Initialize translator
95
- translator = SeamlessTranslator()
96
 
97
- # Example text translation
98
- text_audio = translator.translate_text(
99
- text="Hello, my dog is cute",
100
- src_lang="eng",
101
- tgt_lang="rus"
102
- )
103
- translator.save_audio(text_audio, "output_from_text.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- # Example audio translation
106
- audio_audio = translator.translate_audio(
107
- audio_path="input_audio.wav",
108
- tgt_lang="rus"
109
- )
110
- translator.save_audio(audio_audio, "output_from_audio.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- except Exception as e:
113
- print(f"Translation failed: {str(e)}")
114
 
115
  if __name__ == "__main__":
116
- main()
 
 
1
+ import gradio as gr
2
  import torch
3
  import torchaudio
4
  import scipy.io.wavfile
5
+ import numpy as np
6
  from transformers import AutoProcessor, SeamlessM4Tv2Model
7
  from pathlib import Path
8
  from typing import Optional, Union
9
 
10
  class SeamlessTranslator:
 
 
 
 
 
11
  def __init__(self, model_name: str = "facebook/seamless-m4t-v2-large"):
 
 
 
 
 
 
12
  try:
13
  self.processor = AutoProcessor.from_pretrained(model_name)
14
  self.model = SeamlessM4Tv2Model.from_pretrained(model_name)
15
  self.sample_rate = self.model.config.sampling_rate
16
  except Exception as e:
17
  raise RuntimeError(f"Failed to initialize model: {str(e)}")
 
 
 
 
18
 
19
+ # Available language pairs
20
+ self.language_codes = {
21
+ "English": "eng",
22
+ "Spanish": "spa",
23
+ "French": "fra",
24
+ "German": "deu",
25
+ "Italian": "ita",
26
+ "Portuguese": "por",
27
+ "Russian": "rus",
28
+ "Chinese": "cmn",
29
+ "Japanese": "jpn",
30
+ "Korean": "kor",
31
+ "Arabic": "ara",
32
+ "Hindi": "hin",
33
+ }
34
+
35
def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> tuple[int, np.ndarray]:
    """Translate text into speech in the target language.

    Args:
        text: Input text to translate.
        src_lang: Language code of ``text`` (e.g. ``"eng"``).
        tgt_lang: Language code for the generated speech (e.g. ``"spa"``).

    Returns:
        A ``(sample_rate, audio_array)`` pair, where ``sample_rate`` is
        ``self.sample_rate`` and ``audio_array`` is the first element of the
        model's ``generate`` output converted to a squeezed NumPy array.

    Raises:
        RuntimeError: If preprocessing or generation fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        inputs = self.processor(text=text, src_lang=src_lang, return_tensors="pt")
        generated = self.model.generate(**inputs, tgt_lang=tgt_lang)
        # First element of the output is the waveform tensor.
        audio_array = generated[0].cpu().numpy().squeeze()
        return self.sample_rate, audio_array
    except Exception as e:
        raise RuntimeError(f"Text translation failed: {e}") from e
42
 
43
+ def translate_audio(self, audio_path: str, tgt_lang: str) -> tuple[int, np.ndarray]:
 
 
 
 
 
 
 
 
 
 
44
  try:
45
  # Load and resample audio
46
  audio, orig_freq = torchaudio.load(audio_path)
 
53
  # Process and generate translation
54
  inputs = self.processor(audios=audio, return_tensors="pt")
55
  audio_array = self.model.generate(**inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
56
+ return self.sample_rate, audio_array
57
  except Exception as e:
58
  raise RuntimeError(f"Audio translation failed: {str(e)}")
59
 
60
class GradioInterface:
    """Gradio front end for ``SeamlessTranslator``.

    Builds a two-tab Blocks app (text-to-speech and speech-to-speech). The
    callbacks translate the language names shown in the dropdowns into the
    codes stored in ``translator.language_codes`` and delegate the actual
    work to the translator.
    """

    def __init__(self):
        """Create the translator and cache the language names for the dropdowns."""
        self.translator = SeamlessTranslator()
        self.languages = list(self.translator.language_codes.keys())

    def _language_code(self, language: str, role: str) -> str:
        """Return the code registered for ``language``.

        Args:
            language: Display name selected in a dropdown.
            role: ``"source"`` or ``"target"``; only used in the error message.

        Raises:
            ValueError: If ``language`` is missing or not in the table.
        """
        try:
            return self.translator.language_codes[language]
        except (KeyError, TypeError):
            # A cleared dropdown yields a value that is not a key in the table;
            # report it in words instead of surfacing a bare KeyError.
            raise ValueError(f"Please select a valid {role} language.") from None

    def text_to_speech(self, text: str, src_lang: str, tgt_lang: str) -> tuple[int, np.ndarray]:
        """Translate ``text`` and return ``(sample_rate, audio)`` for ``gr.Audio``.

        Args:
            text: Text to translate.
            src_lang: Display name of the source language.
            tgt_lang: Display name of the target language.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only, or a language
                name is not in the translator's table.
        """
        if not text or not text.strip():
            raise ValueError("Please enter some text to translate.")
        src_code = self._language_code(src_lang, "source")
        tgt_code = self._language_code(tgt_lang, "target")
        return self.translator.translate_text(text, src_code, tgt_code)

    def speech_to_speech(self, audio_path: str, tgt_lang: str) -> tuple[int, np.ndarray]:
        """Translate the audio file at ``audio_path`` into speech.

        Args:
            audio_path: Path to the input audio file.
            tgt_lang: Display name of the target language.

        Raises:
            ValueError: If no audio path was supplied or the language name is
                not in the translator's table.
        """
        # The button can be clicked before any audio has been provided.
        if not audio_path:
            raise ValueError("Please provide an audio file to translate.")
        tgt_code = self._language_code(tgt_lang, "target")
        return self.translator.translate_audio(audio_path, tgt_code)

    def launch(self, share: bool = True):
        """Build the Blocks layout, wire the callbacks and start the server.

        Args:
            share: Forwarded to ``demo.launch``. Defaults to ``True`` to keep
                the previous behaviour.
        """
        with gr.Blocks(title="SeamlessM4T Translator") as demo:
            gr.Markdown("# 🌐 SeamlessM4T Translator")
            gr.Markdown("Translate text or speech to different languages using Meta's SeamlessM4T model")

            with gr.Tabs():
                # Text-to-Speech tab
                with gr.TabItem("Text to Speech"):
                    with gr.Row():
                        with gr.Column():
                            text_input = gr.Textbox(
                                label="Input Text",
                                placeholder="Enter text to translate...",
                                lines=3,
                            )
                            src_lang = gr.Dropdown(
                                choices=self.languages,
                                value="English",
                                label="Source Language",
                            )
                            tgt_lang_text = gr.Dropdown(
                                choices=self.languages,
                                value="Spanish",
                                label="Target Language",
                            )
                            translate_btn = gr.Button("Translate", variant="primary")

                        with gr.Column():
                            audio_output = gr.Audio(
                                label="Translated Speech",
                                type="numpy",
                            )

                    translate_btn.click(
                        fn=self.text_to_speech,
                        inputs=[text_input, src_lang, tgt_lang_text],
                        outputs=audio_output,
                    )

                # Speech-to-Speech tab
                with gr.TabItem("Speech to Speech"):
                    with gr.Row():
                        with gr.Column():
                            audio_input = gr.Audio(
                                label="Input Speech",
                                type="filepath",
                            )
                            tgt_lang_speech = gr.Dropdown(
                                choices=self.languages,
                                value="Spanish",
                                label="Target Language",
                            )
                            translate_audio_btn = gr.Button("Translate", variant="primary")

                        with gr.Column():
                            audio_output_s2s = gr.Audio(
                                label="Translated Speech",
                                type="numpy",
                            )

                    translate_audio_btn.click(
                        fn=self.speech_to_speech,
                        inputs=[audio_input, tgt_lang_speech],
                        outputs=audio_output_s2s,
                    )

            gr.Markdown(
                "### Notes\n"
                "- Text-to-Speech: Enter text and select source/target languages\n"
                "- Speech-to-Speech: Upload an audio file and select target language\n"
                "- Processing may take a few moments depending on input length\n"
            )

        # show_error=True so callback exceptions (including the validation
        # messages above) are shown in the UI.
        demo.launch(share=share, show_error=True)
152
 
153
  if __name__ == "__main__":
154
+ interface = GradioInterface()
155
+ interface.launch()