frogcho123 committed
Commit 17c527a · 1 Parent(s): 074508e

Update app.py

Files changed (1)
  1. app.py +32 -33
app.py CHANGED
@@ -4,56 +4,55 @@ import whisper
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from gtts import gTTS

-def translate_voice(file_obj, target_lang):
-    # Save the temporary file to disk
-    temp_file_path = "temp_audio_file.wav"
-    with open(temp_file_path, "wb") as out_file:
-        out_file.write(file_obj.read())
+def translate_voice(file, target_lang):
+    try:
+        # Load the model and switch to float32
+        model = whisper.load_model("base").float()

-    # Load the model and switch to float32
-    model = whisper.load_model("base").float()
+        # Load the audio
+        audio = whisper.load_audio(file.name)

-    # Load the audio
-    audio = whisper.load_audio(temp_file_path)
+        # Pad or trim the audio
+        audio = whisper.pad_or_trim(audio)

-    # Pad or trim the audio
-    audio = whisper.pad_or_trim(audio)
+        # Convert the audio to a log Mel spectrogram and move it to the same device as the model (CPU in your case)
+        mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # convert to full-precision float32

-    # Convert the audio to a log Mel spectrogram and move it to the same device as the model (CPU in your case)
-    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()  # convert to full-precision float32
+        # Proceed with your language detection and decoding
+        _, probs = model.detect_language(mel)
+        options = whisper.DecodingOptions()
+        result = whisper.decode(model, mel, options)

-    # Proceed with your language detection and decoding
-    _, probs = model.detect_language(mel)
-    options = whisper.DecodingOptions()
-    result = whisper.decode(model, mel, options)
+        text = result.text
+        lang = max(probs, key=probs.get)

-    text = result.text
-    lang = max(probs, key=probs.get)
+        # Translate
+        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
+        model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")

-    # Translate
-    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
-    model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
+        tokenizer.src_lang = target_lang
+        encoded_bg = tokenizer(text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded_bg)
+        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

-    tokenizer.src_lang = target_lang
-    encoded_bg = tokenizer(text, return_tensors="pt")
-    generated_tokens = model.generate(**encoded_bg)
-    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        # Text-to-audio (TTS)
+        tts = gTTS(text=translated_text, lang=target_lang)
+        filename = "to_speech.mp3"
+        tts.save(filename)

-    # Text-to-audio (TTS)
-    tts = gTTS(text=translated_text, lang=target_lang)
-    filename = "to_speech.mp3"
-    tts.save(filename)
+        return filename, text, translated_text, target_lang

-    return filename, text, translated_text, target_lang
+    except Exception as e:
+        return str(e), "", "", ""

 iface = gr.Interface(
     fn=translate_voice,
     inputs=[
-        gr.inputs.File(type="file", label="Your Audio"),
+        gr.inputs.File(label="Your Audio"),
         gr.inputs.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language")
     ],
     outputs=[
-        gr.outputs.Audio(type="filepath", label="Translated Audio"),
+        gr.outputs.Audio(type="auto", label="Translated Audio"),
         gr.outputs.Textbox(label="Original Text"),
         gr.outputs.Textbox(label="Translated Text"),
         gr.outputs.Textbox(label="Target Language"),