artificialguybr committed on
Commit
75517c0
·
verified ·
1 Parent(s): 285da88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -11
app.py CHANGED
@@ -5,7 +5,8 @@ import subprocess
5
  import os, stat
6
  import uuid
7
  from googletrans import Translator
8
- from TTS.api import TTS
 
9
  import ffmpeg
10
  import json
11
  from scipy.signal import wiener
@@ -24,7 +25,6 @@ from huggingface_hub import HfApi
24
  import moviepy.editor as mp
25
 
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
- os.environ["COQUI_TOS_AGREED"] = "1"
28
  api = HfApi(token=HF_TOKEN)
29
  repo_id = "artificialguybr/video-dubbing"
30
  ZipFile("ffmpeg.zip").extractall()
@@ -121,6 +121,10 @@ def transcribe_audio(file_path):
121
 
122
  return result
123
 
 
 
 
 
124
  @spaces.GPU
125
  def process_video(radio, video, target_language, has_closeup_face):
126
  try:
@@ -156,15 +160,34 @@ def process_video(radio, video, target_language, has_closeup_face):
156
  print(f"Error encountered during transcription: {str(e)}")
157
  raise
158
 
159
- language_mapping = {'English': 'en', 'Spanish': 'es', 'French': 'fr', 'German': 'de', 'Italian': 'it', 'Portuguese': 'pt', 'Polish': 'pl', 'Turkish': 'tr', 'Russian': 'ru', 'Dutch': 'nl', 'Czech': 'cs', 'Arabic': 'ar', 'Chinese (Simplified)': 'zh-cn'}
160
- target_language_code = language_mapping[target_language]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  translator = Translator()
162
  translated_text = translator.translate(whisper_text, dest=target_language_code).text
163
  print(translated_text)
164
 
165
- tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
166
- tts.to('cuda')
167
- tts.tts_to_file(translated_text, speaker_wav=f"{run_uuid}_output_audio_final.wav", file_path=f"{run_uuid}_output_synth.wav", language=target_language_code)
168
 
169
  pad_top = 0
170
  pad_bottom = 15
@@ -228,7 +251,7 @@ iface = gr.Interface(
228
  inputs=[
229
  radio,
230
  video,
231
- gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing", value="Spanish"),
232
  gr.Checkbox(
233
  label="Video has a close-up face. Use Wav2lip.",
234
  value=False,
@@ -246,10 +269,9 @@ with gr.Blocks() as demo:
246
  radio.change(swap, inputs=[radio], outputs=video)
247
  gr.Markdown("""
248
  **Note:**
249
- - Video limit is 1 minute. It will dubbling all people using just one voice.
250
  - Generation may take up to 5 minutes.
251
- - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
252
- - The tool uses open-source models for all models. It's a alpha version.
253
  - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
254
  - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
255
  - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.
 
5
  import os, stat
6
  import uuid
7
  from googletrans import Translator
8
+ import edge_tts
9
+ import asyncio
10
  import ffmpeg
11
  import json
12
  from scipy.signal import wiener
 
25
  import moviepy.editor as mp
26
 
27
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
28
  api = HfApi(token=HF_TOKEN)
29
  repo_id = "artificialguybr/video-dubbing"
30
  ZipFile("ffmpeg.zip").extractall()
 
121
 
122
  return result
123
 
124
async def text_to_speech(text, voice, output_file):
    """Synthesize ``text`` with edge-tts and save the audio to ``output_file``.

    Args:
        text: The text to be spoken.
        voice: Name of the edge-tts voice to use (e.g. ``'en-US-EricNeural'``).
        output_file: Path that the synthesized audio is written to.

    NOTE(review): edge-tts appears to emit MP3-encoded audio by default, while
    the caller in this file passes a ``.wav`` file name -- confirm that the
    downstream tools detect the format from content, not from the extension.
    """
    # Build the request and stream the result straight to disk in one step.
    await edge_tts.Communicate(text, voice).save(output_file)
127
+
128
  @spaces.GPU
129
  def process_video(radio, video, target_language, has_closeup_face):
130
  try:
 
160
  print(f"Error encountered during transcription: {str(e)}")
161
  raise
162
 
163
+ language_mapping = {
164
+ 'English': ('en', 'en-US-EricNeural'),
165
+ 'Spanish': ('es', 'es-ES-AlvaroNeural'),
166
+ 'French': ('fr', 'fr-FR-HenriNeural'),
167
+ 'German': ('de', 'de-DE-ConradNeural'),
168
+ 'Italian': ('it', 'it-IT-DiegoNeural'),
169
+ 'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
170
+ 'Polish': ('pl', 'pl-PL-MarekNeural'),
171
+ 'Turkish': ('tr', 'tr-TR-AhmetNeural'),
172
+ 'Russian': ('ru', 'ru-RU-DmitryNeural'),
173
+ 'Dutch': ('nl', 'nl-NL-MaartenNeural'),
174
+ 'Czech': ('cs', 'cs-CZ-AntoninNeural'),
175
+ 'Arabic': ('ar', 'ar-SA-HamedNeural'),
176
+ 'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
177
+ 'Japanese': ('ja', 'ja-JP-KeitaNeural'),
178
+ 'Korean': ('ko', 'ko-KR-InJoonNeural'),
179
+ 'Hindi': ('hi', 'hi-IN-MadhurNeural'),
180
+ 'Swedish': ('sv', 'sv-SE-MattiasNeural'),
181
+ 'Danish': ('da', 'da-DK-JeppeNeural'),
182
+ 'Finnish': ('fi', 'fi-FI-HarriNeural'),
183
+ 'Greek': ('el', 'el-GR-NestorasNeural')
184
+ }
185
+ target_language_code, voice = language_mapping[target_language]
186
  translator = Translator()
187
  translated_text = translator.translate(whisper_text, dest=target_language_code).text
188
  print(translated_text)
189
 
190
+ asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
 
 
191
 
192
  pad_top = 0
193
  pad_bottom = 15
 
251
  inputs=[
252
  radio,
253
  video,
254
+ gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)", "Japanese", "Korean", "Hindi", "Swedish", "Danish", "Finnish", "Greek"], label="Target Language for Dubbing", value="Spanish"),
255
  gr.Checkbox(
256
  label="Video has a close-up face. Use Wav2lip.",
257
  value=False,
 
269
  radio.change(swap, inputs=[radio], outputs=video)
270
  gr.Markdown("""
271
  **Note:**
272
+ - Video limit is 1 minute. It will dubbing all people using just one voice.
273
  - Generation may take up to 5 minutes.
274
+ - The tool uses open-source models for all models. It's an alpha version.
 
275
  - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
276
  - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
277
  - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.