import tempfile
import os
import uuid
import time
import subprocess
import openai
import whisper
from ffmpy import FFmpeg
import gradio as gr
from elevenlabs import clone, generate, get_api_key, set_api_key
css = """ | |
#col-container{ | |
margin: 0 auto; | |
max-width: 840px; | |
text-align: left; | |
} | |
""" | |
# System prompt (Chinese): "You are a professional video subtitle translator.
# Translate the text below into {{target_lang}}; preserve numbers and line
# breaks, do not invent content, and output nothing but the translation."
default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
openai.api_type = 'azure'
openai.api_base = 'https://tencent-openai01.openai.azure.com'
# Read the Azure OpenAI key from the environment instead of hardcoding the secret.
openai.api_key = os.environ.get('OPENAI_API_KEY')
openai.api_version = "2023-05-15"
openai.log = "debug"
# *************************#
# 1. Resize the video      #
# 2. Extract the audio     #
# 3. Transcribe the audio  #
# 4. Translate the text    #
# 5. Voice synthesis       #
# 6. Wav2Lip               #
# *************************#
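# NOTE: the pipeline assumes ffmpeg is on PATH (invoked via ffmpy) and that the
# Wav2Lip inference.py plus checkpoints/wav2lip_gan.pth sit alongside this
# script; both are required at runtime but are not bundled here.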
start = time.perf_counter()
model = whisper.load_model("base", download_root='./checkpoints')
end = time.perf_counter()
print('whisper load model time: ', end - start)
# Read the ElevenLabs key from the environment instead of hardcoding the secret,
# and confirm it is set without echoing the key itself to the logs.
set_api_key(os.environ.get('ELEVENLABS_API_KEY'))
print('elevenlabs api key set:', bool(get_api_key()))
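# Maps the UI language names to the Chinese names used in the translation prompt.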
language_mapping = {
    'English': '英语',
    'Spanish': '西班牙语',
    'French': '法语',
    'German': '德语',
    'Italian': '意大利语',
    'Portuguese': '葡萄牙语',
    'Polish': '波兰语',
    'Turkish': '土耳其语',
    'Russian': '俄语',
    'Dutch': '荷兰语',
    'Czech': '捷克语',
    'Arabic': '阿拉伯语',
    'Chinese': '中文普通话'
}
def resize_video(video_source):
    # Placeholder: resizing is not implemented yet; the video passes through unchanged.
    return video_source
def extract_audio(video_source, output_dir='./'):
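    """Extract the audio track of ``video_source`` to a standalone file
    that Whisper can transcribe and ElevenLabs can clone from."""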
    output_audio = os.path.join(output_dir, 'output_original_audio.wav')
    ff = FFmpeg(
        inputs={video_source: None},
        # MP3-encode the audio stream (-map a) at 44.1 kHz, highest VBR quality (-q:a 0).
        outputs={output_audio: '-acodec libmp3lame -ar 44100 -q:a 0 -map a -y'}
    )
    print('ffmpeg command: ', ff.cmd)
    ff.run()
    return output_audio
def clone_audio(audio_file, audio_text):
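    """Clone the speaker's voice from ``audio_file`` via ElevenLabs and
    synthesize ``audio_text`` with the cloned voice."""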
    voice = clone(
        name=uuid.uuid4().hex,
        description="",  # optional
        files=[audio_file])
    print('voice: ', voice)
    audio = generate(text=audio_text, voice=voice,
                     model='eleven_multilingual_v2')
    return audio
# todo
def translate_text(text, target_language):
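    """Translate ``text`` into ``target_language`` with Azure OpenAI GPT-4,
    using the subtitle-translation system prompt defined above."""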
    target_language_name = language_mapping[target_language]
    chat_completion = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.1,
        max_tokens=2048,
        messages=[
            {"role": "system", "content": default_prompt.replace(
                '{{target_lang}}', target_language_name)},
            {"role": "user", "content": text}])
    # print the completion
    translated_text = chat_completion.choices[0].message.content
    print(translated_text)
    return translated_text
def infer(video_source, target_language):
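    """Run the full pipeline: extract audio, transcribe, translate,
    clone the voice, and lip-sync the result back onto the video."""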
    print('video_source: ', video_source)
    # check the video format
    # Create a temporary directory to store the output files
    output_dir = tempfile.mkdtemp()
    output_video_file = os.path.join(output_dir, 'output_video.mp4')
    print("Output file: ", output_video_file)
    output_audio = extract_audio(video_source, output_dir=output_dir)
    result = model.transcribe(output_audio)
    whisper_text = result["text"]
    whisper_language = result['language']
    print("Whisper text: ", whisper_text, whisper_language)
    # language_mapping yields the Chinese language name, not an ISO code
    target_language_name = language_mapping[target_language]
    print("Target language name: ", target_language_name)
    translated_text = translate_text(whisper_text, target_language)
    print("Translated text: ", translated_text)
    # Voice cloning && synthesis
    audio = clone_audio(output_audio, translated_text)
    audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
    with open(audio_file, 'wb') as f:
        f.write(audio)
    # Compose the final video: lip-sync the cloned audio onto the source face
wav2lip = f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {video_source} --audio {audio_file} --resize_factor 1 --nosmooth --outfile {output_video_file}" | |
subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE) | |
print("Video conversion successful.") | |
return output_video_file | |
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        <h1 style="text-align: center;">AI Translation</h1>
        <p style="text-align: center;">
            This is a demo for AI Translation.
        </p>
        """)
        with gr.Row():
            with gr.Column():
                video_source = gr.Video(
                    label="Source Video", show_label=True, interactive=True)
                target_language = gr.Dropdown(
                    choices=["English", "Spanish", "French", "German", "Italian",
                             "Portuguese", "Polish", "Turkish", "Russian", "Dutch",
                             "Czech", "Arabic", "Chinese"],
                    label="Target language", info="Language to translate the video into",
                    value="English")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                result = gr.Video(label="Result")
        with gr.Row():
            gr.Examples(
                label="Video Examples",
                examples=['dictator.mp4'],
                inputs=[video_source]
            )
        submit_btn.click(
            infer, inputs=[video_source, target_language], outputs=result)

demo.queue(5).launch()
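# Assuming Gradio 3.x here, queue(5) sets concurrency_count: up to five
# requests are processed in parallel while the rest wait in the queue.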