import tempfile
import os
import uuid
import time
import subprocess
import openai
import whisper
from ffmpy import FFmpeg
import gradio as gr
from elevenlabs import clone, generate, get_api_key, set_api_key
css = """
#col-container{
margin: 0 auto;
max-width: 840px;
text-align: left;
}
"""
# System prompt (Chinese): "You are a professional video subtitle translator. Translate the following
# text into {{target_lang}}, keep the numbers and line breaks, do not invent content, and output
# nothing other than the translation."
default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
openai.api_type = 'azure'
openai.api_base = 'https://tencent-openai01.openai.azure.com'
# Read the Azure OpenAI key from the environment instead of hardcoding it
# (assumes an AZURE_OPENAI_API_KEY variable is set before launch).
openai.api_key = os.environ.get('AZURE_OPENAI_API_KEY', '')
openai.api_version = "2023-05-15"
openai.log = "debug"
# *************************************
# Pipeline:
#   1. Resize the video
#   2. Extract the audio
#   3. Transcribe the audio (Whisper)
#   4. Translate the text (GPT-4)
#   5. Voice synthesis (ElevenLabs voice cloning)
#   6. Wav2Lip lip-sync
# *************************************
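# Load the Whisper ASR model once at startup (weights cached in ./checkpoints) and time the load.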
start = time.perf_counter()
model = whisper.load_model("base", download_root='./checkpoints')
end = time.perf_counter()
print('whisper load model time: ', end - start)
# Read the ElevenLabs key from the environment instead of hardcoding it
# (assumes an ELEVEN_API_KEY variable is set before launch).
set_api_key(os.environ.get('ELEVEN_API_KEY', ''))
print('elevenlabs api key', get_api_key())
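# Map the UI language names to the Chinese names that the translation prompt expects.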
language_mapping = {
'English': '英语',
'Spanish': '西班牙语',
'French': '法语',
'German': '德语',
'Italian': '意大利语',
'Portuguese': '葡萄牙语',
'Polish': '波兰语',
'Turkish': '土耳其语',
'Russian': '俄语',
'Dutch': '荷兰语',
'Czech': '捷克语',
'Arabic': '阿拉伯语',
'Chinese': '中文普通话'
}
def resize_video(video_source):
    # Step 1 placeholder: the video is currently passed through unchanged.
    return video_source
def extract_audio(video_source, output_dir='./'):
    # Step 2: extract the audio track from the source video as a 44.1 kHz WAV file.
    output_audio = os.path.join(output_dir, 'output_original_audio.wav')
    ff = FFmpeg(
        inputs={video_source: None},
        outputs={output_audio: '-acodec pcm_s16le -ar 44100 -map a -y'}
    )
    print('ffmpeg command: ', ff.cmd)
    ff.run()
    return output_audio
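# Step 5: clone the original speaker's voice with ElevenLabs and synthesize the translated text in that voice.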
def clone_audio(audio_file, audio_text):
voice = clone(
name=uuid.uuid4().hex,
description="", # Optional
files=[audio_file])
print('voice: ', voice)
audio = generate(text=audio_text, voice=voice,
model='eleven_multilingual_v2')
return audio
# todo
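# Step 4: translate the transcript with Azure OpenAI (with api_type='azure', `engine` is the deployment name).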
def translate_text(text, target_language):
target_language_name = language_mapping[target_language]
chat_completion = openai.ChatCompletion.create(
engine="gpt-4",
temperature=0.1,
max_tokens=2048,
messages=[
{"role": "system", "content": default_prompt.replace(
'{{target_lang}}', target_language_name)},
{"role": "user", "content": text}])
# print the completion
print(chat_completion.choices[0].message.content)
translated_text = chat_completion.choices[0].message.content
return translated_text
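# End-to-end pipeline: extract audio, transcribe, translate, clone the voice, then lip-sync with Wav2Lip.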
def infer(video_source, target_language):
print('video_source: ', video_source)
    # TODO: validate the input video format before processing
# Create a temporary directory to store the output file
output_dir = tempfile.mkdtemp()
output_video_file = os.path.join(output_dir, 'output_video.mp4')
print("Output file: ", output_video_file)
output_audio = extract_audio(video_source, output_dir=output_dir)
result = model.transcribe(output_audio)
whisper_text = result["text"]
whisper_language = result['language']
print("Whisper text: ", whisper_text, whisper_language)
    # language_mapping returns the target language's Chinese name used in the prompt.
    target_language_name = language_mapping[target_language]
    print("Target language: ", target_language_name)
translated_text = translate_text(whisper_text, target_language)
print("Translated text: ", translated_text)
    # voice cloning && synthesis
audio = clone_audio(output_audio, translated_text)
audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
with open(audio_file, 'wb') as f:
f.write(audio)
    # Lip-sync the cloned audio onto the source video with Wav2Lip
    wav2lip = f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face '{video_source}' --audio '{audio_file}' --resize_factor 1 --nosmooth --outfile '{output_video_file}'"
    subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE, check=True)
print("Video conversion successful.")
return output_video_file
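# Gradio UI: upload a source video, pick a target language, and get back the dubbed, lip-synced video.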
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("""
<h1 style="text-align: center;">AI Translation</h1>
<p style="text-align: center;">
This is a demo for AI Translation.
</p>
""")
with gr.Row():
with gr.Column():
video_source = gr.Video(
label="Source Video", show_label=True, interactive=True)
                target_language = gr.Dropdown(choices=list(language_mapping.keys()),
                                              label="Target language", info="Choose the target language", value="English")
submit_btn = gr.Button(value="Submit")
with gr.Column():
result = gr.Video(label="Result")
with gr.Row():
gr.Examples(
label="Video Examples",
examples=['dictator.mp4'],
inputs=[video_source]
)
submit_btn.click(
infer, inputs=[video_source, target_language], outputs=result)
demo.queue(5).launch()