import tempfile
import os
import uuid
import time
import subprocess
import openai
import whisper
from ffmpy import FFmpeg
import gradio as gr
from elevenlabs import clone, generate, get_api_key, set_api_key
css = """
#col-container{
margin: 0 auto;
max-width: 840px;
text-align: left;
}
"""
# System prompt (in Chinese). In English it reads: "You are a professional
# video subtitle translator. Translate the text below into {{target_lang}};
# preserve numbers and line breaks, do not invent content, and output nothing
# other than the translation."
default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
openai.api_type = 'azure'
openai.api_base = 'https://tencent-openai01.openai.azure.com'
# Read the key from the environment instead of hardcoding a secret in source.
openai.api_key = os.environ.get('AZURE_OPENAI_API_KEY', '')
openai.api_version = "2023-05-15"
openai.log = "debug"
# Pipeline:
#   1. Resize the video
#   2. Extract the audio
#   3. Transcribe the audio
#   4. Translate the text
#   5. Synthesize speech with a cloned voice
#   6. Lip-sync with Wav2Lip
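# Load the Whisper ASR model once at startup (weights cached in ./checkpoints).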
start = time.perf_counter()
model = whisper.load_model("base", download_root='./checkpoints')
end = time.perf_counter()
print('whisper load model time: ', end - start)
# Read the ElevenLabs key from the environment instead of hardcoding it.
set_api_key(os.environ.get('ELEVEN_API_KEY', ''))
print('elevenlabs api key set:', bool(get_api_key()))
# Maps UI language names to the Chinese names referenced by the Chinese prompt.
language_mapping = {
'English': '英语',
'Spanish': '西班牙语',
'French': '法语',
'German': '德语',
'Italian': '意大利语',
'Portuguese': '葡萄牙语',
'Polish': '波兰语',
'Turkish': '土耳其语',
'Russian': '俄语',
'Dutch': '荷兰语',
'Czech': '捷克语',
'Arabic': '阿拉伯语',
'Chinese': '中文普通话'
}
def resize_video(video_source):
    # Placeholder: resizing is not implemented; the source video is returned as-is.
    return video_source
def extract_audio(video_source, output_dir='./'):
    # Extract the audio track to a 44.1 kHz WAV file for Whisper and voice cloning.
    output_audio = os.path.join(output_dir, 'output_original_audio.wav')
    ff = FFmpeg(
        inputs={video_source: None},
        # pcm_s16le matches the .wav extension; libmp3lame would write MP3
        # data into a file named .wav.
        outputs={output_audio: '-acodec pcm_s16le -ar 44100 -map a -y'}
    )
    print('ffmpeg command: ', ff.cmd)
    ff.run()
    return output_audio
def clone_audio(audio_file, audio_text):
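    # Create a one-off ElevenLabs voice from the extracted audio sample, then
    # synthesize the translated text with it (eleven_multilingual_v2 model).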
voice = clone(
name=uuid.uuid4().hex,
description="", # Optional
files=[audio_file])
print('voice: ', voice)
audio = generate(text=audio_text, voice=voice,
model='eleven_multilingual_v2')
return audio
# TODO: long transcripts are sent as a single request and may exceed max_tokens.
def translate_text(text, target_language):
    # The prompt is written in Chinese, so use the Chinese name of the target language.
    target_language_name = language_mapping[target_language]
    chat_completion = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.1,
        max_tokens=2048,
        messages=[
            {"role": "system", "content": default_prompt.replace(
                '{{target_lang}}', target_language_name)},
            {"role": "user", "content": text}])
    translated_text = chat_completion.choices[0].message.content
    print('Translated text: ', translated_text)
    return translated_text
def infer(video_source, target_language):
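    # End-to-end pipeline: extract audio -> transcribe -> translate ->
    # clone voice & synthesize speech -> lip-sync with Wav2Lip.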
print('video_source: ', video_source)
    # TODO: validate the input video format before processing.
# Create a temporary directory to store the output file
output_dir = tempfile.mkdtemp()
output_video_file = os.path.join(output_dir, 'output_video.mp4')
print("Output file: ", output_video_file)
output_audio = extract_audio(video_source, output_dir=output_dir)
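    # Whisper transcribes and also auto-detects the spoken language.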
result = model.transcribe(output_audio)
whisper_text = result["text"]
whisper_language = result['language']
print("Whisper text: ", whisper_text, whisper_language)
    target_language_name = language_mapping[target_language]
    print("Target language: ", target_language_name)
translated_text = translate_text(whisper_text, target_language)
print("Translated text: ", translated_text)
    # Clone the voice and synthesize the translated speech.
audio = clone_audio(output_audio, translated_text)
audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
with open(audio_file, 'wb') as f:
f.write(audio)
    # Lip-sync the cloned audio onto the source video with Wav2Lip.
    # Quote the paths so files with spaces survive the shell invocation.
    wav2lip = (f'python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth '
               f'--face "{video_source}" --audio "{audio_file}" '
               f'--resize_factor 1 --nosmooth --outfile "{output_video_file}"')
    subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE, check=True)
    print("Video conversion successful.")
return output_video_file
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("""
<h1 style="text-align: center;">AI Translation</h1>
<p style="text-align: center;">
This is a demo for AI Translation.
</p>
""")
with gr.Row():
with gr.Column():
video_source = gr.Video(
label="Source Video", show_label=True, interactive=True)
target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish",
"Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!", value="English")
submit_btn = gr.Button(value="Submit")
with gr.Column():
result = gr.Video(label="Result")
with gr.Row():
gr.Examples(
label="Video Examples",
examples=['dictator.mp4'],
inputs=[video_source]
)
submit_btn.click(
infer, inputs=[video_source, target_language], outputs=result)
demo.queue(concurrency_count=5).launch()
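
# Direct invocation without the UI (a sketch; assumes the example clip is
# present locally and the API keys are set in the environment):
#   translated_video = infer('dictator.mp4', 'English')
#   print('Result written to:', translated_video)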