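"""AI video translation demo.

Pipeline: transcribe the source video with Whisper, translate the transcript
with GPT-4, clone the speaker's voice with ElevenLabs, and lip-sync the
translated speech back onto the video with Wav2Lip.
"""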
import tempfile
import os
import uuid
import time
import subprocess


import openai

import whisper
from ffmpy import FFmpeg
import gradio as gr
from elevenlabs import clone, generate, get_api_key, set_api_key


css = """
#col-container{
    margin: 0 auto;
    max-width: 840px;
    text-align: left;
}
"""


# System prompt for GPT-4. In English: "You are a professional video-subtitle
# translator. Translate the following text into {{target_lang}}. Preserve
# numbers and line breaks, do not invent content, and output nothing other
# than the translation."
default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'


openai.api_type = 'azure'
openai.api_base = 'https://tencent-openai01.openai.azure.com'
# Read the key from the environment instead of hardcoding a secret in source
# (the variable name here is a suggested convention).
openai.api_key = os.environ.get('AZURE_OPENAI_API_KEY', '')
openai.api_version = "2023-05-15"
openai.log = "debug"


# *************************************#
# Pipeline:                            #
# 1. Resize the video                  #
# 2. Extract the audio                 #
# 3. Transcribe the audio (Whisper)    #
# 4. Translate the text (GPT-4)        #
# 5. Voice synthesis (ElevenLabs)      #
# 6. Lip sync (Wav2Lip)                #
# *************************************#


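# Load the Whisper ASR model once at startup so every request reuses it.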
start = time.perf_counter()
model = whisper.load_model("base", download_root='./checkpoints')
end = time.perf_counter()

print('whisper load model time: ', end - start)

# ElevenLabs key, also read from the environment (variable name is a
# suggested convention).
set_api_key(os.environ.get('ELEVEN_API_KEY', ''))

print('elevenlabs api key set:', bool(get_api_key()))

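# Map UI language names to the Chinese names used inside the translation prompt.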
language_mapping = {
    'English': '英语',
    'Spanish': '西班牙语',
    'French': '法语',
    'German': '德语',
    'Italian': '意大利语',
    'Portuguese': '葡萄牙语',
    'Polish': '波兰语',
    'Turkish': '土耳其语',
    'Russian': '俄语',
    'Dutch': '荷兰语',
    'Czech': '捷克语',
    'Arabic': '阿拉伯语',
    'Chinese': '中文普通话'
}


def resize_video(video_source):
    # Placeholder for pipeline step 1; currently a pass-through.
    return video_source

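# A hypothetical sketch of what the resize step could look like if it were
# implemented, assuming ffmpeg's scale filter is acceptable. The function name
# and width default are illustrative, not part of the original pipeline.
def resize_video_ffmpeg(video_source, output_path, width=1280):
    # scale=<width>:-2 keeps the aspect ratio and forces an even height,
    # which H.264 encoders require.
    ff = FFmpeg(
        inputs={video_source: None},
        outputs={output_path: f'-vf scale={width}:-2 -y'}
    )
    ff.run()
    return output_path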

def extract_audio(video_source, output_dir='./'):
    """Extract the audio track from `video_source` as an MP3 file."""

    # The original filename said .wav, but libmp3lame produces MP3,
    # so use a matching extension.
    output_audio = os.path.join(output_dir, 'output_original_audio.mp3')

    ff = FFmpeg(
        inputs={video_source: None},
        outputs={output_audio: '-acodec libmp3lame -ar 44100 -q:a 0 -map a -y'}
    )

    print('ffmpeg command: ', ff.cmd)
    ff.run()

    return output_audio


def clone_audio(audio_file, audio_text):
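    """Clone the speaker's voice from `audio_file` with ElevenLabs and
    synthesize `audio_text` in that voice; returns raw audio bytes."""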

    voice = clone(
        name=uuid.uuid4().hex,
        description="",  # Optional
        files=[audio_file])

    print('voice: ', voice)
    audio = generate(text=audio_text, voice=voice,
                     model='eleven_multilingual_v2')

    return audio


# todo
def translate_text(text, target_language):
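    """Translate `text` into `target_language` with GPT-4 via Azure OpenAI."""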

    target_language_name = language_mapping[target_language]

    chat_completion = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.1,
        max_tokens=2048,
        messages=[
            {"role": "system", "content": default_prompt.replace(
                '{{target_lang}}', target_language_name)},
            {"role": "user", "content": text}])

    translated_text = chat_completion.choices[0].message.content
    print(translated_text)

    return translated_text

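# Long transcripts can exceed the max_tokens=2048 budget above. Below is a
# minimal chunking sketch, assuming newline-separated text; the helper name
# and chunk size are illustrative assumptions, not part of the original.
def translate_text_chunked(text, target_language, max_chars=1500):
    chunks, current = [], ''
    for line in text.splitlines(keepends=True):
        if len(current) + len(line) > max_chars and current:
            chunks.append(current)
            current = ''
        current += line
    if current:
        chunks.append(current)
    # Translate each chunk independently and stitch the results together.
    return ''.join(translate_text(chunk, target_language) for chunk in chunks)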

def infer(video_source, target_language):
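    """Run the full pipeline on `video_source` and return the dubbed video."""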

    print('video_source: ', video_source)

    # check the video format

    # Create a temporary directory to store the output file
    output_dir = tempfile.mkdtemp()
    output_video_file = os.path.join(output_dir, 'output_video.mp4')
    print("Output file: ", output_video_file)

    output_audio = extract_audio(video_source, output_dir=output_dir)

    result = model.transcribe(output_audio)
    whisper_text = result["text"]
    whisper_language = result['language']

    print("Whisper text: ", whisper_text, whisper_language)

    target_language_name = language_mapping[target_language]

    print("Target language name: ", target_language_name)

    translated_text = translate_text(whisper_text, target_language)

    print("Translated text: ", translated_text)

    # Voice cloning & synthesis
    audio = clone_audio(output_audio, translated_text)

    audio_file = os.path.join(output_dir, 'output_clone_audio.wav')

    with open(audio_file, 'wb') as f:
        f.write(audio)

    # Compose the final video: lip-sync the cloned audio onto the source face
    # with Wav2Lip. An argument list avoids shell-quoting issues with spaces.
    wav2lip_cmd = ['python', 'inference.py',
                   '--checkpoint_path', 'checkpoints/wav2lip_gan.pth',
                   '--face', video_source, '--audio', audio_file,
                   '--resize_factor', '1', '--nosmooth',
                   '--outfile', output_video_file]

    # check=True raises if Wav2Lip fails, so the message below is accurate.
    subprocess.run(wav2lip_cmd, check=True)

    print("Video conversion successful.")

    return output_video_file


with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        gr.Markdown("""
        <h1 style="text-align: center;">AI Translation</h1>
        <p style="text-align: center;">
        This is a demo for AI Translation.
        </p>
        """)

        with gr.Row():
            with gr.Column():
                video_source = gr.Video(
                    label="Source Video", show_label=True, interactive=True)
                target_language = gr.Dropdown(
                    choices=list(language_mapping.keys()),
                    label="Target language",
                    info="Language to translate into",
                    value="English")

                submit_btn = gr.Button(value="Submit")

            with gr.Column():
                result = gr.Video(label="Result")

        with gr.Row():
            gr.Examples(
                label="Video Examples",
                examples=['dictator.mp4'],
                inputs=[video_source]
            )
    submit_btn.click(
        infer, inputs=[video_source, target_language], outputs=result)

demo.queue(5).launch()