leolxliu committed
Commit 0b94fa2 · 1 Parent(s): a8f923d
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+
+
+
Dockerfile ADDED
@@ -0,0 +1,61 @@
+ FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ # install python via pyenv
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     make \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     wget \
+     curl \
+     llvm \
+     libncurses5-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libffi-dev \
+     liblzma-dev \
+     git \
+     ca-certificates \
+     libgl1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
+ ARG PYTHON_VERSION=3.8
+
+ RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
+     pyenv install $PYTHON_VERSION && \
+     pyenv global $PYTHON_VERSION
+
+ # install deps
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ ENV HOME=/root \
+     CUDA_HOME=/usr/local/cuda \
+     PATH=/root/.local/bin:$PATH \
+     LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} \
+     LIBRARY_PATH=${CUDA_HOME}/lib64/stubs:${LIBRARY_PATH}
+
+
+ WORKDIR $HOME/app
+ RUN git clone -b master https://github.com/Rudrabha/Wav2Lip $HOME/app
+ COPY . $HOME/app/
+
+
+ COPY ./checkpoints/s3fd-619a316812.pth $HOME/app/face_detection/detection/sfd/s3fd.pth
+
+ RUN pip install --upgrade pip
+ RUN pip install git+https://github.com/elliottzheng/batch-face.git@master
+ RUN pip install --no-cache-dir -r requirements.txt
+ ENV PYTHONUNBUFFERED=1
+
+ CMD ["python", "app.py"]
+
+
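Note that the `COPY ./checkpoints/s3fd-619a316812.pth ...` step reads from the build context, so the weights have to exist locally before the image is built. A minimal sketch of the expected order (the `ai:v1` tag is taken from the README change below, not from this Dockerfile):

# fetch the model weights into ./checkpoints first (download.sh is added later in this commit)
bash download.sh
# then build; the COPY steps pick the checkpoints up from the build context
docker build -t ai:v1 .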
README.md CHANGED
@@ -1,11 +1,17 @@
  ---
- title: Video Translation
- emoji: 👀
- colorFrom: blue
+ title: Ai Translation
+ emoji: 📊
+ colorFrom: yellow
  colorTo: pink
- sdk: docker
+ sdk: gradio
+ sdk_version: 3.50.2
+ app_file: app.py
  pinned: false
  license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+ docker run --gpus all -it ai:v1 /bin/bash
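The `docker run ... /bin/bash` line above only drops into a shell; the image's own `CMD ["python", "app.py"]` starts the demo. A hedged sketch for serving it directly, assuming gradio's default port 7860 and using the GRADIO_SERVER_NAME env var, since `demo.launch()` in app.py binds localhost by default:

docker run --gpus all -p 7860:7860 -e GRADIO_SERVER_NAME=0.0.0.0 ai:v1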
app.py ADDED
@@ -0,0 +1,210 @@
+
+
+ import tempfile
+ import os
+ import uuid
+ import time
+ import subprocess
+
+
+ import openai
+
+ import whisper
+ from ffmpy import FFmpeg
+ import gradio as gr
+ from elevenlabs import clone, generate, get_api_key, set_api_key
+
+
+ css = """
+ #col-container{
+     margin: 0 auto;
+     max-width: 840px;
+     text-align: left;
+ }
+ """
+
+
+ # System prompt (Chinese): "You are a professional video subtitle translator. Translate the
+ # text below into {{target_lang}}; keep numbers and line breaks, do not invent content, and
+ # output nothing besides the translation."
+ default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
+
+
+ openai.api_type = 'azure'
+ openai.api_base = 'https://tencent-openai01.openai.azure.com'
+ openai.api_key = '49eb7c2c3acd41f4ac81fef59ceacbba'
+ openai.api_version = "2023-05-15"
+ openai.log = "debug"
+
+
+
+ #*********************************#
+ # 1. Resize the video             #
+ # 2. Extract the audio            #
+ # 3. Transcribe the audio         #
+ # 4. Translate the text           #
+ # 5. Voice synthesis              #
+ # 6. Wav2Lip                      #
+
+
+
+ start = time.perf_counter()
+ model = whisper.load_model("base", download_root='./checkpoints')
+ end = time.perf_counter()
+
+ print('whisper load model time: ', end - start)
+
+ set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
+
+ print('elevenlabs api key', get_api_key())
+
+ language_mapping = {
+     'English': '英语',
+     'Spanish': '西班牙语',
+     'French': '法语',
+     'German': '德语',
+     'Italian': '意大利语',
+     'Portuguese': '葡萄牙语',
+     'Polish': '波兰语',
+     'Turkish': '土耳其语',
+     'Russian': '俄语',
+     'Dutch': '荷兰语',
+     'Czech': '捷克语',
+     'Arabic': '阿拉伯语',
+     'Chinese': '中文普通话'
+ }
+
+
+
+ def resize_video(video_source):
+
+     return video_source
+
+
+
+ def extract_audio(video_source, output_dir='./'):
+
+     output_audio = os.path.join(output_dir, 'output_original_audio.wav')
+
+     ff = FFmpeg(
+         inputs={video_source: None},
+         outputs={output_audio: '-acodec pcm_s24le -ar 48000 -q:a 0 -map a -y'}
+     )
+
+     print('ffmpeg command: ', ff.cmd)
+     ff.run()
+
+     return output_audio
+
+
+
+ def clone_audio(audio_file, audio_text):
+
+     voice = clone(
+         name=uuid.uuid4().hex,
+         description="",  # Optional
+         files=[audio_file])
+
+     print('voice: ', voice)
+     audio = generate(text=audio_text, voice=voice, model='eleven_multilingual_v2')
+
+     return audio
+
+
+ # todo
+ def translate_text(text, target_language):
+
+     target_language_name = language_mapping[target_language]
+
+     chat_completion = openai.ChatCompletion.create(
+         engine="gpt-4",
+         temperature=0.1,
+         max_tokens=2048,
+         messages=[
+             {"role": "system", "content": default_prompt.replace('{{target_lang}}', target_language_name)},
+             {"role": "user", "content": text}])
+
+     # print the completion
+     print(chat_completion.choices[0].message.content)
+
+     translated_text = chat_completion.choices[0].message.content
+
+     return translated_text
+
+
+
+ def infer(video_source, target_language):
+
+     print('video_source: ', video_source)
+
+     # check the video format
+
+     # Create a temporary directory to store the output file
+     output_dir = tempfile.mkdtemp()
+     output_video_file = os.path.join(output_dir, 'output_video.mp4')
+     print("Output file: ", output_video_file)
+
+     output_audio = extract_audio(video_source, output_dir=output_dir)
+
+     result = model.transcribe(output_audio)
+     whisper_text = result["text"]
+     whisper_language = result['language']
+
+     print("Whisper text: ", whisper_text, whisper_language)
+
+     target_language_code = language_mapping[target_language]
+
+     print("Target language code: ", target_language_code)
+
+     translated_text = translate_text(whisper_text, target_language)
+
+     print("Translated text: ", translated_text)
+
+     # voice cloning && synthesis
+     audio = clone_audio(output_audio, translated_text)
+
+     audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
+
+     with open(audio_file, 'wb') as f:
+         f.write(audio)
+
+     # render the lip-synced video
+     wav2lip = f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {video_source} --audio {audio_file} --resize_factor 1 --nosmooth --outfile {output_video_file}"
+
+     subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
+
+     print("Video conversion successful.")
+
+     return output_video_file
+
+
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+
+         gr.Markdown("""
+         <h1 style="text-align: center;">AI Translation</h1>
+         <p style="text-align: center;">
+             This is a demo for AI Translation.
+         </p>
+         """)
+
+         with gr.Row():
+             with gr.Column():
+                 video_source = gr.Video(label="Source Video", show_label=True, interactive=True)
+                 target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!", value="English")
+
+                 submit_btn = gr.Button(value="Submit")
+
+             with gr.Column():
+                 result = gr.Video(label="Result")
+
+         with gr.Row():
+             gr.Examples(
+                 label="Video Examples",
+                 examples=['dictator.mp4'],
+                 inputs=[video_source]
+             )
+
+         submit_btn.click(infer, inputs=[video_source, target_language], outputs=result)
+
+ demo.launch()
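For a quick local run outside Docker, a minimal sketch, assuming the checkpoints were fetched with download.sh and that the hardcoded Azure OpenAI and ElevenLabs keys are replaced with working credentials:

pip install --no-cache-dir -r requirements.txt
bash download.sh
python app.py  # serves the gradio Blocks UI defined above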
checkpoints/base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e
+ size 145262807
checkpoints/mobilenet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220
+ size 1789735
checkpoints/resnet50.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+ size 109497761
checkpoints/s3fd-619a316812.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543
+ size 89843225
checkpoints/wav2lip_gan.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8
+ size 435801865
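The checkpoint entries above are Git LFS pointer files (spec v1), not the weights themselves, so a fresh clone needs either Git LFS or the download.sh script below to materialize them. A sketch with Git LFS:

git lfs install
git lfs pull --include="checkpoints/*"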
dictator.mp4 ADDED
Binary file (438 kB).
download.sh ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env bash
+
+ set -ex
+
+ wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+ wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+ wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
+ wget -c -O checkpoints/s3fd-619a316812.pth 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth'
+
+
+ # whisper
+ wget -c -O checkpoints/base.pt 'https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt'
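Since the commit also records sha256 oids in the LFS pointers above, the downloads can be verified against them; a sketch using sha256sum with those oids:

sha256sum -c - <<'EOF'
ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e  checkpoints/base.pt
2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220  checkpoints/mobilenet.pth
6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d  checkpoints/resnet50.pth
619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543  checkpoints/s3fd-619a316812.pth
ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8  checkpoints/wav2lip_gan.pth
EOF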
requirements.txt ADDED
@@ -0,0 +1,19 @@
+
+ ffmpy
+ elevenlabs
+ openai-whisper
+ openai
+ gradio
+
+
+ httpx==0.24.1
+ numpy==1.23.4
+ librosa==0.7.0
+ tqdm==4.45.0
+ numba==0.48
+ mediapipe==0.8.11
+ opencv-python==4.6.0.66
+ --extra-index-url=https://download.pytorch.org/whl/cu116
+ torch==1.12.1+cu116
+ --extra-index-url=https://download.pytorch.org/whl/cu116
+ torchvision==0.13.1+cu116
run.sh ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ set -ex
+
+ pad_top=0
+ pad_bottom=20
+ pad_left=0
+ pad_right=0
+ rescaleFactor=1
+
+ python3 inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face './musk.mp4' --audio './musk.mp3' --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth --outfile './output_video.mp4'