leolxliu committed
Commit 0b94fa2
Parent(s): a8f923d

init repo
Files changed:
- .gitignore +4 -0
- Dockerfile +61 -0
- README.md +10 -4
- app.py +210 -0
- checkpoints/base.pt +3 -0
- checkpoints/mobilenet.pth +3 -0
- checkpoints/resnet50.pth +3 -0
- checkpoints/s3fd-619a316812.pth +3 -0
- checkpoints/wav2lip_gan.pth +3 -0
- dictator.mp4 +0 -0
- download.sh +12 -0
- requirements.txt +19 -0
- run.sh +11 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+__pycache__/
+
+
+
Dockerfile
ADDED
@@ -0,0 +1,61 @@
+FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# install python via pyenv
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    make \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    wget \
+    curl \
+    llvm \
+    libncurses5-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libffi-dev \
+    liblzma-dev \
+    git \
+    ca-certificates \
+    libgl1 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
+ARG PYTHON_VERSION=3.8
+
+RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
+    pyenv install $PYTHON_VERSION && \
+    pyenv global $PYTHON_VERSION
+
+# install deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV HOME=/root \
+    CUDA_HOME=/usr/local/cuda \
+    PATH=/root/.local/bin:$PATH \
+    LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} \
+    LIBRARY_PATH=${CUDA_HOME}/lib64/stubs:${LIBRARY_PATH}
+
+
+WORKDIR $HOME/app
+RUN git clone -b master https://github.com/Rudrabha/Wav2Lip $HOME/app
+COPY . $HOME/app/
+
+
+COPY ./checkpoints/s3fd-619a316812.pth $HOME/app/face_detection/detection/sfd/s3fd.pth
+
+RUN pip install --upgrade pip
+RUN pip install git+https://github.com/elliottzheng/batch-face.git@master
+RUN pip install --no-cache-dir -r requirements.txt
+ENV PYTHONUNBUFFERED=1
+
+CMD ["python", "app.py"]
+
+
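Note: the COPY of ./checkpoints/s3fd-619a316812.pth into the Wav2Lip face_detection tree requires the checkpoint files to actually be present in the build context. In this commit they are Git LFS pointers, so a minimal sketch of materializing them before building (assuming git-lfs is installed; download.sh, added later in this commit, is the alternative):

git lfs pull                 # materialize the LFS-tracked weights in ./checkpoints
ls -lh checkpoints/          # sanity check: real .pt/.pth files, not pointer stubs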
README.md
CHANGED
@@ -1,11 +1,17 @@
 ---
-title:
-emoji:
-colorFrom:
+title: Ai Translation
+emoji: 📊
+colorFrom: yellow
 colorTo: pink
-sdk:
+sdk: gradio
+sdk_version: 3.50.2
+app_file: app.py
 pinned: false
 license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+docker run --gpus all -it ai:v1 /bin/bash
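The docker run line appended to the README assumes an image already tagged ai:v1. A minimal sketch of building and starting it from the repository root, assuming the NVIDIA container toolkit provides --gpus support:

docker build -t ai:v1 .                      # build from the Dockerfile above
docker run --gpus all -it ai:v1 /bin/bash    # interactive shell; the image's default CMD is `python app.py`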
app.py
ADDED
@@ -0,0 +1,210 @@
+
+
+import tempfile
+import os
+import uuid
+import time
+import subprocess
+
+
+import openai
+
+import whisper
+from ffmpy import FFmpeg
+import gradio as gr
+from elevenlabs import clone, generate, get_api_key, set_api_key
+
+
+css="""
+#col-container{
+    margin: 0 auto;
+    max-width: 840px;
+    text-align: left;
+}
+"""
+
+
+default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
+
+
+openai.api_type = 'azure'
+openai.api_base = 'https://tencent-openai01.openai.azure.com'
+openai.api_key = '49eb7c2c3acd41f4ac81fef59ceacbba'
+openai.api_version = "2023-05-15"
+openai.log = "debug"
+
+
+
+#*************************#
+# 1. Resize the video     #
+# 2. Extract the audio    #
+# 3. Translate the text from audio #
+# 4. Translate the text   #
+# 5. Voice Synthesis      #
+# 6. Wave2lip             #
+
+
+
+start = time.perf_counter()
+model = whisper.load_model("base",download_root='./checkpoints')
+end = time.perf_counter()
+
+print('whisper load model time: ', end - start)
+
+set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
+
+print('elevenlab api key', get_api_key())
+
+language_mapping = {
+    'English':'英语',
+    'Spanish':'西班牙语',
+    'French': '法语',
+    'German': '德语',
+    'Italian': '意大利语',
+    'Portuguese': '葡萄牙语',
+    'Polish': '波兰语',
+    'Turkish': '土耳其语',
+    'Russian': '俄语',
+    'Dutch': '荷兰语',
+    'Czech': '捷克语',
+    'Arabic': '阿拉伯语',
+    'Chinese': '中文普通话'
+}
+
+
+
+def resize_video(video_source):
+
+    return video_source
+
+
+
+def extract_audio(video_source, output_dir='./'):
+
+    output_audio = os.path.join(output_dir, 'output_orignal_audio.wav')
+
+    ff = FFmpeg(
+        inputs={video_source: None},
+        outputs={output_audio: '-acodec pcm_s24le -ar 48000 -q:a 0 -map a -y'}
+    )
+
+    print('ffmpeg command: ', ff.cmd)
+    ff.run()
+
+    return output_audio
+
+
+
+def clone_audio(audio_file, audio_text):
+
+    voice = clone(
+        name=uuid.uuid4().hex,
+        description="", # Optional
+        files=[audio_file])
+
+    print('voice: ', voice)
+    audio = generate(text=audio_text, voice=voice, model='eleven_multilingual_v2')
+
+    return audio
+
+
+# todo
+def translate_text(text, target_language):
+
+    target_language_name = language_mapping[target_language]
+
+    chat_completion = openai.ChatCompletion.create(
+        engine="gpt-4",
+        temperature=0.1,
+        max_tokens=2048,
+        messages=[
+            {"role":"system", "content": default_prompt.replace('{{target_lang}}', target_language_name)},
+            {"role": "user", "content": text}])
+
+    # print the completion
+    print(chat_completion.choices[0].message.content)
+
+
+    translated_text = chat_completion.choices[0].message.content
+
+    return translated_text
+
+
+
+def infer(video_source, target_language):
+
+    print('video_source: ', video_source)
+
+    # check the video format
+
+    # Create a temporary directory to store the output file
+    output_dir = tempfile.mkdtemp()
+    output_video_file = os.path.join(output_dir, 'output_video.mp4')
+    print("Output file: ", output_video_file)
+
+    output_audio = extract_audio(video_source, output_dir=output_dir)
+
+
+    result = model.transcribe(output_audio)
+    whisper_text = result["text"]
+    whisper_language = result['language']
+
+    print("Whisper text: ", whisper_text, whisper_language)
+
+    target_language_code = language_mapping[target_language]
+
+    print("Target language code: ", target_language_code)
+
+    translated_text = translate_text(whisper_text, target_language)
+
+    print("Translated text: ", translated_text)
+
+    # voice cloning && synthesis
+    audio = clone_audio(output_audio, translated_text)
+
+    audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
+
+    with open(audio_file, 'wb') as f:
+        f.write(audio)
+
+    # synthesize the video
+
+    wav2lip = f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {video_source} --audio {audio_file} --resize_factor 1 --nosmooth --outfile {output_video_file}"
+
+    subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
+
+    print("Video conversion successful.")
+
+    return output_video_file
+
+
+with gr.Blocks(css=css) as demo:
+    with gr.Column(elem_id="col-container"):
+
+        gr.Markdown("""
+        <h1 style="text-align: center;">AI Translation</h1>
+        <p style="text-align: center;">
+        This is a demo for AI Translation.
+        </p>
+
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                video_source = gr.Video(label="Source Video", show_label=True,interactive=True)
+                target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!",value="English")
+
+                submit_btn = gr.Button(value="Submit")
+
+            with gr.Column():
+                result = gr.Video(label="Result")
+
+        with gr.Row():
+            gr.Examples(
+                label="Video Examples",
+                examples=['dictator.mp4'],
+                inputs=[video_source]
+            )
+    submit_btn.click(infer, inputs=[video_source,target_language], outputs=result)
+
+demo.launch()
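For reference, extract_audio in app.py assembles its command through ffmpy and prints it via ff.cmd; the equivalent raw ffmpeg invocation looks roughly like the sketch below (input.mp4 and output.wav are placeholders):

# what FFmpeg(inputs={video: None}, outputs={out: '-acodec pcm_s24le -ar 48000 -q:a 0 -map a -y'}) expands to
ffmpeg -i input.mp4 -acodec pcm_s24le -ar 48000 -q:a 0 -map a -y output.wav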
checkpoints/base.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e
+size 145262807
checkpoints/mobilenet.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220
+size 1789735
checkpoints/resnet50.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+size 109497761
checkpoints/s3fd-619a316812.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543
+size 89843225
checkpoints/wav2lip_gan.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8
+size 435801865
dictator.mp4
ADDED
Binary file (438 kB)
download.sh
ADDED
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+set -ex
+
+wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
+wget -c -O checkpoints/s3fd-619a316812.pth 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth'
+
+
+# whisper
+wget -c -O checkpoints/base.pt 'https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt'
requirements.txt
ADDED
@@ -0,0 +1,19 @@
+
+ffmpy
+elevenlabs
+openai-whisper
+openai
+gradio
+
+
+httpx==0.24.1
+numpy==1.23.4
+librosa==0.7.0
+tqdm==4.45.0
+numba==0.48
+mediapipe==0.8.11
+opencv-python==4.6.0.66
+--extra-index-url=https://download.pytorch.org/whl/cu116
+torch==1.12.1+cu116
+--extra-index-url=https://download.pytorch.org/whl/cu116
+torchvision==0.13.1+cu116
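The pinned torch/torchvision cu116 wheels match the CUDA 11.6.2 base image. A minimal sketch of installing the same set outside Docker, assuming Python 3.8 as in the Dockerfile (the batch-face install mirrors the Dockerfile's extra step):

pip install --upgrade pip
pip install git+https://github.com/elliottzheng/batch-face.git@master
pip install --no-cache-dir -r requirements.txt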
run.sh
ADDED
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -ex
+
+pad_top=0
+pad_bottom=20
+pad_left=0
+pad_right=0
+rescaleFactor=1
+
+python3 inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face './musk.mp4' --audio './musk.mp3' --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth --outfile './output_video.mp4'
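run.sh points at ./musk.mp4 and ./musk.mp3, which are not part of this commit. A sketch of the same invocation against the committed sample video, assuming a separately supplied audio track (audio.wav is a placeholder):

python3 inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face './dictator.mp4' --audio './audio.wav' --pads 0 20 0 0 --resize_factor 1 --nosmooth --outfile './output_video.mp4'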