leolxliu committed
Commit 0b94fa2 · 1 Parent(s): a8f923d
.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+
+
+
Dockerfile ADDED
@@ -0,0 +1,61 @@
+ FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ # install python via pyenv
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     make \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     wget \
+     curl \
+     llvm \
+     libncurses5-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libffi-dev \
+     liblzma-dev \
+     git \
+     ca-certificates \
+     libgl1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
+ ARG PYTHON_VERSION=3.8
+
+ RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
+     pyenv install $PYTHON_VERSION && \
+     pyenv global $PYTHON_VERSION
+
+ # install deps
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ ENV HOME=/root \
+     CUDA_HOME=/usr/local/cuda \
+     PATH=/root/.local/bin:$PATH \
+     LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} \
+     LIBRARY_PATH=${CUDA_HOME}/lib64/stubs:${LIBRARY_PATH}
+
+
+ WORKDIR $HOME/app
+ RUN git clone -b master https://github.com/Rudrabha/Wav2Lip $HOME/app
+ COPY . $HOME/app/
+
+
+ COPY ./checkpoints/s3fd-619a316812.pth $HOME/app/face_detection/detection/sfd/s3fd.pth
+
+ RUN pip install --upgrade pip
+ RUN pip install git+https://github.com/elliottzheng/batch-face.git@master
+ RUN pip install --no-cache-dir -r requirements.txt
+ ENV PYTHONUNBUFFERED=1
+
+ CMD ["python", "app.py"]
+
+
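Note that the `COPY ./checkpoints/s3fd-619a316812.pth ...` step reads from the build context, so the weights have to exist locally before the image is built. A minimal sketch of the expected order (the `ai:v1` tag is taken from the README change below, not from this Dockerfile):

# fetch the model weights into ./checkpoints first (download.sh is added later in this commit)
bash download.sh
# then build; the COPY steps pick the checkpoints up from the build context
docker build -t ai:v1 .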
README.md CHANGED
@@ -1,11 +1,17 @@
  ---
- title: Video Translation
- emoji: 👀
- colorFrom: blue
+ title: Ai Translation
+ emoji: 📊
+ colorFrom: yellow
  colorTo: pink
- sdk: docker
+ sdk: gradio
+ sdk_version: 3.50.2
+ app_file: app.py
  pinned: false
  license: mit
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+ docker run --gpus all -it ai:v1 /bin/bash
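The `docker run ... /bin/bash` line above only drops into a shell; the image's own `CMD ["python", "app.py"]` starts the demo. A hedged sketch for serving it directly, assuming gradio's default port 7860 and using the GRADIO_SERVER_NAME env var, since `demo.launch()` in app.py binds localhost by default:

docker run --gpus all -p 7860:7860 -e GRADIO_SERVER_NAME=0.0.0.0 ai:v1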
app.py ADDED
@@ -0,0 +1,210 @@
+
+
+ import tempfile
+ import os
+ import uuid
+ import time
+ import subprocess
+
+
+ import openai
+
+ import whisper
+ from ffmpy import FFmpeg
+ import gradio as gr
+ from elevenlabs import clone, generate, get_api_key, set_api_key
+
+
+ css = """
+ #col-container{
+     margin: 0 auto;
+     max-width: 840px;
+     text-align: left;
+ }
+ """
+
+
+ # System prompt (Chinese): "You are a professional video subtitle translator. Translate the
+ # text below into {{target_lang}}; keep numbers and line breaks, do not invent content, and
+ # output nothing besides the translation."
+ default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'
+
+
+ openai.api_type = 'azure'
+ openai.api_base = 'https://tencent-openai01.openai.azure.com'
+ openai.api_key = '49eb7c2c3acd41f4ac81fef59ceacbba'
+ openai.api_version = "2023-05-15"
+ openai.log = "debug"
+
+
+
+ #*********************************#
+ # 1. Resize the video             #
+ # 2. Extract the audio            #
+ # 3. Transcribe the audio         #
+ # 4. Translate the text           #
+ # 5. Voice synthesis              #
+ # 6. Wav2Lip                      #
+
+
+
+ start = time.perf_counter()
+ model = whisper.load_model("base", download_root='./checkpoints')
+ end = time.perf_counter()
+
+ print('whisper load model time: ', end - start)
+
+ set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
+
+ print('elevenlabs api key', get_api_key())
+
+ language_mapping = {
+     'English': '英语',
+     'Spanish': '西班牙语',
+     'French': '法语',
+     'German': '德语',
+     'Italian': '意大利语',
+     'Portuguese': '葡萄牙语',
+     'Polish': '波兰语',
+     'Turkish': '土耳其语',
+     'Russian': '俄语',
+     'Dutch': '荷兰语',
+     'Czech': '捷克语',
+     'Arabic': '阿拉伯语',
+     'Chinese': '中文普通话'
+ }
+
+
+
+ def resize_video(video_source):
+
+     return video_source
+
+
+
+ def extract_audio(video_source, output_dir='./'):
+
+     output_audio = os.path.join(output_dir, 'output_original_audio.wav')
+
+     ff = FFmpeg(
+         inputs={video_source: None},
+         outputs={output_audio: '-acodec pcm_s24le -ar 48000 -q:a 0 -map a -y'}
+     )
+
+     print('ffmpeg command: ', ff.cmd)
+     ff.run()
+
+     return output_audio
+
+
+
+ def clone_audio(audio_file, audio_text):
+
+     voice = clone(
+         name=uuid.uuid4().hex,
+         description="",  # Optional
+         files=[audio_file])
+
+     print('voice: ', voice)
+     audio = generate(text=audio_text, voice=voice, model='eleven_multilingual_v2')
+
+     return audio
+
+
+ # todo
+ def translate_text(text, target_language):
+
+     target_language_name = language_mapping[target_language]
+
+     chat_completion = openai.ChatCompletion.create(
+         engine="gpt-4",
+         temperature=0.1,
+         max_tokens=2048,
+         messages=[
+             {"role": "system", "content": default_prompt.replace('{{target_lang}}', target_language_name)},
+             {"role": "user", "content": text}])
+
+     # print the completion
+     print(chat_completion.choices[0].message.content)
+
+     translated_text = chat_completion.choices[0].message.content
+
+     return translated_text
+
+
+
+ def infer(video_source, target_language):
+
+     print('video_source: ', video_source)
+
+     # check the video format
+
+     # Create a temporary directory to store the output file
+     output_dir = tempfile.mkdtemp()
+     output_video_file = os.path.join(output_dir, 'output_video.mp4')
+     print("Output file: ", output_video_file)
+
+     output_audio = extract_audio(video_source, output_dir=output_dir)
+
+     result = model.transcribe(output_audio)
+     whisper_text = result["text"]
+     whisper_language = result['language']
+
+     print("Whisper text: ", whisper_text, whisper_language)
+
+     target_language_code = language_mapping[target_language]
+
+     print("Target language code: ", target_language_code)
+
+     translated_text = translate_text(whisper_text, target_language)
+
+     print("Translated text: ", translated_text)
+
+     # voice cloning && synthesis
+     audio = clone_audio(output_audio, translated_text)
+
+     audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
+
+     with open(audio_file, 'wb') as f:
+         f.write(audio)
+
+     # render the lip-synced video
+     wav2lip = f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face {video_source} --audio {audio_file} --resize_factor 1 --nosmooth --outfile {output_video_file}"
+
+     subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
+
+     print("Video conversion successful.")
+
+     return output_video_file
+
+
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+
+         gr.Markdown("""
+         <h1 style="text-align: center;">AI Translation</h1>
+         <p style="text-align: center;">
+             This is a demo for AI Translation.
+         </p>
+         """)
+
+         with gr.Row():
+             with gr.Column():
+                 video_source = gr.Video(label="Source Video", show_label=True, interactive=True)
+                 target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!", value="English")
+
+                 submit_btn = gr.Button(value="Submit")
+
+             with gr.Column():
+                 result = gr.Video(label="Result")
+
+         with gr.Row():
+             gr.Examples(
+                 label="Video Examples",
+                 examples=['dictator.mp4'],
+                 inputs=[video_source]
+             )
+
+         submit_btn.click(infer, inputs=[video_source, target_language], outputs=result)
+
+ demo.launch()
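For a quick local run outside Docker, a minimal sketch, assuming the checkpoints were fetched with download.sh and that the hardcoded Azure OpenAI and ElevenLabs keys are replaced with working credentials:

pip install --no-cache-dir -r requirements.txt
bash download.sh
python app.py  # serves the gradio Blocks UI defined above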
checkpoints/base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e
+ size 145262807
checkpoints/mobilenet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220
+ size 1789735
checkpoints/resnet50.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+ size 109497761
checkpoints/s3fd-619a316812.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543
+ size 89843225
checkpoints/wav2lip_gan.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8
+ size 435801865
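The checkpoint entries above are Git LFS pointer files (spec v1), not the weights themselves, so a fresh clone needs either Git LFS or the download.sh script below to materialize them. A sketch with Git LFS:

git lfs install
git lfs pull --include="checkpoints/*"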
dictator.mp4 ADDED
Binary file (438 kB).
download.sh ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env bash
+
+ set -ex
+
+ wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+ wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+ wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
+ wget -c -O checkpoints/s3fd-619a316812.pth 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth'
+
+
+ # whisper
+ wget -c -O checkpoints/base.pt 'https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt'
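Since the commit also records sha256 oids in the LFS pointers above, the downloads can be verified against them; a sketch using sha256sum with those oids:

sha256sum -c - <<'EOF'
ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e  checkpoints/base.pt
2979b33ffafda5d74b6948cd7a5b9a7a62f62b949cef24e95fd15d2883a65220  checkpoints/mobilenet.pth
6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d  checkpoints/resnet50.pth
619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543  checkpoints/s3fd-619a316812.pth
ca9ab7b7b812c0e80a6e70a5977c545a1e8a365a6c49d5e533023c034d7ac3d8  checkpoints/wav2lip_gan.pth
EOF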
requirements.txt ADDED
@@ -0,0 +1,19 @@
+
+ ffmpy
+ elevenlabs
+ openai-whisper
+ openai
+ gradio
+
+
+ httpx==0.24.1
+ numpy==1.23.4
+ librosa==0.7.0
+ tqdm==4.45.0
+ numba==0.48
+ mediapipe==0.8.11
+ opencv-python==4.6.0.66
+ --extra-index-url=https://download.pytorch.org/whl/cu116
+ torch==1.12.1+cu116
+ --extra-index-url=https://download.pytorch.org/whl/cu116
+ torchvision==0.13.1+cu116
run.sh ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ set -ex
+
+ pad_top=0
+ pad_bottom=20
+ pad_left=0
+ pad_right=0
+ rescaleFactor=1
+
+ python3 inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face './musk.mp4' --audio './musk.mp3' --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth --outfile './output_video.mp4'