Spaces:

biodatlab
/

whisper-thai-yt-subtitles

Runtime error

App Files Files Community

tensorops committed on Jan 1, 2023

Commit

0c38b7e

1 Parent(s): ac3a02e

Add application file

Browse files

Files changed (2) hide show

README.md +2 -0
app.py +307 -0

README.md CHANGED Viewed

@@ -10,4 +10,6 @@ pinned: false
 license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: mit
 ---
+Adapted from and credits to https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import torch
+import psutil
+from pytube import YouTube
+import time
+import re
+import pandas as pd
+import pysrt
+from pathlib import Path
+import gradio as gr
+import os
+import requests
+import json
+import base64
+# --- One-time environment setup, executed when the module is imported ---
+# Clone and build whisper.cpp, then download the Thai ggml model into the
+# current working directory.
+# NOTE(review): the os.system() exit codes are ignored, so a failed clone,
+# build or download only shows up later when ./whisper.cpp/main or the model
+# file is actually used.
+os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+os.system('make -C ./whisper.cpp')
+os.system('wget https://huggingface.co/datasets/tensorops/ggml-whisper-medium-th-combined/resolve/main/ggml-whisper-medium-th-combined.bin')
+# Export the CPU count reported by psutil as the OpenMP thread count.
+num_cores = psutil.cpu_count()
+os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
+# NOTE(review): transcribe_options is not referenced by any code visible in
+# this file; the whisper.cpp command line below does not read it.
+transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
+# The torch device is only printed here; the visible functions run inference
+# through the whisper.cpp binary rather than through torch.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("DEVICE IS: ")
+print(device)
+# Create ./videos_out up front.
+# NOTE(review): no function visible in this file writes to this directory.
+videos_out_path = Path("./videos_out")
+videos_out_path.mkdir(parents=True, exist_ok=True)
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by(
+        'resolution').desc().first().download()
+    return abs_video_path
+def speech_to_text(video_file_path):
+    """
+    # Youtube with translated subtitles using OpenAI Whisper models.
+    # Currently supports only Thai audio
+    This space allows you to:
+    1. Download youtube video with a given url
+    2. Watch it in the first video component
+    3. Run automatic speech recognition on the video using fast Whisper models
+    4. Burn the transcriptions to the original video and watch the video in the 2nd video component
+    Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+    This space is using c++ implementation by https://github.com/ggerganov/whisper.cpp
+    """
+    if (video_file_path == None):
+        raise ValueError("Error no video input")
+    print(video_file_path)
+    try:
+        _, file_ending = os.path.splitext(f'{video_file_path}')
+        print(f'file enging is {file_ending}')
+        print("starting conversion to wav")
+        os.system(
+            f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{video_file_path.replace(file_ending, ".wav")}"')
+        print("conversion to wav ready")
+        print("starting whisper c++")
+        srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+        os.system(f'rm -f {srt_path}')
+        os.system(
+            f'./whisper.cpp/main "{video_file_path.replace(file_ending, ".wav")}" -t 4 -l "th" -m ./ggml-whisper-medium-th-combined.bin -osrt')
+        print("starting whisper done with whisper")
+    except Exception as e:
+        raise RuntimeError("Error converting video to audio")
+    try:
+        df = pd.DataFrame(columns=['start', 'end', 'text'])
+        srt_path = str(video_file_path.replace(file_ending, ".wav")) + ".srt"
+        subs = pysrt.open(srt_path)
+        objects = []
+        for sub in subs:
+            start_hours = str(str(sub.start.hours) + "00")[0:2] if len(
+                str(sub.start.hours)) == 2 else str("0" + str(sub.start.hours) + "00")[0:2]
+            end_hours = str(str(sub.end.hours) + "00")[0:2] if len(
+                str(sub.end.hours)) == 2 else str("0" + str(sub.end.hours) + "00")[0:2]
+            start_minutes = str(str(sub.start.minutes) + "00")[0:2] if len(
+                str(sub.start.minutes)) == 2 else str("0" + str(sub.start.minutes) + "00")[0:2]
+            end_minutes = str(str(sub.end.minutes) + "00")[0:2] if len(
+                str(sub.end.minutes)) == 2 else str("0" + str(sub.end.minutes) + "00")[0:2]
+            start_seconds = str(str(sub.start.seconds) + "00")[0:2] if len(
+                str(sub.start.seconds)) == 2 else str("0" + str(sub.start.seconds) + "00")[0:2]
+            end_seconds = str(str(sub.end.seconds) + "00")[0:2] if len(
+                str(sub.end.seconds)) == 2 else str("0" + str(sub.end.seconds) + "00")[0:2]
+            start_millis = str(str(sub.start.milliseconds) + "000")[0:3]
+            end_millis = str(str(sub.end.milliseconds) + "000")[0:3]
+            objects.append([sub.text, f'{start_hours}:{start_minutes}:{start_seconds}.{start_millis}',
+                           f'{end_hours}:{end_minutes}:{end_seconds}.{end_millis}'])
+        for object in objects:
+            srt_to_df = {
+                'start': [object[1]],
+                'end': [object[2]],
+                'text': [object[0]]
+            }
+            df = pd.concat([df, pd.DataFrame(srt_to_df)])
+        df.to_csv('subtitles.csv', index=False)
+        print("Starting SRT-file creation")
+        df.reset_index(inplace=True)
+        with open('subtitles.vtt', 'w', encoding="utf-8") as file:
+            print("Starting WEBVTT-file creation")
+            for i in range(len(df)):
+                if i == 0:
+                    file.write('WEBVTT')
+                    file.write('\n')
+                else:
+                    file.write(str(i+1))
+                    file.write('\n')
+                    start = df.iloc[i]['start']
+                    file.write(f"{start.strip()}")
+                    stop = df.iloc[i]['end']
+                    file.write(' --> ')
+                    file.write(f"{stop}")
+                    file.write('\n')
+                    file.writelines(df.iloc[i]['text'])
+                    if int(i) != len(df)-1:
+                        file.write('\n\n')
+        print("WEBVTT DONE")
+        with open('subtitles.srt', 'w', encoding="utf-8") as file:
+            print("Starting SRT-file creation")
+            for i in range(len(df)):
+                file.write(str(i+1))
+                file.write('\n')
+                start = df.iloc[i]['start']
+                file.write(f"{start.strip()}")
+                stop = df.iloc[i]['end']
+                file.write(' --> ')
+                file.write(f"{stop}")
+                file.write('\n')
+                file.writelines(df.iloc[i]['text'])
+                if int(i) != len(df)-1:
+                    file.write('\n\n')
+        print("SRT DONE")
+        subtitle_files = ['subtitles.vtt', 'subtitles.srt', 'subtitles.csv']
+        return df, subtitle_files
+    except Exception as e:
+        raise RuntimeError("Error Running inference with local model", e)
+def burn_srt_to_video(srt_file, video_in):
+    print("Starting creation of video wit srt")
+    try:
+        video_out = video_in.replace('.mp4', '_out.mp4')
+        print(os.system('ls -lrth'))
+        print(video_in)
+        print(video_out)
+        command = 'ffmpeg -i "{}" -y -vf subtitles=./subtitles.srt "{}"'.format(
+            video_in, video_out)
+        os.system(command)
+        return video_out
+    except Exception as e:
+        print(e)
+        return video_out
+def create_video_player(subtitle_files, video_in):
+    with open(video_in, "rb") as file:
+        video_base64 = base64.b64encode(file.read())
+    with open('./subtitles.vtt', "rb") as file:
+        subtitle_base64 = base64.b64encode(file.read())
+    video_player = f'''<video id="video" controls preload="metadata">
+      <source src="data:video/mp4;base64,{str(video_base64)[2:-1]}" type="video/mp4" />
+      <track
+        label="Thai"
+        kind="subtitles"
+        srclang="th"
+        src="data:text/vtt;base64,{str(subtitle_base64)[2:-1]}"
+        default />
+    </video>
+    '''
+    return video_player
+# ---- Gradio Layout -----
+# The components are created here, outside the Blocks context, and are placed
+# on the page later by calling .render() inside `with demo:`.
+video_in = gr.Video(label="Video file", mirror_webcam=False)
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+# NOTE(review): video_out is never rendered or wired to an event in the
+# visible code.
+video_out = gr.Video(label="Video Out", mirror_webcam=False)
+# Empty frame used as the initial value of the dataframe components.
+df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])
+transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(
+    0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+# NOTE(review): transcription_and_translation_df is never rendered or wired
+# to an event in the visible code.
+transcription_and_translation_df = gr.DataFrame(
+    value=df_init, label="Transcription and translation dataframe", max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+# Read-only download list; it is an output of the "Step 2" click below.
+subtitle_files = gr.File(
+    label="Download srt-file",
+    file_count="multiple",
+    type="file",
+    interactive=False,
+)
+# Placeholder HTML that the "Step 3" click replaces with the player markup.
+video_player = gr.HTML(
+    '<p>video will be played here after you press the button at step 3')
+# The second CSS rule uses an escaped selector; presumably it targets the
+# element with id "13" -- TODO confirm against the rendered page.
+demo = gr.Blocks(css='''
+#cut_btn, #reset_btn { align-self:stretch; }
+#\\31 3 { max-width: 540px; }
+.output-markdown {max-width: 65ch !important;}
+''')
+# NOTE(review): the effect of this attribute is not visible in this file.
+demo.encrypt = False
+with demo:
+    # NOTE(review): transcription_var is not used by any event in the visible
+    # code.
+    transcription_var = gr.Variable()
+    # Intro row: what the space does, plus example URLs.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ### This space allows you to:
+            ##### 1. Download youtube video with a given URL
+            ##### 2. Watch it in the first video component
+            ##### 3. Run automatic Thai speech recognition on the video using Whisper
+            ##### 4. Burn the translations to the original video and watch the video in the 2nd video component
+            ''')
+        with gr.Column():
+            gr.Markdown('''
+            ### 1. Insert Youtube URL below. Some test videos below:
+            ##### 1. https://www.youtube.com/watch?v=UIHPIESyIXM
+            ##### 2. https://www.youtube.com/watch?v=YlfaFK7OFUo
+            ''')
+    # Step 1: get_youtube(youtube_url_in) -> video_in.
+    with gr.Row():
+        with gr.Column():
+            youtube_url_in.render()
+            download_youtube_btn = gr.Button("Step 1. Download Youtube video")
+            download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                video_in])
+            # Runs once while the layout is built, so it prints the component
+            # object rather than a user-supplied value.
+            print(video_in)
+    # Step 2: speech_to_text(video_in) -> transcription_df, subtitle_files.
+    with gr.Row():
+        with gr.Column():
+            video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription process.
+                ##### Be aware that processing will take some time.
+                ''')
+            transcribe_btn = gr.Button("Step 2. Transcribe audio")
+            transcribe_btn.click(speech_to_text, [
+                                 video_in], [transcription_df, subtitle_files])
+    with gr.Row():
+        gr.Markdown('''
+        ##### Here you will get transcription output
+        ##### ''')
+    with gr.Row():
+        with gr.Column():
+            transcription_df.render()
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                '''##### From here, you can download the transcription output in different formats. ''')
+            subtitle_files.render()
+    # Step 3: create_video_player(subtitle_files, video_in) -> video_player.
+    # NOTE(review): burn_srt_to_video is not attached to any button here.
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            ##### Now press the Step 3. Button to create output video with translated transcriptions
+            ##### ''')
+            create_video_button = gr.Button(
+                "Step 3. Create and add subtitles to video")
+            # Same as above: prints the component object at build time.
+            print(video_in)
+            create_video_button.click(create_video_player, [subtitle_files, video_in], [
+                video_player])
+            video_player.render()
+# Start the Gradio app.
+demo.launch()