Spaces:

NeuralFalcon
/

Whisper-Turbo-Subtitle

Running

App Files Files Community

NeuralFalcon commited on Oct 21, 2024

Commit

54fe84c

verified ·

1 Parent(s): de8072a

Create app.py

Browse files

Files changed (1) hide show

app.py +328 -0

app.py ADDED Viewed

	@@ -0,0 +1,328 @@

+# app.py
+from utils import language_dict
+import math
+import torch
+import gc
+import time
+import subprocess
+from faster_whisper import WhisperModel
+import os
+import mimetypes
+import shutil
+import re
+import uuid
+from pydub import AudioSegment
+import torch
+def get_language_name(lang_code):
+    global language_dict
+    # Iterate through the language dictionary
+    for language, details in language_dict.items():
+        # Check if the language code matches
+        if details["lang_code"] == lang_code:
+            return language  # Return the language name
+    return None
+def clean_file_name(file_path):
+    # Get the base file name and extension
+    file_name = os.path.basename(file_path)
+    file_name, file_extension = os.path.splitext(file_name)
+    # Replace non-alphanumeric characters with an underscore
+    cleaned = re.sub(r'[^a-zA-Z\d]+', '_', file_name)
+    # Remove any multiple underscores
+    clean_file_name = re.sub(r'_+', '_', cleaned).strip('_')
+    # Generate a random UUID for uniqueness
+    random_uuid = uuid.uuid4().hex[:6]
+    # Combine cleaned file name with the original extension
+    clean_file_path = os.path.join(os.path.dirname(file_path), clean_file_name + f"_{random_uuid}" + file_extension)
+    return clean_file_path
+def get_audio_file(uploaded_file):
+    global base_path
+    # ,device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # Detect the file type (audio/video)
+    mime_type, _ = mimetypes.guess_type(uploaded_file)
+    # Create the folder path to store audio files
+    audio_folder = f"{base_path}/subtitle_audio"
+    os.makedirs(audio_folder, exist_ok=True)
+    # Initialize variable for the audio file path
+    audio_file_path = ""
+    if mime_type and mime_type.startswith('audio'):
+        # If it's an audio file, save it as is
+        audio_file_path = os.path.join(audio_folder, os.path.basename(uploaded_file))
+        audio_file_path=clean_file_name(audio_file_path)
+        shutil.copy(uploaded_file, audio_file_path)  # Move file to audio folder
+    elif mime_type and mime_type.startswith('video'):
+        # If it's a video file, extract the audio
+        audio_file_name = os.path.splitext(os.path.basename(uploaded_file))[0] + ".mp3"
+        audio_file_path = os.path.join(audio_folder, audio_file_name)
+        audio_file_path=clean_file_name(audio_file_path)
+        # Extract the file extension from the uploaded file
+        file_extension = os.path.splitext(uploaded_file)[1]  # Includes the dot, e.g., '.mp4'
+        # Generate a random UUID and create a new file name with the same extension
+        random_uuid = uuid.uuid4().hex[:6]
+        new_file_name = random_uuid + file_extension
+        # Set the new file path in the subtitle_audio folder
+        new_file_path = os.path.join(audio_folder, new_file_name)
+        # Copy the original video file to the new location with the new name
+        shutil.copy(uploaded_file, new_file_path)
+        if device=="cuda":
+          command = f"ffmpeg -hwaccel cuda -i {new_file_path} {audio_file_path} -y"
+        else:
+          command = f"ffmpeg -i {new_file_path} {audio_file_path} -y"
+        subprocess.run(command, shell=True)
+        if os.path.exists(new_file_path):
+          os.remove(new_file_path)
+    # Return the saved audio file path
+    audio = AudioSegment.from_file(audio_file_path)
+    # Get the duration in seconds
+    duration_seconds = len(audio) / 1000.0  # pydub measures duration in milliseconds
+    return audio_file_path,duration_seconds
+def format_segments(segments):
+    saved_segments = list(segments)
+    sentence_timestamp = []
+    words_timestamp = []
+    speech_to_text = ""
+    for i in saved_segments:
+        temp_sentence_timestamp = {}
+        # Store sentence information in sentence_timestamp
+        text = i.text.strip()
+        sentence_id = len(sentence_timestamp)  # Get the current index for the new entry
+        sentence_timestamp.append({
+            "id": sentence_id,  # Use the index as the id
+            "text": text,
+            "start": i.start,
+            "end": i.end,
+            "words": []  # Initialize words as an empty list within the sentence
+        })
+        speech_to_text += text + " "
+        # Process each word in the sentence
+        for word in i.words:
+            word_data = {
+                "word": word.word.strip(),
+                "start": word.start,
+                "end": word.end
+            }
+            # Append word timestamps to the sentence's word list
+            sentence_timestamp[sentence_id]["words"].append(word_data)
+            # Optionally, add the word data to the global words_timestamp list
+            words_timestamp.append(word_data)
+    return sentence_timestamp, words_timestamp, speech_to_text
+def combine_word_segments(words_timestamp, max_words_per_subtitle=8, min_silence_between_words=0.5):
+    before_translate = {}
+    id = 1
+    text = ""
+    start = None
+    end = None
+    word_count = 0
+    last_end_time = None
+    for i in words_timestamp:
+        try:
+            word = i['word']
+            word_start = i['start']
+            word_end = i['end']
+            # Check for sentence-ending punctuation
+            is_end_of_sentence = word.endswith(('.', '?', '!'))
+            # Check for conditions to create a new subtitle
+            if ((last_end_time is not None and word_start - last_end_time > min_silence_between_words)
+                or word_count >= max_words_per_subtitle
+                or is_end_of_sentence):
+                # Store the previous subtitle if there's any
+                if text:
+                    before_translate[id] = {
+                        "text": text,
+                        "start": start,
+                        "end": end
+                    }
+                    id += 1
+                # Reset for the new subtitle segment
+                text = word
+                start = word_start  # Set the start time for the new subtitle
+                word_count = 1
+            else:
+                if word_count == 0:  # First word in the subtitle
+                    start = word_start  # Ensure the start time is set
+                text += " " + word
+                word_count += 1
+            end = word_end  # Update the end timestamp
+            last_end_time = word_end  # Update the last end timestamp
+        except KeyError as e:
+            print(f"KeyError: {e} - Skipping word")
+            pass
+    # After the loop, make sure to add the last subtitle segment
+    if text:
+        before_translate[id] = {
+            "text": text,
+            "start": start,
+            "end": end
+        }
+    return before_translate
+def convert_time_to_srt_format(seconds):
+    """ Convert seconds to SRT time format (HH:MM:SS,ms) """
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    milliseconds = int((seconds - int(seconds)) * 1000)
+    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
+def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
+    # Open the file with UTF-8 encoding
+    with open(filename, 'w', encoding='utf-8') as f:
+        for id, entry in subtitles.items():
+            # Write the subtitle index
+            f.write(f"{id}\n")
+            if entry['start'] is None or entry['end'] is None:
+              print(id)
+            # Write the start and end time in SRT format
+            start_time = convert_time_to_srt_format(entry['start'])
+            end_time = convert_time_to_srt_format(entry['end'])
+            f.write(f"{start_time} --> {end_time}\n")
+            # Write the text and speaker information
+            f.write(f"{entry['text']}\n\n")
+def word_level_srt(words_timestamp, srt_path="world_level_subtitle.srt"):
+    with open(srt_path, 'w', encoding='utf-8') as srt_file:
+        for i, word_info in enumerate(words_timestamp, start=1):
+            start_time = convert_time_to_srt_format(word_info['start'])
+            end_time = convert_time_to_srt_format(word_info['end'])
+            srt_file.write(f"{i}\n{start_time} --> {end_time}\n{word_info['word']}\n\n")
+def generate_srt_from_sentences(sentence_timestamp, srt_path="default_subtitle.srt"):
+    with open(srt_path, 'w', encoding='utf-8') as srt_file:
+        for index, sentence in enumerate(sentence_timestamp):
+            start_time = convert_time_to_srt_format(sentence['start'])
+            end_time = convert_time_to_srt_format(sentence['end'])
+            srt_file.write(f"{index + 1}\n{start_time} --> {end_time}\n{sentence['text']}\n\n")
+def whisper_subtitle(uploaded_file,Source_Language,max_words_per_subtitle=8):
+  global language_dict,base_path
+  #Load model
+  if torch.cuda.is_available():
+      # If CUDA is available, use GPU with float16 precision
+      device = "cuda"
+      compute_type = "float16"
+      # compute_type="int8_float16"
+  else:
+      # If CUDA is not available, use CPU with int8 precision
+      device = "cpu"
+      compute_type = "int8"
+  faster_whisper_model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2",device=device, compute_type=compute_type)
+  audio_path,audio_duration=get_audio_file(uploaded_file)
+  if Source_Language=="Automatic":
+      segments,d = faster_whisper_model.transcribe(audio_path, word_timestamps=True)
+      lang_code=d.language
+      src_lang=get_language_name(lang_code)
+  else:
+    lang=language_dict[Source_Language]['lang_code']
+    segments,d = faster_whisper_model.transcribe(audio_path, word_timestamps=True,language=lang)
+    src_lang=Source_Language
+  if os.path.exists(audio_path):
+    os.remove(audio_path)
+  sentence_timestamp,words_timestamp,text=format_segments(segments)
+  del faster_whisper_model
+  gc.collect()
+  torch.cuda.empty_cache()
+  word_segments=combine_word_segments(words_timestamp, max_words_per_subtitle=max_words_per_subtitle, min_silence_between_words=0.5)
+  #setup srt file names
+  base_name = os.path.basename(uploaded_file).rsplit('.', 1)[0][:30]
+  save_name = f"{base_path}/generated_subtitle/{base_name}_{src_lang}.srt"
+  original_srt_name=clean_file_name(save_name)
+  original_txt_name=original_srt_name.replace(".srt",".txt")
+  word_level_srt_name=original_srt_name.replace(".srt","_word_level.srt")
+  default_srt_name=original_srt_name.replace(".srt","_default.srt")
+  generate_srt_from_sentences(sentence_timestamp, srt_path=default_srt_name)
+  word_level_srt(words_timestamp, srt_path=word_level_srt_name)
+  write_subtitles_to_file(word_segments, filename=original_srt_name)
+  with open(original_txt_name, 'w', encoding='utf-8') as f1:
+    f1.write(text)
+  return default_srt_name,original_srt_name,word_level_srt_name,original_txt_name
+#@title Using Gradio Interface
+def subtitle_maker(Audio_or_Video_File,Source_Language,max_words_per_subtitle):
+  try:
+    default_srt_path,customize_srt_path,word_level_srt_path,text_path=whisper_subtitle(Audio_or_Video_File,Source_Language,max_words_per_subtitle=max_words_per_subtitle)
+  except:
+    default_srt_path,customize_srt_path,word_level_srt_path,text_path=None,None,None,None
+  return default_srt_path,customize_srt_path,word_level_srt_path,text_path
+import gradio as gr
+import click
+base_path="."
+if not os.path.exists(f"{base_path}/generated_subtitle"):
+    os.makedirs(f"{base_path}/generated_subtitle", exist_ok=True)
+source_lang_list = ['Automatic']
+available_language=language_dict.keys()
+source_lang_list.extend(available_language)
+@click.command()
+@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+def main(debug, share):
+    # Define Gradio inputs and outputs
+    gradio_inputs = [
+        gr.File(label="Upload Audio or Video File"),
+        gr.Dropdown(label="Language", choices=source_lang_list, value="Automatic"),
+        gr.Number(label="Max Word Per Subtitle Segment", value=8)
+    ]
+    gradio_outputs = [
+        gr.File(label="Default SRT File", show_label=True),
+        gr.File(label="Customize SRT File", show_label=True),
+        gr.File(label="Word Level SRT File", show_label=True),
+        gr.File(label="Text File", show_label=True)
+    ]
+    # Create Gradio interface
+    demo = gr.Interface(fn=subtitle_maker, inputs=gradio_inputs, outputs=gradio_outputs, title="Whisper-Large-V3-Turbo-Ct2 Subtitle Maker")
+    # Launch Gradio with command-line options
+    demo.launch(debug=debug, share=share)
+if __name__ == "__main__":
+    main()