marquesafonso committed on
Commit
ffa3aaf
·
1 Parent(s): 655abb7

add mvp api with desired functionalities

Browse files
Files changed (5) hide show
  1. Pipfile +15 -0
  2. Pipfile.lock +0 -0
  3. app.py +21 -0
  4. requirements.txt +0 -0
  5. src/transcriber.py +59 -0
Pipfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ faster-whisper = "*"
8
+ gradio = "*"
9
+ moviepy = "*"
10
+
11
+ [dev-packages]
12
+
13
+ [requires]
14
+ python_version = "3.11"
15
+ python_full_version = "3.11.9"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.transcriber import transcriber
3
+
4
def main():
    """Build and launch the Gradio UI for the transcriber.

    The page exposes a video upload, a "max words per line" number field and
    a task selector; submitting calls ``transcriber`` and shows the plain-text
    transcription plus the generated SRT file.
    """
    with gr.Blocks(analytics_enabled=False, title='multilang-asr-transcriber') as demo:
        gr.Markdown('# multilang-asr-transcriber')
        gr.Markdown('### A multilingual automatic speech transcription tool using [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Supports translation to english and user setting of max words per line.',)
        video_file = gr.File(file_types=["video"], type="filepath")
        # precision=0 makes Gradio hand the callback an int instead of a float,
        # and minimum=1 rejects zero/negative limits that can never be reached
        # when grouping words into subtitle lines.
        max_words_per_line = gr.Number(
            value=6,
            label="Max words per line",
            precision=0,
            minimum=1,
        )
        task = gr.Dropdown(choices=["transcribe", "translate"], value="transcribe", label="Select Task")
        text_output = gr.Textbox(label="Text transcription")
        srt_file = gr.File(file_count="single", file_types=[".srt"], label="SRT file")
        gr.Interface(
            transcriber,
            inputs=[video_file, max_words_per_line, task],
            outputs=[text_output, srt_file],
            allow_flagging="never",
            analytics_enabled=False,
        )
    demo.launch()


if __name__ == '__main__':
    main()
requirements.txt ADDED
Binary file (6.88 kB). View file
 
src/transcriber.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from faster_whisper import WhisperModel
4
+ from moviepy.editor import VideoFileClip
5
+
6
def convert_video_to_audio(video_input):
    """Extract the audio track of a video into an AAC-encoded ``.m4a`` file.

    The audio file is written next to the video and shares its base name.

    Args:
        video_input: Path to the video file.

    Returns:
        The normalized path of the ``.m4a`` file that was written.

    Raises:
        ValueError: If the video contains no audio track.
    """
    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "/tmp/run.1/my.clip.mp4") no longer truncate the output name the
    # way ``split('.')[0]`` did.
    base_path, _ = os.path.splitext(video_input)
    audio_clip_filepath = os.path.normpath(f"{base_path}.m4a")

    video_clip = VideoFileClip(video_input)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_input!r}")
        try:
            audio_clip.write_audiofile(audio_clip_filepath, codec='aac')
        finally:
            audio_clip.close()
    finally:
        # Always release the video clip, even when encoding fails.
        video_clip.close()
    return audio_clip_filepath
14
+
15
def convert_seconds_to_time(seconds):
    """Format a duration in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Duration in seconds; anything ``float()`` accepts.

    Returns:
        The timestamp string, rounded to the nearest millisecond.
    """
    # Convert to whole milliseconds first. Subtracting floats (1.001 - 1)
    # leaves 0.000999..., which truncation would render as ",000" rather
    # than ",001".
    total_milliseconds = round(float(seconds) * 1000)
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"
22
+
23
def write_srt(segments, max_words_per_line, srt_path):
    """Write word-timestamped segments to an SRT file.

    Each segment's words are grouped into subtitle entries of at most
    ``max_words_per_line`` words. An entry never spans two segments.

    Args:
        segments: Iterable of objects exposing ``words``; each word exposes
            ``start``, ``end`` and ``word``.
        max_words_per_line: Maximum words per subtitle entry. Fractional
            values are truncated. ``None``, non-numeric values and values
            below 1 mean "no limit", i.e. one entry per segment.
        srt_path: Destination path of the SRT file (written as UTF-8).

    Returns:
        A ``(srt_text, srt_path)`` tuple.
    """
    # UI number fields may deliver floats or None; an exact ``==`` against
    # such a value would never trigger a line break for e.g. 2.5.
    try:
        limit = int(max_words_per_line)
    except (TypeError, ValueError):
        limit = 0

    entries = []
    for segment in segments:
        # ``words`` may be None when word timestamps were not produced.
        words = segment.words or []
        step = limit if limit >= 1 else max(len(words), 1)
        for offset in range(0, len(words), step):
            chunk = words[offset:offset + step]
            start_time = convert_seconds_to_time(chunk[0].start)
            end_time = convert_seconds_to_time(chunk[-1].end)
            line_text = ' '.join(word.word.strip() for word in chunk)
            entries.append(
                f"{len(entries) + 1}\n{start_time} --> {end_time}\n{line_text}\n\n"
            )

    result = ''.join(entries)
    with open(srt_path, "w", encoding='utf-8') as file:
        file.write(result)
    return result, srt_path
43
+
44
def transcriber(video_input: str,
                max_words_per_line: int,
                task: str):
    """Transcribe (or translate) a video's speech and write it as an SRT file.

    Args:
        video_input: Path to the uploaded video file.
        max_words_per_line: Maximum number of words per subtitle entry.
        task: Task passed to the model, e.g. "transcribe" or "translate".

    Returns:
        A ``(srt_text, srt_path)`` tuple as produced by ``write_srt``.

    Raises:
        gr.Error: If no video file was provided.
    """
    if not video_input:
        raise gr.Error("Please upload a video file.")

    # splitext strips only the final extension, so dots in directory names or
    # in the file's base name do not truncate the SRT path.
    base_path, _ = os.path.splitext(video_input)
    srt_filepath = os.path.normpath(f"{base_path}.srt")
    audio_input = convert_video_to_audio(video_input)

    # NOTE(review): the model is constructed on every call; consider caching
    # it at module level if request latency matters.
    model_size = "large-v3"
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, _ = model.transcribe(
        audio_input,
        beam_size=5,
        task=task,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
        word_timestamps=True,
    )
    return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath)