Spaces:
Running
Running
### ----------------------------------------------------------------------- | |
### Transkriber version_1.00 | |
### app.py | |
### ----------------------------------------------------------------------- | |
# ------------------------------------------------------------------------- | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ------------------------------------------------------------------------- | |
import os | |
import re | |
import uuid | |
import time | |
import psutil | |
import subprocess | |
from tqdm import tqdm | |
import tempfile | |
from fpdf import FPDF | |
from pathlib import Path | |
import numpy as np | |
import torch | |
from transformers import pipeline | |
from gpuinfo import GPUInfo | |
from pydub import AudioSegment | |
from IPython.display import Audio | |
import gradio as gr | |
import huggingface_hub | |
############################################################################### | |
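# Configuration | @version 1.05 (TODO: confirm -- the file header above says version_1.00)
# NOTE: the two lines below read like leftover assistant-prompt text rather than
# documentation of this module; kept for reference until the author confirms:
#   "You are an intelligent assistant specializing in interviews with business clients
#    for in-depth content creation, etc."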
############################################################################### | |
# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
############################################################################### | |
# Function to detect leading silence | |
############################################################################### | |
def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return the length, in milliseconds, of the leading silence in ``sound``.

    The audio is scanned from the start in windows of ``chunk_size``
    milliseconds. Scanning stops at the first window whose ``dBFS`` level is
    not below ``silence_threshold_in_decibels``, or at the end of the audio.

    Args:
        sound: Audio object that supports ``len()``, slicing, and a ``dBFS``
            attribute on its slices (a pydub ``AudioSegment`` in this file,
            which is indexed in milliseconds).
        silence_threshold_in_decibels: Windows quieter than this level count
            as silence.
        chunk_size: Window size in milliseconds; must be positive.

    Returns:
        Offset of the first non-silent window. The value never exceeds
        ``len(sound)``; for audio that is silent throughout (or empty) it
        equals ``len(sound)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # An explicit check instead of ``assert``: asserts are stripped under
    # ``python -O`` and a non-positive step would then loop forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of milliseconds")

    total_ms = len(sound)
    trim_ms = 0
    # Test the bound first so no empty slice past the end is ever measured.
    while (
        trim_ms < total_ms
        and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels
    ):
        trim_ms += chunk_size

    # The last step may overshoot when the length is not a multiple of
    # chunk_size; clamp so callers always get an offset inside the audio.
    return min(trim_ms, total_ms)
############################################################################### | |
# Trim the start of the audio file | |
############################################################################### | |
def trim_start(filepath):
    """Remove leading silence from an audio file and save the result as WAV.

    Args:
        filepath: Path of the audio file to read.

    Returns:
        A ``(trimmed, new_filename)`` tuple: the trimmed pydub
        ``AudioSegment`` and the ``Path`` of the WAV file it was written to.
        The file is created next to the input and named
        ``trimmed_<original stem>.wav``.
    """
    source_path = Path(filepath)

    # No explicit ``format``: pydub then picks the decoder from the file
    # extension / lets ffmpeg probe the content, so uploads that are not WAV
    # (mp3, m4a, ...) load too. ``.wav`` inputs are decoded exactly as before.
    audio = AudioSegment.from_file(filepath)

    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]

    # The export is always WAV, so give the file a matching extension rather
    # than reusing the source's (which would yield e.g. "trimmed_x.mp3"
    # containing WAV data).
    new_filename = source_path.parent / f"trimmed_{source_path.stem}.wav"
    trimmed.export(new_filename, format="wav")
    return trimmed, new_filename
############################################################################### | |
# -- segment the audio into smaller parts (1-minute segments for large files) | |
############################################################################### | |
def segment_audio(trimmed_audio, output_dir_trimmed, segment_length_ms=60_000):
    """Split audio into consecutive fixed-length WAV segments on disk.

    Args:
        trimmed_audio: Audio object that supports ``len()``, slicing, and
            ``export(path, format=...)`` on its slices (a pydub
            ``AudioSegment`` in this file, indexed in milliseconds).
        output_dir_trimmed: Directory that receives the segment files. It is
            created when missing; a falsy value means the current working
            directory.
        segment_length_ms: Length of each segment in milliseconds (one minute
            by default). The last segment may be shorter.

    Returns:
        List of segment file paths (``str``) in playback order, named
        ``trimmed_00.wav``, ``trimmed_01.wav``, ... Empty when the audio has
        zero length.

    Raises:
        ValueError: If ``segment_length_ms`` is not positive.
    """
    if segment_length_ms <= 0:
        raise ValueError("segment_length_ms must be positive")

    # Write into the caller-supplied directory so that parallel requests that
    # pass different directories do not overwrite each other's segments.
    target_dir = Path(output_dir_trimmed) if output_dir_trimmed else Path(".")
    target_dir.mkdir(parents=True, exist_ok=True)

    segmented_files = []
    offsets = range(0, len(trimmed_audio), segment_length_ms)
    for index, offset in enumerate(offsets):
        segment = trimmed_audio[offset:offset + segment_length_ms]
        segment_path = str(target_dir / f"trimmed_{index:02d}.wav")
        segment.export(segment_path, format="wav")
        segmented_files.append(segment_path)
    return segmented_files
############################################################################### | |
# Transcription logic | |
############################################################################### | |
# Lazily created, process-wide speech-recognition pipeline (see _get_asr_pipeline).
_ASR_PIPELINE = None


def _get_asr_pipeline():
    """Return the shared NB-Whisper pipeline, building it on first use."""
    global _ASR_PIPELINE
    if _ASR_PIPELINE is None:
        _ASR_PIPELINE = pipeline(
            "automatic-speech-recognition",
            model="NbAiLab/nb-whisper-large",
            chunk_length_s=30,
            device=device,
        )
    return _ASR_PIPELINE


def _remove_quietly(path):
    """Delete ``path`` on a best-effort basis; a failed delete is not an error."""
    try:
        os.remove(path)
    except OSError:
        # Deliberately ignored: cleanup must never mask the transcription result.
        pass


def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
    """Transcribe an uploaded audio file and report simple run statistics.

    The audio is stripped of leading silence, cut into segments, and each
    segment is passed through the speech-recognition pipeline. Intermediate
    files are removed afterwards.

    Args:
        file_upload: Path of the uploaded audio file, as supplied by the
            ``gr.Audio(type="filepath")`` input.
        progress: Gradio progress tracker. It is not used directly in the
            body; it is declared so Gradio can attach progress reporting
            (``track_tqdm=True``) to this handler.

    Returns:
        A ``(text, system_info)`` tuple: the transcription and a
        multi-line string with processing time, word count and CPU usage.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if not file_upload:
        raise gr.Error("Please upload an audio file before transcribing.")

    start_time = time.time()

    # -- trim audio, then segment it for processing
    trimmed_audio, trimmed_filename = trim_start(file_upload)

    # A per-request scratch directory keeps parallel requests apart.
    with tempfile.TemporaryDirectory(prefix="trimmed_audio_") as segment_dir:
        segmented_files = []
        try:
            segmented_files = segment_audio(trimmed_audio, segment_dir)
            asr = _get_asr_pipeline()
            transcriptions = [asr(seg_file)["text"] for seg_file in segmented_files]
        finally:
            # Remove the returned paths explicitly as well, in case the
            # segments were written somewhere other than segment_dir.
            for leftover in [*segmented_files, trimmed_filename]:
                _remove_quietly(leftover)

    # Join with a single space so words on either side of a segment boundary
    # cannot run together; empty (silent) segments are skipped.
    pieces = (piece.strip() for piece in transcriptions)
    text = " ".join(piece for piece in pieces if piece)

    output_time = time.time() - start_time

    # -- word count
    word_count = len(text.split())
    # -- CPU metric (sampled over one second, after processing has finished)
    cpu_usage = psutil.cpu_percent(interval=1)
    # -- system info string
    system_info = (
        "\n"
        f"Processing time: {output_time:.2f} seconds.\n"
        f"Number of words: {word_count}\n"
        f"CPU Usage: {cpu_usage}%\n"
    )
    return text, system_info
############################################################################### | |
# Interface | |
############################################################################### | |
# Markdown headline shown at the top of the page.
HEADER_INFO = (
    "# This space uses the *Norwegian NB-Whisper Large* model by **NbAiLab** "
    "to transcribe long-form microphone or audio inputs in Norwegian of arbitrary length."
)

# Styling for the two output boxes, matched through their elem_id values.
css = """
#transcription_output textarea {
background-color: #000000; /* black */
color: #00FF00 !important; /* text color */
font-size: 18px; /* font size */
}
#system_info_box textarea {
background-color: #ffe0b3; /* orange */
color: black !important; /* text color */
font-size: 16px; /* font size */
font-weight: bold; /* bold font */
}
"""

# Footer badges ("Open Source" and "License: Apache 2.0"), rendered via Markdown.
_FOOTER_BADGES = '''
<div style="text-align:center;">
<a href="https://opensource.com/resources/what-open-source" style="display: inline-block;">
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!" style="vertical-align: middle;">
</a>
<span style="display:inline-block; width: 20px;"></span> <!-- This adds space between the logos -->
<a href="https://opensource.org/licenses/Apache-2.0" style="display: inline-block;">
<img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0" style="vertical-align: middle;">
</a>
</div>
'''

# NOTE(review): the nesting below is reconstructed from statement order; the
# original indentation was not available -- confirm the layout renders as intended.
with gr.Blocks(css=css) as iface:
    gr.Markdown(HEADER_INFO)

    # Row 1: file upload and the button that starts the transcription.
    with gr.Row():
        upload = gr.Audio(label="Upload audio", sources="upload", type="filepath")
        transcribe_btn = gr.Button("Transkriber")

    # Row 2: wide transcription box next to a narrower statistics box.
    with gr.Row():
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transkribert Tekst",
                placeholder="t r a n s c r i p t i o",
                elem_id="transcription_output",
            )
        with gr.Column(scale=1):
            system_info = gr.Textbox(
                label="Antall sekunder, ord, system data:",
                elem_id="system_info_box",
            )

    # Row 3: footer badges.
    with gr.Row():
        gr.Markdown(_FOOTER_BADGES)

    # Clicking the button sends the uploaded file path to transcribe() and
    # routes its two return values to the two text boxes.
    transcribe_btn.click(transcribe, [upload], [text_output, system_info])

iface.launch(debug=True)