|
import gradio as gr |
|
import numpy as np |
|
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio |
|
import torch |
|
import pandas as pd |
|
import gdown |
|
|
|
def process_audio(audio_input, window_size_samples):
    """Compute speech probabilities for an audio file and plot them.

    Args:
        audio_input: Path of the audio file to analyse (the UI passes the
            value of a ``gr.Audio(type="filepath")`` component).
        window_size_samples: Window size, in samples, handed to
            ``get_speech_probs``.

    Returns:
        A 3-tuple ``(figure, probs, audio_length_samples)``: the result of
        ``make_visualization``, the probabilities returned by
        ``get_speech_probs``, and ``len()`` of the loaded waveform.

    Raises:
        gr.Error: If no audio has been provided yet.
    """
    if audio_input is None:
        # Surface a readable message in the UI instead of failing inside
        # read_audio with an opaque traceback.
        raise gr.Error("Please upload or download an audio file first.")

    sampling_rate = 16_000

    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    audio_length_samples = len(wav)
    probs = get_speech_probs(
        wav,
        window_size_samples=window_size_samples,
        sampling_rate=sampling_rate,
    )

    # The plot step must follow the selected window size; a fixed 512 would
    # mis-scale the time axis for the 1024 and 1536 options.
    # NOTE(review): assumes get_speech_probs yields one probability per
    # window of `window_size_samples` samples — confirm in vad_utils.
    step_seconds = window_size_samples / sampling_rate
    figure = make_visualization(probs, step_seconds)
    return figure, probs, audio_length_samples
|
|
|
def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Turn speech probabilities into timestamps and export them as a TSV file.

    Args:
        probs: Speech probabilities as produced by ``process_audio``.
        audio_length_samples: Length of the analysed audio in samples.
        threshold: Speech threshold forwarded to ``probs2speech_timestamps``.
        min_speech_duration_ms: Minimum speech duration. Despite the name,
            the value is received in **seconds** (the UI field is labelled
            "(s)") and is multiplied by 1000 before being forwarded.
        min_silence_duration_ms: Minimum silence duration, likewise received
            in seconds and converted to milliseconds here.
        window_size_samples: Window size in samples, forwarded unchanged.
        speech_pad_ms: Speech padding, forwarded unchanged.

    Returns:
        A 2-tuple ``(path, df)``: the path of the written file
        (``"timestamps.txt"``) and the DataFrame that was written.

    Raises:
        gr.Error: If the speech probabilities have not been computed yet.
    """
    if probs is None or audio_length_samples is None:
        # Both values are only filled in by process_audio; without them the
        # timestamp helper would fail with an unhelpful error.
        raise gr.Error("Compute the speech probabilities before computing timestamps.")

    # Seconds -> milliseconds, using new names so the units stay unambiguous.
    min_speech_ms = min_speech_duration_ms * 1000
    min_silence_ms = min_silence_duration_ms * 1000

    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_ms,
        min_silence_duration_ms=min_silence_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=True,
        rounding=3,
    )

    df = pd.DataFrame(timestamps)
    # Empty trailing column for the user to annotate each segment.
    df["note"] = ""

    output_path = "timestamps.txt"
    # Tab-separated, no header and no index: one row per timestamp entry.
    df.to_csv(output_path, sep="\t", header=False, index=False)
    return output_path, df
|
|
|
def download_gdrive(id):
    """Download a shared Google Drive file to ``audio.wav``.

    Args:
        id: Google Drive file id taken from the share link. The parameter
            name shadows the ``id`` builtin; it is kept so existing callers
            keep working.

    Returns:
        The local path the file was saved to (``"audio.wav"``).

    Raises:
        gr.Error: If the id is empty or the download did not succeed.
    """
    output_file = "audio.wav"

    # Tolerate stray whitespace from copy/paste and an empty text box.
    file_id = (id or "").strip()
    if not file_id:
        raise gr.Error("Please enter a Google Drive file ID.")

    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_file)
    if downloaded is None:
        # Without this check a failed download would still return the path,
        # which may point at a stale file from an earlier request.
        raise gr.Error("Download failed. Check the file ID and that the file is shared publicly.")

    return output_file
|
|
|
def main():
    """Build the Gradio Blocks interface for the VAD demo and launch it.

    The UI wires three actions:

    * "Download Audio" calls ``download_gdrive`` and fills the audio input.
    * "Compute Speech Probabilities" calls ``process_audio`` and stores the
      probabilities and audio length in state components.
    * "Compute Speech Timestamps" calls ``process_parameters`` and shows the
      resulting file and table.
    """
    # NOTE(review): the component nesting below was reconstructed from a copy
    # of this file whose indentation had been lost — confirm the row/column
    # layout against the intended design.
    with gr.Blocks() as demo:
        # State holders that carry process_audio's outputs to
        # process_parameters.
        probs = gr.State()
        audio_length_samples = gr.State()

        with gr.Row():
            info = (
                "Input the Google Drive file id from the shared link.\n"
                "It comes after https://drive.google.com/file/d/ <id here>.\n"
                "For example the link "
                "https://drive.google.com/file/d/15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8/view?usp=drive_link "
                "has id 15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8"
            )
            gdrive_str = gr.Text(label="File ID", info=info)
            download_button = gr.Button("Download Audio")

        with gr.Row():
            audio_input = gr.Audio(type="filepath")
            with gr.Column():
                gr.Markdown("[Parameter Documentation](https://github.com/snakers4/silero-vad/blob/master/utils_vad.py#L198)")
                window_size_samples = gr.Dropdown(
                    label="Window Size (samples)",
                    choices=[512, 1024, 1536],
                    value=512,
                )
                button1 = gr.Button("Compute Speech Probabilities")
        figure = gr.Plot()

        download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)

        button1.click(
            process_audio,
            inputs=[audio_input, window_size_samples],
            outputs=[figure, probs, audio_length_samples],
        )

        with gr.Row():
            threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
            # The two duration fields are entered in seconds;
            # process_parameters converts them to milliseconds.
            min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (s)", value=10.5)
            min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (s)", value=5.5)
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
        button2 = gr.Button("Compute Speech Timestamps")
        output_file = gr.File()
        with gr.Row():
            output_df = gr.DataFrame()

        button2.click(
            process_parameters,
            inputs=[
                probs,
                audio_length_samples,
                threshold,
                min_speech_duration_ms,
                min_silence_duration_ms,
                window_size_samples,
                speech_pad_ms,
            ],
            outputs=[output_file, output_df],
        )

    demo.launch()
|
|
|
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|