Spaces:
Running
Running
import gradio as gr | |
import os | |
import random2 | |
from spleeter.separator import Separator | |
from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM | |
# Spleeter configuration: two stems (vocals + accompaniment), "16kHz" variant.
# NOTE(review): the original comment called this a "16khz bitrate, required
# for ASR"; in Spleeter the suffix presumably selects the separation
# bandwidth rather than the output sample rate -- confirm against the docs.
SPLEETER_CONFIG = "spleeter:2stems-16kHz"

# English wav2vec2 checkpoint used for speech-to-text:
# https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
ASR_MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

# Both models are created once at import time and reused by every request.
separator = Separator(params_descriptor=SPLEETER_CONFIG)
pipe = pipeline(task="automatic-speech-recognition", model=ASR_MODEL_ID)
# Gradio callback: split the upload into stems, transcribe the vocals and
# return the stem file paths together with the transcript text.
def extract_stems(audio):
    """Separate an audio file into two stems and transcribe the vocal stem.

    Args:
        audio: Filesystem path of the audio file to process (the Gradio
            input component is declared with ``type="filepath"``).

    Returns:
        A ``(vocals, accompaniment, transcript)`` tuple: the paths of the two
        WAV stems written under ``./output/<unique folder>/`` and the
        recognised text of the vocal stem.

    Raises:
        ValueError: If ``audio`` is ``None`` or empty (nothing was uploaded).
    """
    # Imported locally so this function does not depend on a file-level
    # import that the rest of the module does not need.
    import uuid

    if not audio:
        raise ValueError("No audio file was provided.")

    output_dir = "output/"
    # A UUID is unique per request; a bounded random integer could collide
    # and make two requests overwrite each other's stems.
    folder_name = uuid.uuid4().hex

    # "{instrument}" is left as a literal placeholder for Spleeter to fill in
    # with the stem name. synchronous=True blocks until the files are written,
    # which is required before they can be read back below.
    separator.separate_to_file(
        audio,
        output_dir,
        filename_format=folder_name + "/{instrument}.wav",
        synchronous=True,
    )

    stem_dir = os.path.join(".", output_dir, folder_name)
    vocals = os.path.join(stem_dir, "vocals.wav")
    accompaniment = os.path.join(stem_dir, "accompaniment.wav")

    # The ASR pipeline returns a dict whose "text" entry holds the transcript;
    # hand only that string to the UI instead of the dict's repr.
    result = pipe(vocals, chunk_length_s=10)
    transcript = result["text"] if isinstance(result, dict) else result

    return vocals, accompaniment, transcript
# Gradio front end.
# Input: one uploaded audio file (passed to the callback as a file path).
# Output: two audio stems and the transcript text.
title = "Demo: Deezer Spleeter + english Automatic Speech Recognition"
description = (
    "<p>This demo is a basic interface for <a href='https://research.deezer.com/projects/spleeter.html' target='_blank'>Deezer Spleeter</a>.</p>"
    "<p>It uses the Spleeter library for separate audio file in two stems : accompaniments and vocals.</p>"
    "<p>Once splitted, it performs ASR (Automatic Speech Recognition) based on a Wav2vec2 english model.</p>"
)

# Every entry found in the examples/ directory is offered as a sample input.
examples = [[f"examples/{entry}"] for entry in os.listdir("examples/")]

audio_input = gr.Audio(source="upload", type="filepath")
stem_outputs = [
    gr.Audio(label="Vocals stem", source="upload", type="filepath"),
    gr.Audio(label="Accompaniment stem", source="upload", type="filepath"),
    gr.Textbox(label="Wave2vec2 Automatic Speech Recognition (English)"),
]

demo = gr.Interface(
    fn=extract_stems,
    inputs=audio_input,
    outputs=stem_outputs,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
)
demo.launch()