Spaces:

alibabasglab
/

ClearVoice

Running on Zero

File size: 3,667 Bytes

import tempfile

import gradio as gr
import soundfile as sf
import torch
from clearvoice import ClearVoice

def fn_clearvoice_se(input_wav):
    """Run speech enhancement on an audio file with the FRCRN_SE_16K model.

    Args:
        input_wav: Path of the audio file to enhance. ``None`` is rejected
            (presumably what the ``gr.Audio`` input passes when the user
            submits without providing audio -- confirm against Gradio docs).

    Returns:
        Path of a newly created WAV file containing the enhanced audio,
        written with a sample rate of 16000.

    Raises:
        gr.Error: If ``input_wav`` is ``None``.
    """
    if input_wav is None:
        raise gr.Error("Please provide an input audio file.")

    enhancer = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
    result = enhancer(input_path=input_wav, online_write=False)

    # The call may hand back either a dict or the waveform itself; for a
    # dict, the first entry's value is used.
    if isinstance(result, dict):
        output_wav = next(iter(result.values()))
    else:
        output_wav = result

    # Write to a unique temporary file instead of a fixed name in the working
    # directory, so overlapping requests cannot overwrite each other's output.
    # The handle is closed before writing so the path can be reopened by name.
    with tempfile.NamedTemporaryFile(
        prefix='enhanced_', suffix='.wav', delete=False
    ) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav, 16000)
    return output_path

def fn_clearvoice_ss(input_wav):
    """Run speech separation on an audio file with the MossFormer2_SS_16K model.

    Args:
        input_wav: Path of the audio file to separate. ``None`` is rejected
            (presumably what the ``gr.Audio`` input passes when the user
            submits without providing audio -- confirm against Gradio docs).

    Returns:
        A 2-tuple ``(path_s1, path_s2)`` of newly created WAV files holding
        the first and second separated outputs, each written with a sample
        rate of 16000.

    Raises:
        gr.Error: If ``input_wav`` is ``None``.
    """
    if input_wav is None:
        raise gr.Error("Please provide an input audio file.")

    separator = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    result = separator(input_path=input_wav, online_write=False)

    # The call may hand back either a dict or the list of outputs itself;
    # for a dict, the first entry's value is used.
    if isinstance(result, dict):
        separated = next(iter(result.values()))
    else:
        separated = result

    # Only the first two outputs are exported, as before.
    sources = (
        ('separated_s1_', separated[0]),
        ('separated_s2_', separated[1]),
    )

    # Write each output to its own unique temporary file instead of fixed
    # names in the working directory, so overlapping requests cannot
    # overwrite each other's results.
    output_paths = []
    for prefix, waveform in sources:
        with tempfile.NamedTemporaryFile(
            prefix=prefix, suffix='.wav', delete=False
        ) as tmp:
            path = tmp.name
        sf.write(path, waveform, 16000)
        output_paths.append(path)
    return tuple(output_paths)

# Top-level container that the tabbed UI is attached to further down.
demo = gr.Blocks()

# Speech-enhancement tab: one audio file in, one enhanced audio file out.
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[gr.Audio(label="Input Audio", type="filepath")],
    outputs=[gr.Audio(label="Output Audio", type="filepath")],
    examples=[
        ['examples/mandarin_speech_16kHz.wav'],
        ['examples/english_speech_48kHz.wav'],
    ],
    cache_examples=True,
    title="ClearVoice: Speech Enhancement",
    description=(
        "Gradio demo for Speech enhancement with ClearVoice. "
        "To use it, simply upload your audio, or click one of the examples to load them. "
        "Read more at the links below."
    ),
    # Footer with links to the FRCRN paper and repository.
    article=(
        "<p style='text-align: center'>"
        "<a href='https://arxiv.org/abs/2206.07293' target='_blank'>"
        "FRCRN: Boosting Feature Representation Using Frequency Recurrence "
        "for Monaural Speech Enhancement</a>"
        " | "
        "<a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a>"
        "</p>"
    ),
)

# Speech-separation tab: one audio file in, two separated audio files out
# (the two paths returned by fn_clearvoice_ss, in order).
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs = [
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs = [
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title = "ClearVoice: Speech Separation",
    # Fixed copy-paste error: this tab previously described itself as a
    # speech *enhancement* demo.
    description = ("Gradio demo for Speech separation with ClearVoice. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
    # Footer with links to the MossFormer and MossFormer2 papers and repositories.
    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
              "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples = [
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples = True,
)

# Mount both interfaces as tabs inside the top-level Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[se_demo, ss_demo],
        tab_names=["Speech Enhancement", "Speech Separation"],
    )

# Start the web app.
demo.launch()