File size: 4,607 Bytes
02c7bdf
bdaf47a
02c7bdf
7b02833
02c7bdf
 
e537531
e63a812
fef598a
e63a812
 
 
 
 
a9e592e
132a2a9
 
 
 
 
e63a812
02c7bdf
3192961
e537531
b78b7d0
 
 
 
 
4554491
 
 
b78b7d0
4554491
 
 
 
 
 
b78b7d0
3192961
 
3956066
b78b7d0
 
e805751
e63a812
fef598a
e63a812
b78b7d0
 
e805751
b78b7d0
 
e63a812
 
 
6963e61
 
b78b7d0
ed890e5
 
b78b7d0
 
 
 
 
8bb6908
3956066
e805751
3956066
 
e805751
 
3956066
b78b7d0
e63a812
 
 
6963e61
 
3956066
6320c59
 
d04ae35
3956066
 
b78b7d0
99710ec
6963e61
 
49effbd
0f8dddd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice

@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Run ClearVoice speech enhancement on a file and return the output path.

    Args:
        input_wav: Path of the audio file to process; forwarded to ClearVoice
            as ``input_path``.
        sr: Requested sampling rate. ``"16000"`` selects the ``FRCRN_SE_16K``
            model and a 16 kHz output. The int ``16000`` and a one-element
            list/tuple wrapping either form are accepted as equivalents.
            Any other value selects ``MossFormer2_SE_48K`` and a 48 kHz
            output, as before.

    Returns:
        The string ``'enhanced.wav'``: the file (relative to the current
        working directory) the enhanced audio was written to.
    """
    # A UI widget may hand over its selection wrapped in a list, and
    # programmatic callers may pass an int. Normalise to a plain string so
    # a 16 kHz request is not silently routed to the 48 kHz model.
    if isinstance(sr, (list, tuple)) and len(sr) == 1:
        sr = sr[0]

    if str(sr).strip() == "16000":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000

    myClearVoice = ClearVoice(task='speech_enhancement', model_names=[model_name])
    result = myClearVoice(input_path=input_wav, online_write=False)

    # ClearVoice may return either the waveform itself or a dict wrapping it
    # (presumably keyed per model or file -- confirm against ClearVoice docs);
    # in the dict case the first entry is used.
    if isinstance(result, dict):
        output_wav = result[next(iter(result))]
    else:
        output_wav = result

    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'

@spaces.GPU
def fn_clearvoice_ss(input_wav):
    """Run ClearVoice speech separation on a file and return two output paths.

    Args:
        input_wav: Path of the mixture audio file; forwarded to ClearVoice as
            ``input_path``.

    Returns:
        The tuple ``("separated_s1.wav", "separated_s2.wav")``: the files the
        first and second separated tracks were written to, both at 16000 Hz.
    """
    separator = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    result = separator(input_path=input_wav, online_write=False)

    # The result is either the sequence of separated tracks itself or a dict
    # wrapping that sequence; in the dict case the first entry is used.
    tracks = result[next(iter(result))] if isinstance(result, dict) else result

    # Pull out both tracks before writing anything, so a result with fewer
    # than two tracks fails before any file is touched.
    first_speaker = tracks[0]
    second_speaker = tracks[1]

    sf.write('separated_s1.wav', first_speaker, 16000)
    sf.write('separated_s2.wav', second_speaker, 16000)
    return "separated_s1.wav", "separated_s2.wav"

# Top-level container; the tabbed interfaces are attached to it further down.
demo = gr.Blocks()

# Speech-enhancement tab: an audio file plus a sampling-rate choice go in,
# one enhanced audio file comes out.
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs = [
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            # Single-select dropdown: the default must be one of the choice
            # strings, not a list, so the handler receives "16000" as-is.
            ["16000", "48000"], value="16000", multiselect=False, label="Sampling Rate", info="Choose the sampling rate for your output."
        ),
    ],
    outputs = [
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title = "ClearVoice: Speech Enhancement",
    description = ("Gradio demo for Speech enhancement with ClearVoice. The models support audios with 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
                   "We provide the generalized models trained on large scale of data for handling various of background environments. "
                   "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"
              ),
    # Each example row is [audio path, sampling-rate choice], matching `inputs`.
    examples = [
        ["examples/mandarin_speech_16kHz.wav", "16000"],
        ["examples/english_speech_48kHz.wav", "48000"],
    ],
    cache_examples = True,
)

# Text shown above and below the speech-separation tab.
_SS_DESCRIPTION = (
    "Gradio demo for Speech separation with ClearVoice. The model (MossFormer2 backbone) supports 2 speakers' audio mixtures with 16 kHz sampling rate. "
    "We provide the generalized models trained on large scale of data for handling independent speakers and various of background environments. "
    "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."
)
_SS_ARTICLE = (
    "<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
    "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"
)

# Speech-separation tab: one audio file in, two audio files out (one per
# path returned by fn_clearvoice_ss).
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[gr.Audio(label="Input Audio", type="filepath")],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Separation",
    description=_SS_DESCRIPTION,
    article=_SS_ARTICLE,
    # Each example row holds the single input: a path to an audio file.
    examples=[
        ["examples/female_female_speech.wav"],
        ["examples/female_male_speech.wav"],
    ],
    cache_examples=True,
)

# Mount both interfaces as tabs inside the shared Blocks container, then
# start the app.
with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo],
        ["Speech Enhancement", "Speech Separation"],
    )

demo.launch()