Spaces:

alibabasglab
/

ClearVoice

Running on Zero

App Files Community

alibabasglab committed on Oct 21, 2024

Commit

f8605aa

verified ·

1 Parent(s): 6ca9d30

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -1

app.py CHANGED Viewed

@@ -38,6 +38,29 @@ def fn_clearvoice_ss(input_wav):
     sf.write('separated_s2.wav', output_wav_s2, 16000)
     return "separated_s1.wav", "separated_s2.wav"
 demo = gr.Blocks()
 se_demo = gr.Interface(
@@ -86,8 +109,30 @@ ss_demo = gr.Interface(
     cache_examples = True,
 )
 with demo:
     #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
-    gr.TabbedInterface([se_demo, ss_demo], ["Speech Enhancement", "Speech Separation"])
 demo.launch()

     sf.write('separated_s2.wav', output_wav_s2, 16000)
     return "separated_s1.wav", "separated_s2.wav"
def find_mp4_files(directory, extensions=(".wav",)):
    """Recursively collect files under *directory* with a matching suffix.

    NOTE: despite the function's name, the default suffix is ``.wav`` --
    that is what this function has always matched, so the default is kept
    for backward compatibility. Pass ``extensions=(".mp4",)`` to collect
    video files instead.

    Args:
        directory: Root folder to search; every subdirectory is visited.
        extensions: A suffix string, or an iterable of suffix strings, that
            a file name must end with (case-sensitive).

    Returns:
        A list of ``os.path.join(root, name)`` paths in ``os.walk`` order.
        The list is empty when nothing matches or when *directory* does not
        exist (``os.walk`` ignores listing errors by default).
    """
    # str.endswith accepts a single string or a tuple of strings, not a list.
    if not isinstance(extensions, str):
        extensions = tuple(extensions)

    matches = []
    for root, _dirs, names in os.walk(directory):
        matches.extend(
            os.path.join(root, name)
            for name in names
            if name.endswith(extensions)
        )
    return matches
@spaces.GPU
def fn_clearvoice_tse(input_wav):
    """Run target-speaker extraction on an uploaded file and return two results.

    A ``ClearVoice`` instance is built for the ``target_speaker_extraction``
    task with the ``AV_MossFormer2_TSE_16K`` model and asked to write its
    results under ``path_to_output_videos_tse``. That directory is then
    scanned with ``find_mp4_files`` and the first two hits are returned.

    Args:
        input_wav: Path of the uploaded media file, passed to ClearVoice as
            ``input_path``.

    Returns:
        A ``(first, second)`` tuple of output file paths. ``second`` is
        ``None`` when only one output file was found.

    Raises:
        RuntimeError: If no output file is found after processing.
    """
    output_dir = 'path_to_output_videos_tse'

    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    # online_write=True makes ClearVoice write its results to disk itself,
    # so the call's return value is not used.
    myClearVoice(input_path=input_wav, online_write=True, output_path=output_dir)

    # Sorted so the choice of returned files does not depend on directory
    # listing order.
    # NOTE(review): the whole output directory is scanned, so files left by
    # earlier requests may be picked up as well -- confirm the layout
    # ClearVoice uses and narrow the search to this request's subfolder.
    output_list = sorted(find_mp4_files(output_dir))
    print(output_list)

    # The previous `assert 2==4` debugging stop aborted every call; fail only
    # when there is genuinely nothing to return.
    if not output_list:
        raise RuntimeError(
            "Target speaker extraction produced no output files in "
            + repr(output_dir)
        )

    first = output_list[0]
    second = output_list[1] if len(output_list) > 1 else None
    return first, second
 demo = gr.Blocks()
 se_demo = gr.Interface(
     cache_examples = True,
 )
# Interface for the "Target Speaker Extraction" tab: takes one uploaded video
# and shows up to two extracted outputs.
tse_demo = gr.Interface(
    # Wired to the target-speaker-extraction handler (it previously pointed at
    # the speech-separation handler, fn_clearvoice_ss).
    fn=fn_clearvoice_tse,
    inputs = [
        # gr.Video always hands the handler a file path; unlike gr.Audio it
        # does not define a `type` argument, so none is passed.
        gr.Video(label="Input Video"),
    ],
    outputs = [
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title = "ClearVoice: Target Speaker Extraction",
    # NOTE(review): description, article and examples below still carry the
    # speech-separation wording and audio example files -- replace them with
    # target-speaker-extraction text and video examples.
    description = ("Gradio demo for Speech separation with ClearVoice. The model (MossFormer2 backbone) supports 2 speakers' audio mixtures with 16 kHz sampling rate. "
                   "We provide the generalized models trained on large scale of data for handling independent speakers and various of background environments. "
                   "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
              "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples = [
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples = True,
)
 with demo:
     #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
+    gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"])
 demo.launch()