alibabasglab committed on
Commit
9e426ab
·
verified ·
1 Parent(s): 7f6bcd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -6
app.py CHANGED
@@ -51,7 +51,7 @@ def find_mp4_files(directory):
51
 
52
  return mp4_files
53
 
54
- @spaces.GPU(duration=300)
55
  def fn_clearvoice_tse(input_video):
56
  myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
57
  #output_wav_dict =
@@ -119,11 +119,10 @@ tse_demo = gr.Interface(
119
  gr.Gallery(label="Output Video List")
120
  ],
121
  title = "ClearVoice: Audio-visual speaker extraction",
122
- description = ("Gradio demo for audio-visual speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports 16 kHz sampling rate. "
123
- "We provide the generalized models trained on mid-scale of data for handling independent speakers and various of background environments. "
124
  "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
125
- article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
126
- "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
127
  examples = [
128
  ['examples/001.mp4'],
129
  ['examples/002.mp4'],
@@ -133,6 +132,6 @@ tse_demo = gr.Interface(
133
 
134
  with demo:
135
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
136
- gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"])
137
 
138
  demo.launch()
 
51
 
52
  return mp4_files
53
 
54
+
55
  def fn_clearvoice_tse(input_video):
56
  myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
57
  #output_wav_dict =
 
119
  gr.Gallery(label="Output Video List")
120
  ],
121
  title = "ClearVoice: Audio-visual speaker extraction",
122
+ description = ("Gradio demo for audio-visual speaker extraction with ClearVoice."
 
123
  "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
124
+ # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
125
+ # "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
126
  examples = [
127
  ['examples/001.mp4'],
128
  ['examples/002.mp4'],
 
132
 
133
  with demo:
134
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
135
+ gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Audio-visual Speaker Extraction"])
136
 
137
  demo.launch()