alibabasglab committed on
Commit
9e426ab
·
verified ·
1 Parent(s): 7f6bcd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -6
app.py CHANGED
@@ -51,7 +51,7 @@ def find_mp4_files(directory):
51
 
52
  return mp4_files
53
 
54
- @spaces.GPU(duration=300)
55
  def fn_clearvoice_tse(input_video):
56
  myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
57
  #output_wav_dict =
@@ -119,11 +119,10 @@ tse_demo = gr.Interface(
119
  gr.Gallery(label="Output Video List")
120
  ],
121
  title = "ClearVoice: Audio-visual speaker extraction",
122
- description = ("Gradio demo for audio-visual speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports 16 kHz sampling rate. "
123
- "We provide the generalized models trained on mid-scale of data for handling independent speakers and various of background environments. "
124
  "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
125
- article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
126
- "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
127
  examples = [
128
  ['examples/001.mp4'],
129
  ['examples/002.mp4'],
@@ -133,6 +132,6 @@ tse_demo = gr.Interface(
133
 
134
  with demo:
135
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
136
- gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"])
137
 
138
  demo.launch()
 
51
 
52
  return mp4_files
53
 
54
+
55
  def fn_clearvoice_tse(input_video):
56
  myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
57
  #output_wav_dict =
 
119
  gr.Gallery(label="Output Video List")
120
  ],
121
  title = "ClearVoice: Audio-visual speaker extraction",
122
+ description = ("Gradio demo for audio-visual speaker extraction with ClearVoice."
 
123
  "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
124
+ # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
125
+ # "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
126
  examples = [
127
  ['examples/001.mp4'],
128
  ['examples/002.mp4'],
 
132
 
133
  with demo:
134
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
135
+ gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Audio-visual Speaker Extraction"])
136
 
137
  demo.launch()