alibabasglab committed
Commit b02e870 · verified · 1 Parent(s): 6656f98

Update app.py

Files changed (1): app.py (+7, -8)
app.py CHANGED
@@ -46,7 +46,7 @@ def find_mp4_files(directory):
     for root, dirs, files in os.walk(directory):
         for file in files:
             # Check if the file ends with .mp4
-            if file.endswith(".wav"):
+            if file.endswith(".mp4") and file[:3] == 'est':
                 mp4_files.append(os.path.join(root, file))
 
     return mp4_files
@@ -61,7 +61,7 @@ def fn_clearvoice_tse(input_video):
     output_list = find_mp4_files('path_to_output_videos_tse/')
     print(output_list)
 
-    return output_list[0], output_list[1]
+    return output_list
 
 demo = gr.Blocks()
 
@@ -117,13 +117,12 @@ tse_demo = gr.Interface(
         gr.Video(label="Input Video"),
     ],
     outputs = [
-        gr.Audio(label="Output Audio", type="filepath"),
-        gr.Audio(label="Output Audio", type="filepath"),
+        gr.Video(label="Output Video List", type="filepath", multiple=True)
     ],
-    title = "ClearVoice: Speech Separation",
-    description = ("Gradio demo for Speech separation with ClearVoice. The model (MossFormer2 backbone) supports 2 speakers' audio mixtures with 16 kHz sampling rate. "
-                   "We provide the generalized models trained on large scale of data for handling independent speakers and various of background environments. "
-                   "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
+    title = "ClearVoice: Audio-visual speaker extraction",
+    description = ("Gradio demo for audio-visual speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports 16 kHz sampling rate. "
+                   "We provide the generalized models trained on mid-scale of data for handling independent speakers and various of background environments. "
+                   "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
     article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
                "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
     examples = [
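
For reference, the helper touched by the first hunk now collects only the est*.mp4 files. Below is a minimal, self-contained sketch of how it reads after this commit; the import, the mp4_files initialization, and the sample call are added here for illustration and are not part of the diff.

import os

def find_mp4_files(directory):
    """Return paths of .mp4 files under directory whose names start with 'est'."""
    mp4_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Keep only .mp4 outputs whose filename starts with 'est'
            # (presumably the model's estimated-speaker videos).
            if file.endswith(".mp4") and file[:3] == 'est':
                mp4_files.append(os.path.join(root, file))
    return mp4_files

# Hypothetical usage with the output directory used in app.py:
# find_mp4_files('path_to_output_videos_tse/')  # -> list of est*.mp4 paths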
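
With the second and third hunks, fn_clearvoice_tse returns the whole output_list, so a single output component now receives a list of file paths. As a hedged alternative sketch only (an assumption, not what app.py does), such a variable-length list can also be exposed through a gr.File output with file_count="multiple"; this reuses find_mp4_files from the sketch above, and list_extracted_speakers is a hypothetical name.

import gradio as gr

def list_extracted_speakers(input_video):
    # Placeholder: app.py first runs the AV_MossFormer2_TSE_16K model on the
    # uploaded video and writes est*.mp4 results; that step is omitted here.
    return find_mp4_files('path_to_output_videos_tse/')

demo = gr.Interface(
    fn=list_extracted_speakers,
    inputs=gr.Video(label="Input Video"),
    outputs=gr.File(label="Output Video List", file_count="multiple"),
)

# demo.launch()  # returning a list of paths populates the downloadable file list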