alibabasglab committed on
Commit
f8605aa
·
verified ·
1 Parent(s): 6ca9d30

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -1
app.py CHANGED
@@ -38,6 +38,29 @@ def fn_clearvoice_ss(input_wav):
38
  sf.write('separated_s2.wav', output_wav_s2, 16000)
39
  return "separated_s1.wav", "separated_s2.wav"
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  demo = gr.Blocks()
42
 
43
  se_demo = gr.Interface(
@@ -86,8 +109,30 @@ ss_demo = gr.Interface(
86
  cache_examples = True,
87
  )
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  with demo:
90
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
91
- gr.TabbedInterface([se_demo, ss_demo], ["Speech Enhancement", "Speech Separation"])
92
 
93
  demo.launch()
 
38
  sf.write('separated_s2.wav', output_wav_s2, 16000)
39
  return "separated_s1.wav", "separated_s2.wav"
40
 
41
def find_mp4_files(directory, extension=".wav"):
    """Recursively collect files under *directory* that end with *extension*.

    Parameters
    ----------
    directory : str
        Root directory to search (searched recursively via ``os.walk``).
    extension : str, optional
        File suffix to match. Defaults to ``".wav"``.

    Returns
    -------
    list[str]
        Full paths of every matching file found.

    NOTE(review): despite the function's name, the original code matched
    ``.wav`` files, not ``.mp4`` (the inline comment said ".mp4" while the
    check used ".wav"). The default preserves the original behavior; the
    ``extension`` parameter makes the mismatch explicit and lets callers
    request ``.mp4`` when that is what they actually want.
    """
    matches = []
    # Walk through the directory and all of its subdirectories.
    for root, _dirs, files in os.walk(directory):
        for fname in files:
            if fname.endswith(extension):
                matches.append(os.path.join(root, fname))
    return matches
52
+
53
@spaces.GPU
def fn_clearvoice_tse(input_wav):
    """Run target-speaker extraction (AV_MossFormer2_TSE_16K) on *input_wav*.

    With ``online_write=True`` the model writes its results directly into
    ``path_to_output_videos_tse``; the written files are then collected
    from disk with ``find_mp4_files``.

    Returns a pair of output file paths for the two Gradio Audio outputs.
    """
    myClearVoice = ClearVoice(task='target_speaker_extraction',
                              model_names=['AV_MossFormer2_TSE_16K'])
    # online_write=True: outputs are streamed to output_path rather than
    # returned from the call, so the call's return value is unused.
    myClearVoice(input_path=input_wav, online_write=True,
                 output_path='path_to_output_videos_tse')

    output_list = find_mp4_files('path_to_output_videos_tse')
    print(output_list)
    # Fix: removed `assert 2==4`, a debug leftover that unconditionally
    # raised AssertionError and made this handler always fail before
    # returning anything to the UI.
    # NOTE(review): still returns placeholder filenames; returning entries
    # of output_list directly should be confirmed against the Gradio
    # output components wired to this handler.
    return "separated_s1.wav", "separated_s2.wav"
63
+
64
  demo = gr.Blocks()
65
 
66
  se_demo = gr.Interface(
 
109
  cache_examples = True,
110
  )
111
 
112
# Gradio tab for the target-speaker-extraction (TSE) demo.
tse_demo = gr.Interface(
    # Fix: this was wired to fn_clearvoice_ss (speech separation); the TSE
    # tab must call the newly added fn_clearvoice_tse handler.
    fn=fn_clearvoice_tse,
    inputs = [
        # The TSE model is audio-visual, so the input is a video file.
        gr.Video(label="Input Video", type="filepath"),
    ],
    outputs = [
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # Fix: title/description said "Speech Separation" although this tab is
    # registered under "Target Speaker Extraction".
    title = "ClearVoice: Target Speaker Extraction",
    description = ("Gradio demo for target speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) extracts the target speaker's speech from an audio-visual recording with 16 kHz sampling rate. "
                   "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
               "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    # NOTE(review): these examples are .wav files but the input component is
    # gr.Video — confirm they load (cache_examples=True runs them at startup),
    # or replace them with video examples.
    examples = [
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples = True,
)
133
+
134
  with demo:
135
  #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
136
+ gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"])
137
 
138
  demo.launch()