import os
import random

import gradio as gr
import soundfile as sf
import spaces
import torch
from clearvoice import ClearVoice
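
# Inference wrappers: one function per ClearVoice task. Each is decorated with
# @spaces.GPU so that, on a ZeroGPU Space, a GPU is attached only while the
# function runs.
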
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    # Choose the enhancement model that matches the requested output rate.
    if sr == "16000 Hz":
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by input file name or a bare array.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav[0, :], fs)
    return 'enhanced.wav'


@spaces.GPU
def fn_clearvoice_ss(input_wav):
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by input file name or a bare list;
    # in both cases the payload holds one waveform per separated speaker.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    output_wav_s1 = output_wav_list[0]
    output_wav_s2 = output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1[0, :], 16000)
    sf.write('separated_s2.wav', output_wav_s2[0, :], 16000)
    return "separated_s1.wav", "separated_s2.wav"


def find_mp4_files(directory):
    """Collect the output videos (names starting with 'est') written by the TSE model."""
    mp4_files = []
    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Keep only .mp4 files whose names start with 'est'
            if file.endswith(".mp4") and file.startswith("est"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files


@spaces.GPU
def fn_clearvoice_tse(input_video):
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # The AV model writes one output video per detected speaker under output_path.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
    return output_list


@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    wavname = os.path.basename(input_wav)
    myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
    fs = 48000
    if apply_se:
        # Optionally denoise first: write the enhanced audio to a randomized
        # file name to avoid clobbering a previous request's output, then feed
        # that file to the super-resolution model.
        new_wavname = wavname.replace('.wav', str(random.randint(0, 1000)) + '.wav')
        myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        myClearVoice_se(input_path=input_wav, online_write=True, output_path=new_wavname)
        input_wav = new_wavname
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced_high_res.wav', output_wav[0, :], fs)
    return 'enhanced_high_res.wav'
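

# Gradio UI: one gr.Interface per task; the four demos are combined into a
# TabbedInterface inside the Blocks app at the bottom of the file.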
demo = gr.Blocks()

se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000 Hz", "48000 Hz"], value="16000 Hz", multiselect=False, info="Choose a sampling rate for your output."
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio' target='_blank'>ClearVoice</a>: Speech Enhancement",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio)) uses AI to extract clear speech from noisy recordings, enhancing speech quality. It supports both 16 kHz and 48 kHz output. "
                 "To try it, simply upload your audio or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement (ICASSP 2022)</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation (ICASSP 2024)</a> </p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
        ["examples/english_speech_48kHz.wav", "48000 Hz"],
    ],
    cache_examples=True,
)

ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio (Speaker 1)", type="filepath"),
        gr.Audio(label="Output Audio (Speaker 2)", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio' target='_blank'>ClearVoice</a>: Speech Separation",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio)) uses AI to separate individual speakers from mixed audio. It operates at 16 kHz and produces two output streams. "
                 "To try it, simply upload your audio or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions (ICASSP 2023)</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation (ICASSP 2024)</a> </p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio' target='_blank'>ClearVoice</a>: Audio-Visual Speaker Extraction",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio)) uses AI to extract each speaker's voice from a multi-speaker video using facial recognition. "
                 "To try it, simply upload your video or click one of the examples. "),
    # article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions (ICASSP 2023)</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
    #          "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation (ICASSP 2024)</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/001.mp4'],
        ['examples/002.mp4'],
    ],
    cache_examples=True,
)

sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Checkbox(label="Apply Speech Enhancement", value=True),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio)) uses AI to transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most audio types. "
                 "To try it, simply upload your audio or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement (ICASSP 2022)</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation (ICASSP 2024)</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2501.10045' target='_blank'>HiFi-SR: A Unified Generative Transformer-Convolutional Adversarial Network for High-Fidelity Speech Super-Resolution (ICASSP 2025)</a> </p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, sr_demo, tse_demo],
        ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Speech Super Resolution", "Task 4: Audio-Visual Speaker Extraction"],
    )

demo.launch()
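
# For quick local testing, the handlers can also be called directly without the
# UI; a minimal sketch, assuming the bundled example files are present and the
# ClearVoice model weights can be downloaded:
#
#   enhanced_path = fn_clearvoice_se("examples/mandarin_speech_16kHz.wav", "16000 Hz")
#   s1_path, s2_path = fn_clearvoice_ss("examples/female_male_speech.wav")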