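"""Gradio demo app for ClearVoice: speech enhancement (FRCRN at 16 kHz or
MossFormer2 at 48 kHz) and two-speaker speech separation (MossFormer2 at 16 kHz)."""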
import torch
import soundfile as sf
import gradio as gr
from clearvoice import ClearVoice
def fn_clearvoice_se(input_wav, sr):
    # Pick the enhancement model that matches the requested output sampling rate.
    # With multiselect=False the Dropdown passes the choice as a plain string,
    # so compare against the full label (sr[0] would only be the first character).
    if sr == '16 kHz':
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a {model_name: waveform} dict or a bare
    # waveform, depending on the version; unwrap the dict case.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'
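# Optional local smoke test, assuming the bundled example files (listed in the
# `examples` arguments below) are present:
#   fn_clearvoice_se('examples/mandarin_speech_16kHz.wav', '16 kHz')
#   fn_clearvoice_se('examples/english_speech_48kHz.wav', '48 kHz')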
def fn_clearvoice_ss(input_wav):
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # As above, unwrap the {model_name: waveforms} dict if one is returned;
    # the original duplicated this unpacking in both branches.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    # The separation model emits one waveform per speaker (two here).
    output_wav_s1, output_wav_s2 = output_wav_list[0], output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return 'separated_s1.wav', 'separated_s2.wav'
demo = gr.Blocks()
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16 kHz", "48 kHz"], value="16 kHz", multiselect=False,
            label="Sampling Rate", info="Choose the sampling rate for your output.",
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("Gradio demo for speech enhancement with ClearVoice. The models support audio at 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
                 "We provide generalized models trained on large-scale data to handle a variety of background environments. "
                 "To try it, simply upload your audio or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16 kHz"],
        ["examples/english_speech_48kHz.wav", "48 kHz"],
    ],
    cache_examples=True,
)
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio (Speaker 1)", type="filepath"),
        gr.Audio(label="Output Audio (Speaker 2)", type="filepath"),
    ],
    title="ClearVoice: Speech Separation",
    description=("Gradio demo for speech separation with ClearVoice. The model (MossFormer2 backbone) supports two-speaker audio mixtures at a 16 kHz sampling rate. "
                 "We provide generalized models trained on large-scale data to handle independent speakers and a variety of background environments. "
                 "To try it, simply upload your audio or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
with demo:
    gr.TabbedInterface([se_demo, ss_demo], ["Speech Enhancement", "Speech Separation"])

demo.launch()