Spaces:

CAMB-AI
/

mars6-turbo-demo

Running

App Files Files Community

mars6-turbo-demo / app.py

baas-cambai

update default

6426430 17 days ago

raw

history blame

4.31 kB

	import gradio as gr
	import os
	import httpx
	import numpy as np
	import base64
	import torch
	import torchaudio
	import io

	# Endpoint and credential for the hosted synthesis service. `inference` posts
	# to URL and sends API_KEY in the `Authorization: Api-Key ...` header.
	# Both are mandatory: indexing os.environ raises KeyError at import time when
	# either variable is unset.
	URL = os.environ['TEMP_HOSTING_URL']
	API_KEY = os.environ['TEMP_CALLING_KEY']

	# Maps the cloning-method label chosen in the UI to the `deep_clone_mode`
	# value sent in the request's inference settings.
	_DEEP_CLONE_MODES = {
	    'deep-clone': 'fixed-ref',
	    'shallow-clone': 'none',
	    'follow-on deep-clone': 'per-chunk',
	}


	def _encode_reference_audio(sample_rate, samples):
	    """Return `samples` as a base64 string holding a mono 24 kHz WAV file."""
	    wav = torch.from_numpy(samples).float()
	    if np.issubdtype(samples.dtype, np.integer):
	        # Scale integer PCM to [-1, 1) by its own full-scale value; for int16
	        # this is the same 32768.0 divisor that was previously hard-coded.
	        wav = wav / (float(np.iinfo(samples.dtype).max) + 1.0)
	    # NOTE(review): floating-point input is assumed to already be in [-1, 1]
	    # and is therefore left unscaled -- confirm against the Audio component.

	    if wav.dim() == 1:
	        wav = wav[None]
	    else:
	        # Average over the last axis to downmix to a single channel.
	        wav = wav.mean(dim=-1)[None]

	    wav = torchaudio.functional.resample(wav, sample_rate, 24000)

	    buffer = io.BytesIO()
	    torchaudio.save(buffer, wav, sample_rate=24000, format='wav')
	    return base64.b64encode(buffer.getvalue()).decode("utf-8")


	def inference(reference_audio, text, reference_text, ras_K, ras_t_r, top_p, quality_prefix, clone_method):
	    """Synthesize `text` with the remote service, conditioned on a reference clip.

	    Args:
	        reference_audio: `(sample_rate, numpy_array)` tuple, or None when no
	            clip was supplied.
	        text: Text to synthesize.
	        reference_text: Transcript of the reference clip; an empty or None
	            value is sent as None.
	        ras_K, ras_t_r, top_p: Sampling settings forwarded unchanged.
	        quality_prefix: Forwarded as the `prefix` inference setting.
	        clone_method: One of the keys of `_DEEP_CLONE_MODES`.

	    Returns:
	        `(sample_rate, samples)` where `samples` is the decoded audio
	        transposed from torchaudio's layout.

	    Raises:
	        gr.Error: If no reference audio was given, the cloning method is
	            unknown, the request could not be sent, or the service answered
	            with a non-200 status.
	    """
	    if reference_audio is None:
	        raise gr.Error("Please provide a reference audio clip.")

	    try:
	        dlc = _DEEP_CLONE_MODES[clone_method]
	    except KeyError:
	        # Previously an unexpected value left `dlc` unbound and crashed later.
	        raise gr.Error(f"Unknown cloning method: {clone_method!r}") from None

	    _sr, _wav = reference_audio
	    encoded_str = _encode_reference_audio(_sr, _wav)

	    data = {
	        "text": text,
	        "reference_audio": encoded_str,  # reference audio, b64 encoded. Should be <=15s.
	        "reference_text": reference_text or None,
	        "language": 'en-us',
	        "inference_settings": {
	            'top_p': top_p,
	            "prefix": quality_prefix,
	            'ras_K': ras_K,
	            'ras_t_r': ras_t_r,
	            'deep_clone_mode': dlc,
	        },
	    }
	    # Only the settings are logged; the payload also carries the encoded audio.
	    print(f"Calling with payload {data['inference_settings']}")

	    # Send the POST request
	    headers = {"Authorization": f"Api-Key {API_KEY}"}
	    try:
	        response = httpx.post(URL, headers=headers, json=data, timeout=300)
	    except httpx.HTTPError as err:
	        raise gr.Error(f"Could not reach the synthesis service: {err}") from err

	    # Stop on a failed request instead of trying to parse an error body as audio.
	    if response.status_code != 200:
	        print("Request failed with status code", response.status_code)
	        raise gr.Error(f"Synthesis request failed with status code {response.status_code}.")
	    print("Request successful!")

	    full_audio_bytes = base64.b64decode(response.json()['output'])
	    wav, sr = torchaudio.load(io.BytesIO(full_audio_bytes))
	    return (sr, wav.numpy().T)

	# Gradio UI: reference clip and text inputs, a collapsed panel of sampling
	# and cloning options, the audio output, and the button wiring that sends
	# every input to `inference`.
	with gr.Blocks() as demo:
	    # Required inputs.
	    with gr.Row():
	        gr.Markdown("## Reference Audio")
	    with gr.Row():
	        reference_audio = gr.Audio(max_length=16, label="Drop Audio Here")
	    with gr.Row():
	        gr.Markdown("## Text to Generate")
	    with gr.Row():
	        text_input = gr.Textbox(label="Text to Generate")
	    with gr.Row():
	        synthesize_button = gr.Button("Synthesize", variant="primary")

	    # Optional tuning knobs, collapsed by default.
	    with gr.Accordion("Advanced Settings", open=False):
	        with gr.Row():
	            reference_text = gr.Textbox(
	                label="Reference Text",
	                info="Leave blank to automatically transcribe the reference audio. Inference will be slightly faster if you specify the correct reference transcript below.",
	            )
	        with gr.Row():
	            ras_K = gr.Slider(
	                label="RAS_K", info="RAS sampling K value",
	                minimum=1, maximum=20, step=1, value=10,
	            )
	        with gr.Row():
	            ras_t_r = gr.Slider(
	                label="RAS_t_r", info="RAS sampling t_r value",
	                minimum=0.001, maximum=1, step=0.001, value=0.09,
	            )
	        with gr.Row():
	            top_p = gr.Slider(
	                label="top_p", info="top-p sampling value",
	                minimum=0.001, maximum=1, step=0.001, value=0.2,
	            )
	        with gr.Row():
	            quality_prefix = gr.Textbox(
	                value='48000',
	                label="quality_prefix",
	                info="quality prefix string to append to generation",
	                lines=1,
	            )
	        # NOTE(review): the explanation and the radio are assumed to share this
	        # row; the original nesting could not be recovered -- confirm.
	        with gr.Row():
	            gr.Markdown(
	                "Cloning method to use. Deep clone and shallow clone use the method described in the paper, "
	                "while `follow-on deep clone` uses deep cloning, but always using the previous generated segment as the deep clone conditioning. "
	                "This only makes a difference for long text inputs where the text is internally chunked up and generated in chunks."
	            )
	            clone_method = gr.Radio(
	                choices=['deep-clone', 'shallow-clone', 'follow-on deep-clone'],
	                value='deep-clone',
	                label="cloning method",
	                info="cloning method to use",
	            )

	    # Result.
	    with gr.Row():
	        gr.Markdown("## Synthesized Audio")
	    with gr.Row():
	        audio_output = gr.Audio(label="Synthesized Audio")

	    # The input order below must match the parameter order of `inference`.
	    synthesize_button.click(
	        fn=inference,
	        inputs=[
	            reference_audio,
	            text_input,
	            reference_text,
	            ras_K,
	            ras_t_r,
	            top_p,
	            quality_prefix,
	            clone_method,
	        ],
	        outputs=[audio_output],
	    )

	# Start the Gradio server only when this file is executed as a script;
	# no public share link is requested.
	if __name__ == "__main__":
	    demo.launch(share=False)