VoiceClone / app.py
fantos's picture
Update app.py
0473607 verified
import os
import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces
def get_available_speakers():
    """Return the keys of ``_DEFAULT_SPEAKERS`` as a new list.

    The result is used to populate the speaker dropdown in the UI.
    """
    return [*_DEFAULT_SPEAKERS.keys()]
@spaces.GPU
def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
    """Synthesize speech for ``text`` and return ``(audio_path, error_message)``.

    Exactly one element of the returned pair is meaningful: on success the
    result is ``(path_to_wav, None)``; on failure it is ``(None, message)``
    so the UI can show the message in its error box.

    Args:
        text: Text to synthesize. Empty or whitespace-only input is rejected
            before any model work is done.
        temperature: Sampling temperature forwarded to the generation config.
        repetition_penalty: Repetition penalty forwarded to the generation
            config.
        speaker_selection: Key of a default speaker, passed to
            ``interface.load_default_speaker``. Ignored when
            ``reference_audio`` is given; falsy or the string ``"None"``
            means "no speaker".
        reference_audio: Optional path to an audio file passed to
            ``interface.create_speaker``. Takes priority over
            ``speaker_selection``.

    Returns:
        A ``(str | None, str | None)`` tuple as described above.
    """
    # Function-scope import: the file header does not import tempfile.
    import tempfile

    # Reject empty input up front instead of loading the model for nothing.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # NOTE(review): the model and interface are rebuilt on every call;
    # kept as in the original because the GPU is only requested inside
    # this decorated function.
    model_config = outetts.HFModelConfig_v2(
        model_path="OuteAI/OuteTTS-0.3-1B",
        tokenizer_path="OuteAI/OuteTTS-0.3-1B",
        dtype=torch.bfloat16,
        device="cuda",
    )
    interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

    try:
        # A custom reference clip overrides the dropdown selection.
        if reference_audio:
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)
        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Use a unique file per request: a fixed "output.wav" is shared by
        # all callers, so overlapping requests could overwrite each other's
        # audio before it is served. delete=False keeps the file on disk
        # after the handle is closed so it can be written and then served.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # UI boundary: surface the failure as text rather than raising.
        return None, str(e)
# Build the Gradio UI and wire the "Generate Speech" button to generate_tts.
# NOTE(review): the original indentation was lost, so the nesting below
# (in particular whether the gr.Group() lives inside the second column or
# below the row) is a reconstruction -- confirm against the deployed layout.
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange") as demo:
    gr.Markdown("# Voice Clone Multilingual TTS")
    # Hidden by default; made visible by the .then() step below when it
    # holds a non-empty error message.
    error_box = gr.Textbox(label="Error Messages", visible=False)
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                lines=8
            )
            submit_button = gr.Button("Generate Speech")
        with gr.Column(scale=1):
            # generate_tts returns a path to a WAV file, hence type="filepath".
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )
    with gr.Group():
        # Choices come from get_available_speakers(); ignored by
        # generate_tts when a reference audio clip is supplied.
        speaker_dropdown = gr.Dropdown(
            choices=get_available_speakers(),
            value="en_male_1",
            label="Speaker Selection"
        )
        # Range 0.1-1.0, default 0.1.
        temperature = gr.Slider(
            0.1, 1.0,
            value=0.1,
            label="Temperature (lower = more stable tone, higher = more expressive)"
        )
        # Range 0.5-2.0, default 1.1.
        repetition_penalty = gr.Slider(
            0.5, 2.0,
            value=1.1,
            label="Repetition Penalty"
        )
        # Passed to generate_tts as a file path; when present it takes
        # priority over the dropdown selection.
        reference_audio = gr.Audio(
            label="Reference Audio (for voice cloning)",
            type="filepath"
        )
        gr.Markdown("""
### Voice Cloning Guidelines:
- Use around 7-10 seconds of clear, noise-free audio
- For transcription interface will use Whisper turbo to transcribe the audio file
- Longer audio clips will reduce maximum output length
- Custom speaker overrides speaker selection
""")
    # Step 1: run synthesis; its two return values fill the audio player and
    # the error box. Step 2: show the error box only if it received a
    # non-empty message.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box]
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box]
    )
# Start the web server for the app.
demo.launch()