VoiceClone / app.py
fantos's picture
Update app.py
bb90c13 verified
raw
history blame
6.34 kB
import os
import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces
def get_available_speakers():
    """Return the names of the default speakers as a list.

    The names are the keys of ``_DEFAULT_SPEAKERS`` (imported from
    ``outetts.version.v2.interface``), in the order that mapping yields them.
    """
    return list(_DEFAULT_SPEAKERS.keys())
@spaces.GPU
def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
    """Synthesize ``text`` with OuteTTS 0.3 (1B) and return the audio file path.

    Speaker resolution order:
      1. ``reference_audio`` given -> a custom speaker is created from it.
      2. ``speaker_selection`` is a non-empty name other than ``"None"`` ->
         that default speaker is loaded.
      3. Otherwise no speaker is passed to the generation config.

    Args:
        text: Text to synthesize. Blank/whitespace-only text is rejected.
        temperature: Forwarded to ``outetts.GenerationConfig``.
        repetition_penalty: Forwarded to ``outetts.GenerationConfig``.
        speaker_selection: Name of a default speaker, ``"None"``, or empty.
        reference_audio: Path to a reference clip for voice cloning, or a
            falsy value when no clip was supplied.

    Returns:
        A ``(audio_path, error_message)`` tuple. On success ``audio_path`` is
        the path of a freshly written WAV file and ``error_message`` is
        ``None``; on failure ``audio_path`` is ``None`` and ``error_message``
        is a non-empty string.

    Note:
        The model interface is constructed on every call, and errors raised
        while constructing it are not caught here (they propagate to the
        caller), exactly as before.
    """
    # Function-scope import: tempfile is stdlib and only needed here.
    import tempfile

    # Reject blank input before paying for model construction.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    model_config = outetts.HFModelConfig_v2(
        model_path="OuteAI/OuteTTS-0.3-1B",
        tokenizer_path="OuteAI/OuteTTS-0.3-1B",
        dtype=torch.bfloat16,
        device="cuda",
    )
    interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

    try:
        if reference_audio:
            # A reference clip takes priority over the dropdown selection.
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker profile is supplied to the generation config.
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)

        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Write to a unique file per request. A fixed "output.wav" is shared
        # by all concurrent requests, so one user's audio could be overwritten
        # by another's before it is served. The file is intentionally not
        # deleted here because the caller still needs to read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # Some exceptions stringify to "", which the UI would treat as
        # "no error"; fall back to repr() so a failure is always reported.
        return None, str(e) or repr(e)
# Custom CSS for 3D styling.
# Stylesheet passed to gr.Blocks(css=...). Each class defined here is attached
# to a component through elem_classes (or, for .title, through the class
# attribute of the raw HTML header) in the layout that follows. The raised /
# inset "3D" look comes from paired light and dark box-shadows.
custom_css = """
.container {
background: linear-gradient(145deg, #f3f4f6, #ffffff);
border-radius: 20px;
box-shadow: 10px 10px 20px #d1d1d1, -10px -10px 20px #ffffff;
padding: 2rem;
margin: 1rem;
transition: all 0.3s ease;
}
.title {
font-size: 2.5rem;
font-weight: bold;
color: #1a1a1a;
text-align: center;
margin-bottom: 2rem;
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.1);
}
.input-group {
background: #ffffff;
border-radius: 15px;
padding: 1.5rem;
margin: 1rem 0;
box-shadow: inset 5px 5px 10px #e0e0e0, inset -5px -5px 10px #ffffff;
}
.button-3d {
background: linear-gradient(145deg, #3b82f6, #2563eb);
color: white;
border: none;
padding: 0.8rem 1.5rem;
border-radius: 10px;
font-weight: bold;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
}
.button-3d:hover {
transform: translateY(-2px);
box-shadow: 7px 7px 15px #d1d1d1, -7px -7px 15px #ffffff;
}
.slider-3d {
height: 12px;
border-radius: 6px;
background: linear-gradient(145deg, #e6e7eb, #ffffff);
box-shadow: inset 3px 3px 6px #d1d1d1, inset -3px -3px 6px #ffffff;
}
.error-box {
background: #fee2e2;
border-left: 4px solid #ef4444;
padding: 1rem;
border-radius: 8px;
margin: 1rem 0;
}
.right-column {
display: flex;
flex-direction: column;
gap: 1rem;
}
.options-panel {
margin-top: 2rem;
background: linear-gradient(145deg, #f3f4f6, #ffffff);
border-radius: 15px;
padding: 1.5rem;
box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
}
"""
# Create the Gradio interface with 3D styling.
#
# Layout: a title, a hidden error box, then one row with two columns:
#   left  - text input and the "Generate Speech" button
#   right - generated audio on top, generation options below

# Help text shown in the options panel. Kept flush-left at module level so the
# markdown source is not affected by the indentation of the layout code.
_VOICE_CLONING_GUIDELINES = """
### Voice Cloning Guidelines:
- Use around 7-10 seconds of clear, noise-free audio
- For transcription interface will use Whisper turbo to transcribe the audio file
- Longer audio clips will reduce maximum output length
- Custom speaker overrides speaker selection
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown('<div class="title">Voice Clone Multilingual TTS</div>')

    # Hidden until generate_tts reports an error (see the .then() step below).
    error_box = gr.Textbox(label="Error Messages", visible=False, elem_classes="error-box")

    with gr.Row(elem_classes="container"):
        # Left column for text input
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                elem_classes="input-group",
                lines=5,
            )
            submit_button = gr.Button(
                "Generate Speech",
                elem_classes="button-3d",
            )

        # Right column for output and options
        with gr.Column(scale=1, elem_classes="right-column"):
            # Audio output at the top
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                elem_classes="input-group",
            )

            # Options panel below the output.
            # gr.Group replaces gr.Box: gr.Box was removed in Gradio 4, while
            # gr.Group is available in both Gradio 3 and 4.
            with gr.Group(elem_classes="options-panel"):
                speaker_dropdown = gr.Dropdown(
                    choices=get_available_speakers(),
                    value="en_male_1",
                    label="Speaker Selection",
                    elem_classes="input-group",
                )
                temperature = gr.Slider(
                    0.1, 1.0,
                    value=0.1,
                    label="Temperature (lower = more stable tone, higher = more expressive)",
                    elem_classes="slider-3d",
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0,
                    value=1.1,
                    label="Repetition Penalty",
                    elem_classes="slider-3d",
                )
                reference_audio = gr.Audio(
                    label="Reference Audio (for voice cloning)",
                    type="filepath",
                    elem_classes="input-group",
                )
                gr.Markdown(_VOICE_CLONING_GUIDELINES, elem_classes="input-group")

    # Step 1: run synthesis; it fills the audio player and the error text.
    # Step 2: show the error box only when step 1 produced a non-empty message.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box],
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box],
    )

demo.launch()