# VoiceClone / app.py
# fantos's picture
# Update app.py
# 3a8be35 verified
# raw
# history blame
# 4.7 kB
import os
import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces
def get_available_speakers():
    """Return the keys of ``_DEFAULT_SPEAKERS`` as a new list.

    The result is used as the set of voice choices offered in the UI.
    """
    return [*_DEFAULT_SPEAKERS.keys()]
def _pick_speaker(interface, speaker_selection, reference_audio):
    """Return the speaker profile to use: cloned reference, built-in voice, or None."""
    # An uploaded reference clip takes precedence over the built-in voices.
    if reference_audio:
        return interface.create_speaker(reference_audio)
    if speaker_selection and speaker_selection != "None":
        return interface.load_default_speaker(speaker_selection)
    return None


@spaces.GPU
def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
    """Synthesize speech for ``text`` and report the outcome as a pair.

    Args:
        text: The text to speak.
        temperature: Sampling temperature forwarded to ``GenerationConfig``.
        repetition_penalty: Repetition penalty forwarded to
            ``GenerationConfig``.
        speaker_selection: Name of a built-in speaker. Ignored when
            ``reference_audio`` is given; a falsy value or the string
            ``"None"`` means "no speaker profile".
        reference_audio: Path of an audio file to clone the voice from, or a
            falsy value when no reference was uploaded.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, error_message)`` when the
        input is empty or any step of loading/generation fails. The function
        does not raise for failures inside generation; they are reported
        through the second element so the UI can display them.
    """
    import tempfile

    # Reject empty input up front instead of spending a model load on it.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    try:
        # The model is built inside the ``try`` so that load failures are
        # reported through the same (None, message) channel as generation
        # failures.
        # NOTE(review): the model/interface is constructed on every call;
        # consider caching it if call latency matters.
        model_config = outetts.HFModelConfig_v2(
            model_path="OuteAI/OuteTTS-0.3-1B",
            tokenizer_path="OuteAI/OuteTTS-0.3-1B",
            dtype=torch.bfloat16,
            device="cuda",
        )
        interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=_pick_speaker(interface, speaker_selection, reference_audio),
        )
        output = interface.generate(config=gen_cfg)
        if output.audio is None:
            return None, "Audio generation failed. Please try again."

        # A fixed "output.wav" would be shared by all concurrent requests, so
        # one user's audio could be overwritten by another's before it is
        # served. Reserve a unique path per call instead; ``delete=False``
        # keeps the file on disk after the handle is closed so it can be
        # written by ``output.save`` and read by the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # Top-level boundary for the UI callback: surface the message in the
        # status box rather than crashing the request.
        return None, str(e)
# Stylesheet handed to gr.Blocks(css=...). It defines the four class names
# that the layout attaches to components through ``elem_classes``:
#   .container     - gradient background, rounded corners and a soft double
#                    box-shadow (the "3D" look) around the whole page
#   .title         - large centered heading whose text is filled with a
#                    blue/cyan gradient
#   .radio-group   - auto-filling grid (min 150px columns) for the voice choices
#   .control-panel - translucent white rounded panel around the inputs
custom_css = """
.container {
background: linear-gradient(145deg, #f0f0f0, #ffffff);
border-radius: 20px;
box-shadow: 20px 20px 60px #bebebe, -20px -20px 60px #ffffff;
padding: 2rem;
}
.title {
font-size: 2.5rem;
text-align: center;
background: linear-gradient(45deg, #2196F3, #00BCD4);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 2rem;
}
.radio-group {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
gap: 1rem;
margin: 1rem 0;
}
.control-panel {
background: rgba(255, 255, 255, 0.9);
border-radius: 15px;
padding: 1.5rem;
margin: 1rem 0;
}
"""
# Voice pre-selected in the radio group. "en_male_1" is preferred, but it is
# only used when the installed speaker set actually contains it; otherwise the
# first available voice (or no pre-selection at all) is used so the default
# can never be a value that is missing from the choices.
_PREFERRED_SPEAKER = "en_male_1"
_speaker_choices = get_available_speakers()
if _PREFERRED_SPEAKER in _speaker_choices:
    _default_speaker = _PREFERRED_SPEAKER
else:
    _default_speaker = _speaker_choices[0] if _speaker_choices else None


def _status_visibility(message):
    """Show the status box only when it holds a non-empty message."""
    return gr.update(visible=bool(message))


# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been lost; confirm the exact placement of the reference
# audio input and the button against the intended layout.
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_classes="container"):
        gr.Markdown("# Voice Clone Multilingual TTS", elem_classes="title")

        with gr.Row():
            with gr.Column(scale=2):
                # Main input section with 3D effect
                with gr.Group(elem_classes="control-panel"):
                    text_input = gr.Textbox(
                        label="Enter Text",
                        placeholder="Type your text here...",
                        lines=3,
                    )

                    speaker_radio = gr.Radio(
                        choices=_speaker_choices,
                        value=_default_speaker,
                        label="Choose Voice",
                        elem_classes="radio-group",
                    )

                    with gr.Row():
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.1,
                            label="Expression Level",
                        )
                        repetition_penalty = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.1,
                            label="Clarity",
                        )

                    reference_audio = gr.Audio(
                        label="Upload Voice Reference",
                        type="filepath",
                    )

                    submit_button = gr.Button(
                        "Generate Speech",
                        variant="primary",
                    )

            with gr.Column(scale=1):
                # Output section
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="filepath",
                )
                # Hidden until a generation attempt reports an error message.
                error_box = gr.Textbox(
                    label="Status",
                    visible=False,
                )

    # generate_tts returns (audio_path, error_message); the follow-up step
    # then reveals or hides the status box depending on whether a message
    # was written into it.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_radio,
            reference_audio,
        ],
        outputs=[audio_output, error_box],
    ).then(
        fn=_status_visibility,
        inputs=[error_box],
        outputs=[error_box],
    )

# Only start the server when run as a script, so importing this module
# (e.g. for tooling or tests) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()