# Hugging Face Space status (page header): Running on Zero
import os
import tempfile

import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces
def get_available_speakers():
    """Return the names of the bundled default speakers as a list.

    The names are the keys of ``_DEFAULT_SPEAKERS`` imported from the
    outetts v2 interface module.
    """
    return list(_DEFAULT_SPEAKERS)
def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
    """Synthesize speech for ``text`` and report the outcome as a pair.

    Args:
        text: The text to synthesize.
        temperature: Sampling temperature forwarded to the generation config.
        repetition_penalty: Repetition penalty forwarded to the generation
            config.
        speaker_selection: Name of a default speaker, or ``"None"`` / empty
            for no speaker profile.
        reference_audio: Path to a reference audio file; when given, a custom
            speaker is created from it and ``speaker_selection`` is ignored.

    Returns:
        ``(audio_path, None)`` on success, or ``(None, error_message)`` when
        the input is empty or any step of loading/generation fails.
    """
    # NOTE(review): `spaces` is imported by this file but never used. If the
    # app is deployed on ZeroGPU hardware, this handler presumably needs the
    # `@spaces.GPU` decorator to get CUDA access -- confirm before deploying.

    # Reject empty input up front so we never load the model for nothing.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    try:
        # Model construction lives inside the try block so that load/CUDA
        # failures are reported through the error box like any other failure.
        model_config = outetts.HFModelConfig_v2(
            model_path="OuteAI/OuteTTS-0.3-1B",
            tokenizer_path="OuteAI/OuteTTS-0.3-1B",
            dtype=torch.bfloat16,
            device="cuda",
        )
        interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

        if reference_audio:
            # Custom speaker cloned from the uploaded reference audio.
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            # One of the bundled default speakers.
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker profile is passed to the generation config.
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)

        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Use a unique file per request: a fixed "output.wav" would be
        # overwritten when several users generate audio at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # UI boundary: turn every failure into a message for the error box.
        # Fall back to repr() so exceptions with an empty message stay visible.
        return None, str(e) or repr(e)
# Custom CSS for 3D styling.
# Each class below is attached to Gradio components through `elem_classes`.
custom_css = """
.container {
    background: linear-gradient(145deg, #f3f4f6, #ffffff);
    border-radius: 20px;
    box-shadow: 10px 10px 20px #d1d1d1, -10px -10px 20px #ffffff;
    padding: 2rem;
    margin: 1rem;
    transition: all 0.3s ease;
}
.title {
    font-size: 2.5rem;
    font-weight: bold;
    color: #1a1a1a;
    text-align: center;
    margin-bottom: 2rem;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.1);
}
.input-group {
    background: #ffffff;
    border-radius: 15px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: inset 5px 5px 10px #e0e0e0, inset -5px -5px 10px #ffffff;
}
.button-3d {
    background: linear-gradient(145deg, #3b82f6, #2563eb);
    color: white;
    border: none;
    padding: 0.8rem 1.5rem;
    border-radius: 10px;
    font-weight: bold;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
}
.button-3d:hover {
    transform: translateY(-2px);
    box-shadow: 7px 7px 15px #d1d1d1, -7px -7px 15px #ffffff;
}
.slider-3d {
    height: 12px;
    border-radius: 6px;
    background: linear-gradient(145deg, #e6e7eb, #ffffff);
    box-shadow: inset 3px 3px 6px #d1d1d1, inset -3px -3px 6px #ffffff;
}
.error-box {
    background: #fee2e2;
    border-left: 4px solid #ef4444;
    padding: 1rem;
    border-radius: 8px;
    margin: 1rem 0;
}
.right-column {
    display: flex;
    flex-direction: column;
    gap: 1rem;
}
.options-panel {
    margin-top: 2rem;
    background: linear-gradient(145deg, #f3f4f6, #ffffff);
    border-radius: 15px;
    padding: 1.5rem;
    box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
}
"""
# Create the Gradio interface with 3D styling
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown('<div class="title">Voice Clone Multilingual TTS</div>')

    # Hidden until a generation attempt returns an error message.
    error_box = gr.Textbox(label="Error Messages", visible=False, elem_classes="error-box")

    with gr.Row(elem_classes="container"):
        # Left column for text input
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                elem_classes="input-group",
                lines=5,
            )
            submit_button = gr.Button(
                "Generate Speech",
                elem_classes="button-3d",
            )

        # Right column for output and options
        with gr.Column(scale=1, elem_classes="right-column"):
            # Audio output at the top
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                elem_classes="input-group",
            )

            # Options panel below the output.
            # gr.Group is used instead of gr.Box because gr.Box was removed
            # in Gradio 4; gr.Group is available in both 3.x and 4.x.
            with gr.Group(elem_classes="options-panel"):
                speaker_dropdown = gr.Dropdown(
                    choices=get_available_speakers(),
                    value="en_male_1",
                    label="Speaker Selection",
                    elem_classes="input-group",
                )
                temperature = gr.Slider(
                    0.1, 1.0,
                    value=0.1,
                    label="Temperature (lower = more stable tone, higher = more expressive)",
                    elem_classes="slider-3d",
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0,
                    value=1.1,
                    label="Repetition Penalty",
                    elem_classes="slider-3d",
                )
                reference_audio = gr.Audio(
                    label="Reference Audio (for voice cloning)",
                    type="filepath",
                    elem_classes="input-group",
                )
                gr.Markdown("""
                ### Voice Cloning Guidelines:
                - Use around 7-10 seconds of clear, noise-free audio
                - For transcription interface will use Whisper turbo to transcribe the audio file
                - Longer audio clips will reduce maximum output length
                - Custom speaker overrides speaker selection
                """, elem_classes="input-group")

    # Run generation first, then show the error box only when the handler
    # returned a non-empty error message (its second output).
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box],
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box],
    )

demo.launch()