"""Gradio demo: multilingual text-to-speech with optional voice cloning (OuteTTS 0.3)."""

import os
import tempfile

import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces

# Checkpoint used for both the model weights and the tokenizer.
MODEL_ID = "OuteAI/OuteTTS-0.3-1B"


def get_available_speakers():
    """Return the names of the default speakers bundled with outetts as a list."""
    return list(_DEFAULT_SPEAKERS)


def _resolve_speaker(interface, speaker_selection, reference_audio):
    """Pick the speaker profile: reference audio wins over the dropdown choice.

    Returns None when neither a reference clip nor a named speaker is given.
    """
    if reference_audio:
        return interface.create_speaker(reference_audio)
    if speaker_selection and speaker_selection != "None":
        return interface.load_default_speaker(speaker_selection)
    return None


@spaces.GPU
def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
    """Synthesize speech for ``text`` and return ``(audio_path, error_message)``.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        temperature: Sampling temperature forwarded to ``GenerationConfig``.
        repetition_penalty: Repetition penalty forwarded to ``GenerationConfig``.
        speaker_selection: Name of a default speaker, or a falsy value / the
            string ``"None"`` for no named speaker.
        reference_audio: Optional file path of a reference clip; when given it
            takes precedence over ``speaker_selection``.

    Returns:
        ``(path, None)`` on success, where ``path`` is the generated WAV file,
        or ``(None, message)`` when validation or generation fails.
    """
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # Everything that can fail (model loading included) lives inside the try
    # so that failures are reported through the UI's error box.
    try:
        # NOTE(review): the model is rebuilt on every call; consider caching
        # once it is confirmed to be safe under the spaces.GPU decorator.
        model_config = outetts.HFModelConfig_v2(
            model_path=MODEL_ID,
            tokenizer_path=MODEL_ID,
            dtype=torch.bfloat16,
            device="cuda",
        )
        interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=_resolve_speaker(interface, speaker_selection, reference_audio),
        )
        output = interface.generate(config=gen_cfg)
        if output.audio is None:
            raise ValueError(
                "Model failed to generate audio. "
                "This may be due to input length constraints or early EOS token."
            )

        # A unique file per request: a shared "output.wav" would let
        # concurrent requests overwrite each other's audio.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        output.save(output_path)
        return output_path, None
    except Exception as e:  # UI boundary: surface any failure as a message
        return None, str(e)


with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange") as demo:
    gr.Markdown("# Voice Clone Multilingual TTS")

    # Hidden until a generation attempt produces an error message.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                lines=8
            )
            submit_button = gr.Button("Generate Speech")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

            with gr.Group():
                speaker_dropdown = gr.Dropdown(
                    choices=get_available_speakers(),
                    value="en_male_1",
                    label="Speaker Selection"
                )
                temperature = gr.Slider(
                    0.1, 1.0,
                    value=0.1,
                    label="Temperature (lower = more stable tone, higher = more expressive)"
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0,
                    value=1.1,
                    label="Repetition Penalty"
                )
                reference_audio = gr.Audio(
                    label="Reference Audio (for voice cloning)",
                    type="filepath"
                )
                gr.Markdown("""
                ### Voice Cloning Guidelines:
                - Use around 7-10 seconds of clear, noise-free audio
                - For transcription interface will use Whisper turbo to transcribe the audio file
                - Longer audio clips will reduce maximum output length
                - Custom speaker overrides speaker selection
                """)

    # Run generation, then show the error box only if it received a message.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box]
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box]
    )

demo.launch()