File size: 3,726 Bytes
de41fea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import gradio as gr
import outetts
from outetts.version.v2.interface import _DEFAULT_SPEAKERS
import torch
import spaces

def get_available_speakers():
    """Return the keys of ``_DEFAULT_SPEAKERS`` as a new list."""
    return [*_DEFAULT_SPEAKERS.keys()]

@spaces.GPU
def generate_tts(
        text, temperature, repetition_penalty,
        speaker_selection, reference_audio
    ):
    """Generate speech for ``text`` and report the result as a pair.

    Args:
        text: Text to synthesize.
        temperature: Sampling temperature passed to the generation config.
        repetition_penalty: Repetition penalty passed to the generation
            config.
        speaker_selection: Name of a default speaker to load. Ignored when
            ``reference_audio`` is given; an empty value or the string
            ``"None"`` means no speaker profile is used.
        reference_audio: Optional path to a reference recording. When
            provided, a speaker is created from it and it takes precedence
            over ``speaker_selection``.

    Returns:
        ``(output_path, None)`` on success, where ``output_path`` is the
        path of the saved WAV file, or ``(None, error_message)`` when
        anything fails. Exceptions are not propagated to the caller.
    """
    import tempfile

    # Reject empty input before paying for the model load.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    try:
        # Model setup lives inside the try block so that load failures are
        # reported through the same (None, message) channel as generation
        # failures.
        # NOTE(review): the model is rebuilt on every call; presumably this is
        # because CUDA is only usable inside the @spaces.GPU call - confirm
        # before hoisting it to module level.
        model_config = outetts.HFModelConfig_v2(
            model_path="OuteAI/OuteTTS-0.3-1B",
            tokenizer_path="OuteAI/OuteTTS-0.3-1B",
            dtype=torch.bfloat16,
            device="cuda"
        )
        interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

        if reference_audio:
            # Custom speaker built from the uploaded reference audio.
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            # Use selected default speaker.
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker - random characteristics.
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)

        # Verify output
        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Write to a unique temporary file so that concurrent requests do not
        # overwrite each other's audio. The file is intentionally not deleted
        # here because the caller still needs to read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        output.save(output_path)
        return output_path, None

    except Exception as e:
        # Top-level boundary for the UI: surface the message instead of raising.
        return None, str(e)

# Build the Gradio UI: input controls in the left column, generated audio in
# the right column, and a hidden textbox that is revealed only on errors.
with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.3-1B Text-to-Speech Demo")

    # Hidden by default; the chained callback at the bottom makes it visible
    # whenever generate_tts returns a non-empty error message.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():

            # Speaker selection
            # NOTE(review): generate_tts treats the string "None" as "no
            # speaker", but the choices here come only from
            # get_available_speakers() - confirm whether a "None" option is
            # meant to be offered.
            speaker_dropdown = gr.Dropdown(
                choices=get_available_speakers(),
                value="en_male_1",
                label="Speaker Selection"
            )

            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here..."
            )

            # Range 0.1-1.0, default 0.1.
            temperature = gr.Slider(
                0.1, 1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)"
            )

            # Range 0.5-2.0, default 1.1.
            repetition_penalty = gr.Slider(
                0.5, 2.0,
                value=1.1,
                label="Repetition Penalty"
            )

            gr.Markdown("""
### Voice Cloning Guidelines:
- Use around 7-10 seconds of clear, noise-free audio
- For transcription interface will use Whisper turbo to transcribe the audio file
- Longer audio clips will reduce maximum output length
- Custom speaker overrides speaker selection
            """)

            # type="filepath" hands generate_tts a path to the uploaded file.
            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath"
            )

            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            # Receives the file path returned by generate_tts.
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

    # The order of `inputs` must match generate_tts's positional parameters,
    # and `outputs` must match its (audio_path, error_message) return pair.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box]
    ).then(
        # Show the error box only when it holds a non-empty message.
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box]
    )

# Start the app; this runs whenever the module is executed or imported.
demo.launch()