fantos committed on
Commit
3a8be35
·
verified ·
1 Parent(s): 4978cb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -74
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import gradio as gr
3
  import outetts
4
- from outetts.version.v2.interface import _DEFAULT_SPEAKERS
5
  import torch
6
  import spaces
7
 
@@ -10,11 +10,7 @@ def get_available_speakers():
10
  return speakers
11
 
12
  @spaces.GPU
13
- def generate_tts(
14
- text, temperature, repetition_penalty,
15
- speaker_selection, reference_audio
16
- ):
17
-
18
  model_config = outetts.HFModelConfig_v2(
19
  model_path="OuteAI/OuteTTS-0.3-1B",
20
  tokenizer_path="OuteAI/OuteTTS-0.3-1B",
@@ -22,21 +18,15 @@ def generate_tts(
22
  device="cuda"
23
  )
24
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
25
-
26
- """Generate TTS with error handling and new features."""
27
  try:
28
- # Validate inputs for custom speaker
29
  if reference_audio:
30
  speaker = interface.create_speaker(reference_audio)
31
-
32
- # Use selected default speaker
33
  elif speaker_selection and speaker_selection != "None":
34
  speaker = interface.load_default_speaker(speaker_selection)
35
-
36
- # No speaker - random characteristics
37
  else:
38
  speaker = None
39
-
40
  gen_cfg = outetts.GenerationConfig(
41
  text=text,
42
  temperature=temperature,
@@ -45,79 +35,109 @@ def generate_tts(
45
  speaker=speaker,
46
  )
47
  output = interface.generate(config=gen_cfg)
48
-
49
- # Verify output
50
  if output.audio is None:
51
- raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")
52
-
53
- # Save and return output
54
  output_path = "output.wav"
55
  output.save(output_path)
56
  return output_path, None
57
-
58
  except Exception as e:
59
  return None, str(e)
60
 
61
- with gr.Blocks() as demo:
62
- gr.Markdown("# OuteTTS-0.3-1B Text-to-Speech Demo")
63
-
64
- error_box = gr.Textbox(label="Error Messages", visible=False)
65
-
66
- with gr.Row():
67
- with gr.Column():
68
-
69
- # Speaker selection
70
- speaker_dropdown = gr.Dropdown(
71
- choices=get_available_speakers(),
72
- value="en_male_1",
73
- label="Speaker Selection"
74
- )
75
-
76
- text_input = gr.Textbox(
77
- label="Text to Synthesize",
78
- placeholder="Enter text here..."
79
- )
80
-
81
- temperature = gr.Slider(
82
- 0.1, 1.0,
83
- value=0.1,
84
- label="Temperature (lower = more stable tone, higher = more expressive)"
85
- )
86
-
87
- repetition_penalty = gr.Slider(
88
- 0.5, 2.0,
89
- value=1.1,
90
- label="Repetition Penalty"
91
- )
92
-
93
- gr.Markdown("""
94
- ### Voice Cloning Guidelines:
95
- - Use around 7-10 seconds of clear, noise-free audio
96
- - For transcription interface will use Whisper turbo to transcribe the audio file
97
- - Longer audio clips will reduce maximum output length
98
- - Custom speaker overrides speaker selection
99
- """)
100
-
101
- reference_audio = gr.Audio(
102
- label="Reference Audio (for voice cloning)",
103
- type="filepath"
104
- )
105
-
106
- submit_button = gr.Button("Generate Speech")
107
-
108
- with gr.Column():
109
- audio_output = gr.Audio(
110
- label="Generated Audio",
111
- type="filepath"
112
- )
113
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  submit_button.click(
115
  fn=generate_tts,
116
  inputs=[
117
  text_input,
118
  temperature,
119
  repetition_penalty,
120
- speaker_dropdown,
121
  reference_audio,
122
  ],
123
  outputs=[audio_output, error_box]
 
1
  import os
2
  import gradio as gr
3
  import outetts
4
+ from outetts.version.v2.interface import _DEFAULT_SPEAKERS
5
  import torch
6
  import spaces
7
 
 
10
  return speakers
11
 
12
  @spaces.GPU
13
+ def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
 
 
 
 
14
  model_config = outetts.HFModelConfig_v2(
15
  model_path="OuteAI/OuteTTS-0.3-1B",
16
  tokenizer_path="OuteAI/OuteTTS-0.3-1B",
 
18
  device="cuda"
19
  )
20
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
21
+
 
22
  try:
 
23
  if reference_audio:
24
  speaker = interface.create_speaker(reference_audio)
 
 
25
  elif speaker_selection and speaker_selection != "None":
26
  speaker = interface.load_default_speaker(speaker_selection)
 
 
27
  else:
28
  speaker = None
29
+
30
  gen_cfg = outetts.GenerationConfig(
31
  text=text,
32
  temperature=temperature,
 
35
  speaker=speaker,
36
  )
37
  output = interface.generate(config=gen_cfg)
38
+
 
39
  if output.audio is None:
40
+ raise ValueError("Audio generation failed. Please try again.")
41
+
 
42
  output_path = "output.wav"
43
  output.save(output_path)
44
  return output_path, None
 
45
  except Exception as e:
46
  return None, str(e)
47
 
48
+ # Custom CSS for 3D effect and modern UI
49
+ custom_css = """
50
+ .container {
51
+ background: linear-gradient(145deg, #f0f0f0, #ffffff);
52
+ border-radius: 20px;
53
+ box-shadow: 20px 20px 60px #bebebe, -20px -20px 60px #ffffff;
54
+ padding: 2rem;
55
+ }
56
+ .title {
57
+ font-size: 2.5rem;
58
+ text-align: center;
59
+ background: linear-gradient(45deg, #2196F3, #00BCD4);
60
+ -webkit-background-clip: text;
61
+ -webkit-text-fill-color: transparent;
62
+ margin-bottom: 2rem;
63
+ }
64
+ .radio-group {
65
+ display: grid;
66
+ grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
67
+ gap: 1rem;
68
+ margin: 1rem 0;
69
+ }
70
+ .control-panel {
71
+ background: rgba(255, 255, 255, 0.9);
72
+ border-radius: 15px;
73
+ padding: 1.5rem;
74
+ margin: 1rem 0;
75
+ }
76
+ """
77
+
78
+ with gr.Blocks(css=custom_css) as demo:
79
+ with gr.Column(elem_classes="container"):
80
+ gr.Markdown("# Voice Clone Multilingual TTS", elem_classes="title")
81
+
82
+ with gr.Row():
83
+ with gr.Column(scale=2):
84
+ # Main input section with 3D effect
85
+ with gr.Group(elem_classes="control-panel"):
86
+ text_input = gr.Textbox(
87
+ label="Enter Text",
88
+ placeholder="Type your text here...",
89
+ lines=3
90
+ )
91
+
92
+ speaker_radio = gr.Radio(
93
+ choices=get_available_speakers(),
94
+ value="en_male_1",
95
+ label="Choose Voice",
96
+ elem_classes="radio-group"
97
+ )
98
+
99
+ with gr.Row():
100
+ temperature = gr.Slider(
101
+ minimum=0.1,
102
+ maximum=1.0,
103
+ value=0.1,
104
+ label="Expression Level"
105
+ )
106
+ repetition_penalty = gr.Slider(
107
+ minimum=0.5,
108
+ maximum=2.0,
109
+ value=1.1,
110
+ label="Clarity"
111
+ )
112
+
113
+ reference_audio = gr.Audio(
114
+ label="Upload Voice Reference",
115
+ type="filepath"
116
+ )
117
+
118
+ submit_button = gr.Button(
119
+ "Generate Speech",
120
+ variant="primary"
121
+ )
122
+
123
+ with gr.Column(scale=1):
124
+ # Output section
125
+ audio_output = gr.Audio(
126
+ label="Generated Audio",
127
+ type="filepath"
128
+ )
129
+ error_box = gr.Textbox(
130
+ label="Status",
131
+ visible=False
132
+ )
133
+
134
  submit_button.click(
135
  fn=generate_tts,
136
  inputs=[
137
  text_input,
138
  temperature,
139
  repetition_penalty,
140
+ speaker_radio,
141
  reference_audio,
142
  ],
143
  outputs=[audio_output, error_box]