fantos committed on
Commit
5de3ec5
·
verified ·
1 Parent(s): 3dbba35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -92
app.py CHANGED
@@ -1,19 +1,13 @@
1
  import os
2
  import gradio as gr
3
  import outetts
 
4
  import torch
5
  import spaces
6
 
7
- # Define available speakers
8
- AVAILABLE_SPEAKERS = [
9
- "en_male_1", "en_male_2", "en_female_1", "en_female_2",
10
- "zh_male_1", "zh_male_2", "zh_female_1", "zh_female_2",
11
- "jp_male_1", "jp_male_2", "jp_female_1", "jp_female_2",
12
- "kr_male_1", "kr_male_2", "kr_female_1", "kr_female_2"
13
- ]
14
-
15
  def get_available_speakers():
16
- return AVAILABLE_SPEAKERS
 
17
 
18
  @spaces.GPU
19
  def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
@@ -24,15 +18,18 @@ def generate_tts(text, temperature, repetition_penalty, speaker_selection, refer
24
  device="cuda"
25
  )
26
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
27
-
28
  try:
 
29
  if reference_audio:
30
  speaker = interface.create_speaker(reference_audio)
 
31
  elif speaker_selection and speaker_selection != "None":
32
  speaker = interface.load_default_speaker(speaker_selection)
 
33
  else:
34
  speaker = None
35
-
36
  gen_cfg = outetts.GenerationConfig(
37
  text=text,
38
  temperature=temperature,
@@ -41,124 +38,148 @@ def generate_tts(text, temperature, repetition_penalty, speaker_selection, refer
41
  speaker=speaker,
42
  )
43
  output = interface.generate(config=gen_cfg)
44
-
 
45
  if output.audio is None:
46
- raise ValueError("Audio generation failed. Please try again.")
47
-
 
48
  output_path = "output.wav"
49
  output.save(output_path)
50
  return output_path, None
51
  except Exception as e:
52
  return None, str(e)
53
 
54
- # Custom CSS for 3D effect and modern UI
55
  custom_css = """
56
  .container {
57
- background: linear-gradient(145deg, #f0f0f0, #ffffff);
58
  border-radius: 20px;
59
- box-shadow: 20px 20px 60px #bebebe, -20px -20px 60px #ffffff;
60
  padding: 2rem;
 
 
61
  }
 
62
  .title {
63
  font-size: 2.5rem;
 
 
64
  text-align: center;
65
- background: linear-gradient(45deg, #2196F3, #00BCD4);
66
- -webkit-background-clip: text;
67
- -webkit-text-fill-color: transparent;
68
  margin-bottom: 2rem;
 
69
  }
70
- .radio-group {
71
- display: grid;
72
- grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
73
- gap: 1rem;
74
- margin: 1rem 0;
75
- }
76
- .control-panel {
77
- background: rgba(255, 255, 255, 0.9);
78
  border-radius: 15px;
79
  padding: 1.5rem;
80
  margin: 1rem 0;
81
- box-shadow: 0 8px 16px rgba(0,0,0,0.1);
82
  }
83
- .generate-button {
84
- background: linear-gradient(45deg, #2196F3, #00BCD4);
 
85
  color: white;
86
  border: none;
87
- padding: 1rem 2rem;
88
- border-radius: 8px;
 
89
  cursor: pointer;
90
- transition: transform 0.2s;
 
91
  }
92
- .generate-button:hover {
 
93
  transform: translateY(-2px);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  }
95
  """
96
 
 
97
  with gr.Blocks(css=custom_css) as demo:
98
- with gr.Column(elem_classes="container"):
99
- gr.Markdown("# Voice Clone Multilingual TTS", elem_classes="title")
100
-
101
- with gr.Row():
102
- with gr.Column(scale=2):
103
- # Main input section with 3D effect
104
- with gr.Group(elem_classes="control-panel"):
105
- text_input = gr.Textbox(
106
- label="Enter Text",
107
- placeholder="Type your text here...",
108
- lines=3
109
- )
110
-
111
- speaker_radio = gr.Radio(
112
- choices=get_available_speakers(),
113
- value="en_male_1",
114
- label="Choose Voice",
115
- elem_classes="radio-group"
116
- )
117
-
118
- with gr.Row():
119
- temperature = gr.Slider(
120
- minimum=0.1,
121
- maximum=1.0,
122
- value=0.1,
123
- label="Expression Level"
124
- )
125
- repetition_penalty = gr.Slider(
126
- minimum=0.5,
127
- maximum=2.0,
128
- value=1.1,
129
- label="Clarity"
130
- )
131
-
132
- reference_audio = gr.Audio(
133
- label="Upload Voice Reference",
134
- type="filepath"
135
- )
136
-
137
- submit_button = gr.Button(
138
- "Generate Speech",
139
- variant="primary",
140
- elem_classes="generate-button"
141
- )
142
-
143
- with gr.Column(scale=1):
144
- # Output section with 3D effect
145
- with gr.Group(elem_classes="control-panel"):
146
- audio_output = gr.Audio(
147
- label="Generated Audio",
148
- type="filepath"
149
- )
150
- error_box = gr.Textbox(
151
- label="Status",
152
- visible=False
153
- )
154
 
 
 
 
 
 
 
 
155
  submit_button.click(
156
  fn=generate_tts,
157
  inputs=[
158
  text_input,
159
  temperature,
160
  repetition_penalty,
161
- speaker_radio,
162
  reference_audio,
163
  ],
164
  outputs=[audio_output, error_box]
 
1
  import os
2
  import gradio as gr
3
  import outetts
4
+ from outetts.version.v2.interface import _DEFAULT_SPEAKERS
5
  import torch
6
  import spaces
7
 
 
 
 
 
 
 
 
 
8
  def get_available_speakers():
9
+ speakers = list(_DEFAULT_SPEAKERS.keys())
10
+ return speakers
11
 
12
  @spaces.GPU
13
  def generate_tts(text, temperature, repetition_penalty, speaker_selection, reference_audio):
 
18
  device="cuda"
19
  )
20
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
21
+
22
  try:
23
+ # Validate inputs for custom speaker
24
  if reference_audio:
25
  speaker = interface.create_speaker(reference_audio)
26
+ # Use selected default speaker
27
  elif speaker_selection and speaker_selection != "None":
28
  speaker = interface.load_default_speaker(speaker_selection)
29
+ # No speaker - random characteristics
30
  else:
31
  speaker = None
32
+
33
  gen_cfg = outetts.GenerationConfig(
34
  text=text,
35
  temperature=temperature,
 
38
  speaker=speaker,
39
  )
40
  output = interface.generate(config=gen_cfg)
41
+
42
+ # Verify output
43
  if output.audio is None:
44
+ raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")
45
+
46
+ # Save and return output
47
  output_path = "output.wav"
48
  output.save(output_path)
49
  return output_path, None
50
  except Exception as e:
51
  return None, str(e)
52
 
53
+ # Custom CSS for 3D styling
54
  custom_css = """
55
  .container {
56
+ background: linear-gradient(145deg, #f3f4f6, #ffffff);
57
  border-radius: 20px;
58
+ box-shadow: 10px 10px 20px #d1d1d1, -10px -10px 20px #ffffff;
59
  padding: 2rem;
60
+ margin: 1rem;
61
+ transition: all 0.3s ease;
62
  }
63
+
64
  .title {
65
  font-size: 2.5rem;
66
+ font-weight: bold;
67
+ color: #1a1a1a;
68
  text-align: center;
 
 
 
69
  margin-bottom: 2rem;
70
+ text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.1);
71
  }
72
+
73
+ .input-group {
74
+ background: #ffffff;
 
 
 
 
 
75
  border-radius: 15px;
76
  padding: 1.5rem;
77
  margin: 1rem 0;
78
+ box-shadow: inset 5px 5px 10px #e0e0e0, inset -5px -5px 10px #ffffff;
79
  }
80
+
81
+ .button-3d {
82
+ background: linear-gradient(145deg, #3b82f6, #2563eb);
83
  color: white;
84
  border: none;
85
+ padding: 0.8rem 1.5rem;
86
+ border-radius: 10px;
87
+ font-weight: bold;
88
  cursor: pointer;
89
+ transition: all 0.3s ease;
90
+ box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
91
  }
92
+
93
+ .button-3d:hover {
94
  transform: translateY(-2px);
95
+ box-shadow: 7px 7px 15px #d1d1d1, -7px -7px 15px #ffffff;
96
+ }
97
+
98
+ .slider-3d {
99
+ height: 12px;
100
+ border-radius: 6px;
101
+ background: linear-gradient(145deg, #e6e7eb, #ffffff);
102
+ box-shadow: inset 3px 3px 6px #d1d1d1, inset -3px -3px 6px #ffffff;
103
+ }
104
+
105
+ .error-box {
106
+ background: #fee2e2;
107
+ border-left: 4px solid #ef4444;
108
+ padding: 1rem;
109
+ border-radius: 8px;
110
+ margin: 1rem 0;
111
  }
112
  """
113
 
114
+ # Create the Gradio interface with 3D styling
115
  with gr.Blocks(css=custom_css) as demo:
116
+ gr.Markdown('<div class="title">Voice Clone Multilingual TTS</div>')
117
+
118
+ error_box = gr.Textbox(label="Error Messages", visible=False, elem_classes="error-box")
119
+
120
+ with gr.Row(elem_classes="container"):
121
+ with gr.Column():
122
+ # Speaker selection with 3D styling
123
+ speaker_dropdown = gr.Dropdown(
124
+ choices=get_available_speakers(),
125
+ value="en_male_1",
126
+ label="Speaker Selection",
127
+ elem_classes="input-group"
128
+ )
129
+
130
+ text_input = gr.Textbox(
131
+ label="Text to Synthesize",
132
+ placeholder="Enter text here...",
133
+ elem_classes="input-group"
134
+ )
135
+
136
+ temperature = gr.Slider(
137
+ 0.1, 1.0,
138
+ value=0.1,
139
+ label="Temperature (lower = more stable tone, higher = more expressive)",
140
+ elem_classes="slider-3d"
141
+ )
142
+
143
+ repetition_penalty = gr.Slider(
144
+ 0.5, 2.0,
145
+ value=1.1,
146
+ label="Repetition Penalty",
147
+ elem_classes="slider-3d"
148
+ )
149
+
150
+ gr.Markdown("""
151
+ ### Voice Cloning Guidelines:
152
+ - Use around 7-10 seconds of clear, noise-free audio
153
+ - For transcription interface will use Whisper turbo to transcribe the audio file
154
+ - Longer audio clips will reduce maximum output length
155
+ - Custom speaker overrides speaker selection
156
+ """, elem_classes="input-group")
157
+
158
+ reference_audio = gr.Audio(
159
+ label="Reference Audio (for voice cloning)",
160
+ type="filepath",
161
+ elem_classes="input-group"
162
+ )
163
+
164
+ submit_button = gr.Button(
165
+ "Generate Speech",
166
+ elem_classes="button-3d"
167
+ )
 
 
 
 
168
 
169
+ with gr.Column():
170
+ audio_output = gr.Audio(
171
+ label="Generated Audio",
172
+ type="filepath",
173
+ elem_classes="input-group"
174
+ )
175
+
176
  submit_button.click(
177
  fn=generate_tts,
178
  inputs=[
179
  text_input,
180
  temperature,
181
  repetition_penalty,
182
+ speaker_dropdown,
183
  reference_audio,
184
  ],
185
  outputs=[audio_output, error_box]