Spaces: Running on Zero
Enhance performance metrics visualization in app.py and update plot saving format in tts_model.py
Browse files
- app.py +63 -21
- tts_model.py +1 -1
app.py
CHANGED
@@ -2,6 +2,8 @@ import os
|
|
2 |
import gradio as gr
|
3 |
import spaces
|
4 |
import time
|
|
|
|
|
5 |
from tts_model import TTSModel
|
6 |
from lib import format_audio_output
|
7 |
|
@@ -32,18 +34,27 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
|
|
32 |
# Create progress state
|
33 |
progress_state = {
|
34 |
"progress": 0.0,
|
35 |
-
"tokens_per_sec":
|
36 |
-
"
|
|
|
|
|
|
|
37 |
}
|
38 |
|
39 |
def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
|
40 |
progress_state["progress"] = chunk_num / total_chunks
|
41 |
-
progress_state["tokens_per_sec"]
|
|
|
42 |
|
43 |
# Update GPU time remaining
|
44 |
elapsed = time.time() - start_time
|
45 |
gpu_time_left = max(0, gpu_timeout - elapsed)
|
46 |
progress_state["gpu_time_left"] = gpu_time_left
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
# Only update progress display during processing
|
49 |
progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
|
@@ -62,19 +73,51 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
|
|
62 |
# Calculate final metrics
|
63 |
total_time = time.time() - start_time
|
64 |
total_duration = len(audio_array) / 24000 # audio duration in seconds
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
#
|
|
|
|
|
|
|
|
|
|
|
68 |
metrics_text = (
|
69 |
-
f"
|
70 |
-
f"Real-time
|
71 |
-
f"
|
|
|
|
|
72 |
)
|
73 |
|
74 |
return (
|
75 |
audio_output,
|
76 |
-
|
77 |
-
|
78 |
)
|
79 |
except Exception as e:
|
80 |
raise gr.Error(f"Generation failed: {str(e)}")
|
@@ -83,11 +126,11 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
|
|
83 |
with gr.Blocks(title="Kokoro TTS Demo") as demo:
|
84 |
gr.HTML(
|
85 |
"""
|
86 |
-
<div style="display: flex; justify-content: flex-end; padding:
|
|
|
87 |
<a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">
|
88 |
-
<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-
|
89 |
</a>
|
90 |
-
<a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="no-preference: light; light: light; dark: dark;" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Repo for Local Use</a>
|
91 |
</div>
|
92 |
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
|
93 |
<h1>Kokoro TTS Demo</h1>
|
@@ -155,21 +198,21 @@ with gr.Blocks(title="Kokoro TTS Demo") as demo:
|
|
155 |
)
|
156 |
progress_bar = gr.Progress(track_tqdm=False)
|
157 |
metrics_text = gr.Textbox(
|
158 |
-
label="
|
159 |
interactive=False,
|
160 |
-
lines=
|
161 |
)
|
162 |
-
|
163 |
-
label="Processing
|
164 |
-
|
165 |
-
|
166 |
)
|
167 |
|
168 |
# Set up event handler
|
169 |
submit_btn.click(
|
170 |
fn=generate_speech_from_ui,
|
171 |
inputs=[text_input, voice_dropdown, speed_slider],
|
172 |
-
outputs=[audio_output,
|
173 |
show_progress=True
|
174 |
)
|
175 |
|
@@ -180,7 +223,6 @@ with gr.Blocks(title="Kokoro TTS Demo") as demo:
|
|
180 |
### Demo Text Info
|
181 |
The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
|
182 |
""")
|
183 |
-
|
184 |
|
185 |
# Launch the app
|
186 |
if __name__ == "__main__":
|
|
|
2 |
import gradio as gr
|
3 |
import spaces
|
4 |
import time
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import numpy as np
|
7 |
from tts_model import TTSModel
|
8 |
from lib import format_audio_output
|
9 |
|
|
|
34 |
# Create progress state
|
35 |
progress_state = {
|
36 |
"progress": 0.0,
|
37 |
+
"tokens_per_sec": [],
|
38 |
+
"rtf": [],
|
39 |
+
"chunk_times": [],
|
40 |
+
"gpu_time_left": gpu_timeout,
|
41 |
+
"total_chunks": 0
|
42 |
}
|
43 |
|
44 |
def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
|
45 |
progress_state["progress"] = chunk_num / total_chunks
|
46 |
+
progress_state["tokens_per_sec"].append(tokens_per_sec)
|
47 |
+
progress_state["rtf"].append(rtf)
|
48 |
|
49 |
# Update GPU time remaining
|
50 |
elapsed = time.time() - start_time
|
51 |
gpu_time_left = max(0, gpu_timeout - elapsed)
|
52 |
progress_state["gpu_time_left"] = gpu_time_left
|
53 |
+
progress_state["total_chunks"] = total_chunks
|
54 |
+
|
55 |
+
# Track individual chunk processing time
|
56 |
+
chunk_time = elapsed - (sum(progress_state["chunk_times"]) if progress_state["chunk_times"] else 0)
|
57 |
+
progress_state["chunk_times"].append(chunk_time)
|
58 |
|
59 |
# Only update progress display during processing
|
60 |
progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
|
|
|
73 |
# Calculate final metrics
|
74 |
total_time = time.time() - start_time
|
75 |
total_duration = len(audio_array) / 24000 # audio duration in seconds
|
76 |
+
rtf = total_time / total_duration if total_duration > 0 else 0
|
77 |
+
mean_tokens_per_sec = np.mean(progress_state["tokens_per_sec"])
|
78 |
+
|
79 |
+
# Create plot of tokens per second with median line
|
80 |
+
fig, ax = plt.subplots(figsize=(10, 5))
|
81 |
+
fig.patch.set_facecolor('black')
|
82 |
+
ax.set_facecolor('black')
|
83 |
+
chunk_nums = list(range(1, len(progress_state["tokens_per_sec"]) + 1))
|
84 |
+
|
85 |
+
# Plot bars for tokens per second
|
86 |
+
ax.bar(chunk_nums, progress_state["tokens_per_sec"], color='#ff2a6d', alpha=0.8)
|
87 |
+
|
88 |
+
# Add median line
|
89 |
+
median_tps = np.median(progress_state["tokens_per_sec"])
|
90 |
+
ax.axhline(y=median_tps, color='#05d9e8', linestyle='--', label=f'Median: {median_tps:.1f} tokens/sec')
|
91 |
+
|
92 |
+
# Style improvements
|
93 |
+
ax.set_xlabel('Chunk Number', fontsize=24, labelpad=20)
|
94 |
+
ax.set_ylabel('Tokens per Second', fontsize=24, labelpad=20)
|
95 |
+
ax.set_title('Processing Speed by Chunk', fontsize=28, pad=30)
|
96 |
+
|
97 |
+
# Increase tick label size
|
98 |
+
ax.tick_params(axis='both', which='major', labelsize=20)
|
99 |
+
|
100 |
+
# Remove gridlines
|
101 |
+
ax.grid(False)
|
102 |
|
103 |
+
# Style legend and position it in bottom left
|
104 |
+
ax.legend(fontsize=20, facecolor='black', edgecolor='#05d9e8', loc='lower left')
|
105 |
+
|
106 |
+
plt.tight_layout()
|
107 |
+
|
108 |
+
# Prepare final metrics display including audio duration and real-time speed
|
109 |
metrics_text = (
|
110 |
+
f"Median Processing Speed: {np.median(progress_state['tokens_per_sec']):.1f} tokens/sec\n" +
|
111 |
+
f"Real-time Factor: {rtf:.3f}\n" +
|
112 |
+
f"Real Time Generation Speed: {int(1/rtf)}x \n" +
|
113 |
+
f"Processing Time: {int(total_time)}s\n" +
|
114 |
+
f"Output Audio Duration: {total_duration:.2f}s"
|
115 |
)
|
116 |
|
117 |
return (
|
118 |
audio_output,
|
119 |
+
fig,
|
120 |
+
metrics_text
|
121 |
)
|
122 |
except Exception as e:
|
123 |
raise gr.Error(f"Generation failed: {str(e)}")
|
|
|
126 |
with gr.Blocks(title="Kokoro TTS Demo") as demo:
|
127 |
gr.HTML(
|
128 |
"""
|
129 |
+
<div style="display: flex; justify-content: flex-end; padding: 5px; gap: 5px;">
|
130 |
+
<a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="no-preference: light; light: light; dark: dark;" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Kokoro-FastAPI Repo</a>
|
131 |
<a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">
|
132 |
+
<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-lg-dark.svg" alt="Model on HF">
|
133 |
</a>
|
|
|
134 |
</div>
|
135 |
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
|
136 |
<h1>Kokoro TTS Demo</h1>
|
|
|
198 |
)
|
199 |
progress_bar = gr.Progress(track_tqdm=False)
|
200 |
metrics_text = gr.Textbox(
|
201 |
+
label="Performance Summary",
|
202 |
interactive=False,
|
203 |
+
lines=4
|
204 |
)
|
205 |
+
metrics_plot = gr.Plot(
|
206 |
+
label="Processing Metrics",
|
207 |
+
show_label=True,
|
208 |
+
format="png" # Explicitly set format to PNG which is supported by matplotlib
|
209 |
)
|
210 |
|
211 |
# Set up event handler
|
212 |
submit_btn.click(
|
213 |
fn=generate_speech_from_ui,
|
214 |
inputs=[text_input, voice_dropdown, speed_slider],
|
215 |
+
outputs=[audio_output, metrics_plot, metrics_text],
|
216 |
show_progress=True
|
217 |
)
|
218 |
|
|
|
223 |
### Demo Text Info
|
224 |
The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
|
225 |
""")
|
|
|
226 |
|
227 |
# Launch the app
|
228 |
if __name__ == "__main__":
|
tts_model.py
CHANGED
@@ -308,7 +308,7 @@ class TTSModel:
|
|
308 |
setup_plot(fig, ax2, 'Chunk Sizes')
|
309 |
|
310 |
# Save plot
|
311 |
-
plt.savefig('chunk_times.png')
|
312 |
plt.close()
|
313 |
|
314 |
# Calculate metrics
|
|
|
308 |
setup_plot(fig, ax2, 'Chunk Sizes')
|
309 |
|
310 |
# Save plot
|
311 |
+
plt.savefig('chunk_times.png', format='png')
|
312 |
plt.close()
|
313 |
|
314 |
# Calculate metrics
|