Remsky committed on
Commit 3c8cbc9 · 1 Parent(s): b2ae79f

Added Multi-Voice, GPU Timeout, etc

Files changed (6):
  1. README.md +3 -1
  2. app.py +151 -91
  3. lib/file_utils.py +48 -42
  4. lib/ui_content.py +1 -1
  5. the_time_machine_hgwells.txt +0 -19
  6. tts_model.py +130 -173
README.md CHANGED
@@ -42,4 +42,6 @@ Main dependencies:
 - Transformers 4.47.1
 - HuggingFace Hub ≥0.25.1
 
-For a complete list, see requirements.txt.
+For a complete list, see requirements.txt.
+
+
app.py CHANGED
@@ -4,6 +4,8 @@ import spaces
 import time
 import matplotlib.pyplot as plt
 import numpy as np
+import torch
+import os
 from tts_model import TTSModel
 from lib import format_audio_output
 from lib.ui_content import header_html, demo_text_info
@@ -14,106 +16,78 @@ os.environ["HF_HOME"] = "/data/.huggingface"
 # Create TTS model instance
 model = TTSModel()
 
-@spaces.GPU(duration=10)  # Quick initialization
 def initialize_model():
     """Initialize model and get voices"""
     if model.model is None:
         if not model.initialize():
             raise gr.Error("Failed to initialize model")
-    return model.list_voices()
+
+    voices = model.list_voices()
+    if not voices:
+        raise gr.Error("No voices found. Please check the voices directory.")
+
+    return gr.update(choices=voices, value=[voices[0]] if voices else None)
 
-# Get initial voice list
-voice_list = initialize_model()
+def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf, progress_state, start_time, gpu_timeout, progress):
+    # Calculate time metrics
+    elapsed = time.time() - start_time
+    gpu_time_left = max(0, gpu_timeout - elapsed)
+
+    # Calculate chunk time more accurately
+    prev_total_time = sum(progress_state["chunk_times"]) if progress_state["chunk_times"] else 0
+    chunk_time = elapsed - prev_total_time
+
+    # Validate metrics before adding to state
+    if chunk_time > 0 and tokens_per_sec >= 0:
+        # Update progress state with validated metrics
+        progress_state["progress"] = chunk_num / total_chunks
+        progress_state["total_chunks"] = total_chunks
+        progress_state["gpu_time_left"] = gpu_time_left
+        progress_state["tokens_per_sec"].append(float(tokens_per_sec))
+        progress_state["rtf"].append(float(rtf))
+        progress_state["chunk_times"].append(chunk_time)
+
+    # Only update progress display during processing
+    progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
 
-@spaces.GPU(duration=120)  # Allow 2 minutes for processing
-def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_tqdm=False)):
+def generate_speech_from_ui(text, voice_names, speed, gpu_timeout, progress=gr.Progress(track_tqdm=False)):
     """Handle text-to-speech generation from the Gradio UI"""
     try:
+        if not text or not voice_names:
+            raise gr.Error("Please enter text and select at least one voice")
+
         start_time = time.time()
-        gpu_timeout = 120  # seconds
 
-        # Create progress state
+        # Create progress state with explicit type initialization
         progress_state = {
             "progress": 0.0,
-            "tokens_per_sec": [],
-            "rtf": [],
-            "chunk_times": [],
-            "gpu_time_left": gpu_timeout,
+            "tokens_per_sec": [],  # Initialize as empty list
+            "rtf": [],  # Initialize as empty list
+            "chunk_times": [],  # Initialize as empty list
+            "gpu_time_left": float(gpu_timeout),  # Ensure float
             "total_chunks": 0
         }
 
-        def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
-            progress_state["progress"] = chunk_num / total_chunks
-            progress_state["tokens_per_sec"].append(tokens_per_sec)
-            progress_state["rtf"].append(rtf)
-
-            # Update GPU time remaining
-            elapsed = time.time() - start_time
-            gpu_time_left = max(0, gpu_timeout - elapsed)
-            progress_state["gpu_time_left"] = gpu_time_left
-            progress_state["total_chunks"] = total_chunks
-
-            # Track individual chunk processing time
-            chunk_time = elapsed - (sum(progress_state["chunk_times"]) if progress_state["chunk_times"] else 0)
-            progress_state["chunk_times"].append(chunk_time)
-
-            # Only update progress display during processing
-            progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
+        # Handle single or multiple voices
+        if isinstance(voice_names, str):
+            voice_names = [voice_names]
 
-        # Generate speech with progress tracking
-        audio_array, duration = model.generate_speech(
+        # Generate speech with progress tracking using combined voice
+        audio_array, duration, metrics = model.generate_speech(
             text,
-            voice_name,
+            voice_names,
             speed,
-            progress_callback=update_progress
+            gpu_timeout=gpu_timeout,
+            progress_callback=update_progress,
+            progress_state=progress_state,
+            progress=progress
         )
 
         # Format output for Gradio
        audio_output, duration_text = format_audio_output(audio_array)
 
-        # Calculate final metrics
-        total_time = time.time() - start_time
-        total_duration = len(audio_array) / 24000  # audio duration in seconds
-        rtf = total_time / total_duration if total_duration > 0 else 0
-        mean_tokens_per_sec = np.mean(progress_state["tokens_per_sec"])
-
-        # Create plot of tokens per second with median line
-        fig, ax = plt.subplots(figsize=(10, 5))
-        fig.patch.set_facecolor('black')
-        ax.set_facecolor('black')
-        chunk_nums = list(range(1, len(progress_state["tokens_per_sec"]) + 1))
-
-        # Plot bars for tokens per second
-        ax.bar(chunk_nums, progress_state["tokens_per_sec"], color='#ff2a6d', alpha=0.8)
-
-        # Add median line
-        median_tps = np.median(progress_state["tokens_per_sec"])
-        ax.axhline(y=median_tps, color='#05d9e8', linestyle='--', label=f'Median: {median_tps:.1f} tokens/sec')
-
-        # Style improvements
-        ax.set_xlabel('Chunk Number', fontsize=24, labelpad=20)
-        ax.set_ylabel('Tokens per Second', fontsize=24, labelpad=20)
-        ax.set_title('Processing Speed by Chunk', fontsize=28, pad=30)
-
-        # Increase tick label size
-        ax.tick_params(axis='both', which='major', labelsize=20)
-
-        # Remove gridlines
-        ax.grid(False)
-
-        # Style legend and position it in bottom left
-        ax.legend(fontsize=20, facecolor='black', edgecolor='#05d9e8', loc='lower left')
-
-        plt.tight_layout()
-
-        # Prepare final metrics display including audio duration and real-time speed
-        metrics_text = (
-            f"Median Processing Speed: {np.median(progress_state['tokens_per_sec']):.1f} tokens/sec\n" +
-            f"Real-time Factor: {rtf:.3f}\n" +
-            f"Real Time Generation Speed: {int(1/rtf)}x \n" +
-            f"Processing Time: {int(total_time)}s\n" +
-            f"Output Audio Duration: {total_duration:.2f}s"
-        )
+        # Create plot and metrics text outside GPU context
+        fig, metrics_text = create_performance_plot(metrics, voice_names)
 
         return (
             audio_output,
@@ -123,6 +97,70 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
     except Exception as e:
         raise gr.Error(f"Generation failed: {str(e)}")
 
+def create_performance_plot(metrics, voice_names):
+    """Create performance plot and metrics text from generation metrics"""
+    # Clean and process the data
+    tokens_per_sec = np.array(metrics["tokens_per_sec"])
+    rtf_values = np.array(metrics["rtf"])
+
+    # Calculate statistics using cleaned data
+    median_tps = float(np.median(tokens_per_sec))
+    mean_tps = float(np.mean(tokens_per_sec))
+    std_tps = float(np.std(tokens_per_sec))
+
+    # Set y-axis limits based on data range
+    y_min = max(0, np.min(tokens_per_sec) * 0.9)
+    y_max = np.max(tokens_per_sec) * 1.1
+
+    # Create plot
+    fig, ax = plt.subplots(figsize=(10, 5))
+    fig.patch.set_facecolor('black')
+    ax.set_facecolor('black')
+
+    # Plot data points
+    chunk_nums = list(range(1, len(tokens_per_sec) + 1))
+    ax.bar(chunk_nums, tokens_per_sec, color='#ff2a6d', alpha=0.6)
+
+    # Set y-axis limits with padding
+    padding = 0.1 * (y_max - y_min)
+    ax.set_ylim(max(0, y_min - padding), y_max + padding)
+
+    # Add median line
+    ax.axhline(y=median_tps, color='#05d9e8', linestyle='--',
+               label=f'Median: {median_tps:.1f} tokens/sec')
+
+    # Style improvements
+    ax.set_xlabel('Chunk Number', fontsize=24, labelpad=20, color='white')
+    ax.set_ylabel('Tokens per Second', fontsize=24, labelpad=20, color='white')
+    ax.set_title('Processing Speed by Chunk', fontsize=28, pad=30, color='white')
+    ax.tick_params(axis='both', which='major', labelsize=20, colors='white')
+    ax.spines['bottom'].set_color('white')
+    ax.spines['top'].set_color('white')
+    ax.spines['left'].set_color('white')
+    ax.spines['right'].set_color('white')
+    ax.grid(False)
+    ax.legend(fontsize=20, facecolor='black', edgecolor='#05d9e8', loc='lower left',
+              labelcolor='white')
+
+    plt.tight_layout()
+
+    # Calculate average RTF from individual chunk RTFs
+    rtf = np.mean(rtf_values)
+
+    # Prepare metrics text
+    metrics_text = (
+        f"Median Speed: {median_tps:.1f} tokens/sec (o200k_base)\n" +
+        f"Real-time Factor: {rtf:.3f}\n" +
+        f"Real Time Speed: {int(1/rtf)}x\n" +
+        f"Processing Time: {int(metrics['total_time'])}s\n" +
+        f"Total Tokens: {metrics['total_tokens']} (o200k_base)\n" +
+        f"Voices: {', '.join(voice_names)}"
+    )
+
+    return fig, metrics_text
+
 # Create Gradio interface
 with gr.Blocks(title="Kokoro TTS Demo", css="""
 .equal-height {
@@ -135,12 +173,15 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
 
     with gr.Row():
         # Column 1: Text Input
+        with open("the_time_machine_hgwells.txt") as f:
+            text = f.readlines()[:200]
+            text = "".join(text)
         with gr.Column(elem_classes="equal-height"):
             text_input = gr.TextArea(
                 label="Text to speak",
                 placeholder="Enter text here or upload a .txt file",
                 lines=10,
-                value=open("the_time_machine_hgwells.txt").read()[:1000]
+                value=text
             )
 
         # Column 2: Controls
@@ -166,17 +207,17 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
             )
 
            with gr.Group():
-                default_voice = 'af_sky' if 'af_sky' in voice_list \
-                    else voice_list[0] \
-                    if voice_list else \
-                    None
-
                 voice_dropdown = gr.Dropdown(
-                    label="Voice",
-                    choices=voice_list,
-                    value=default_voice,
-                    allow_custom_value=True
+                    label="Voice(s)",
+                    choices=[],  # Start empty, will be populated after initialization
+                    value=None,
+                    allow_custom_value=True,
+                    multiselect=True
                 )
+
+                # Add refresh button to manually update voice list
+                refresh_btn = gr.Button("🔄 Refresh Voices", size="sm")
+
                 speed_slider = gr.Slider(
                     label="Speed",
                     minimum=0.5,
@@ -184,6 +225,14 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
                     value=1.0,
                     step=0.1
                 )
+                gpu_timeout_slider = gr.Slider(
+                    label="GPU Timeout (seconds)",
+                    minimum=15,
+                    maximum=120,
+                    value=60,
+                    step=1,
+                    info="Maximum time allowed for GPU processing"
+                )
             submit_btn = gr.Button("Generate Speech", variant="primary")
 
         # Column 3: Output
@@ -198,7 +247,7 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
             metrics_text = gr.Textbox(
                 label="Performance Summary",
                 interactive=False,
-                lines=4
+                lines=5
             )
             metrics_plot = gr.Plot(
                 label="Processing Metrics",
@@ -206,10 +255,15 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
                format="png"  # Explicitly set format to PNG which is supported by matplotlib
             )
 
-    # Set up event handler
+    # Set up event handlers
+    refresh_btn.click(
+        fn=initialize_model,
+        outputs=[voice_dropdown]
+    )
+
     submit_btn.click(
         fn=generate_speech_from_ui,
-        inputs=[text_input, voice_dropdown, speed_slider],
+        inputs=[text_input, voice_dropdown, speed_slider, gpu_timeout_slider],
         outputs=[audio_output, metrics_plot, metrics_text],
         show_progress=True
     )
@@ -218,6 +272,12 @@ with gr.Blocks(title="Kokoro TTS Demo", css="""
     with gr.Row():
         with gr.Column():
             gr.Markdown(demo_text_info)
+
+    # Initialize voices on load
+    demo.load(
+        fn=initialize_model,
+        outputs=[voice_dropdown]
+    )
 
 # Launch the app
 if __name__ == "__main__":
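
Note on the multi-voice feature wired up above: generate_speech now accepts a list of voice names and blends them into a single voicepack by averaging (see tts_model.py below). A minimal sketch of that idea, assuming each voice file is a .pt tensor of matching shape; the paths here are illustrative, not from the repo:

    import torch

    voice_paths = ["voices/af_sky.pt", "voices/af_bella.pt"]  # hypothetical paths
    voicepacks = [torch.load(p, weights_only=True) for p in voice_paths]

    # Equal-weight blend: element-wise mean over the stacked voice tensors.
    blended = torch.mean(torch.stack(voicepacks), dim=0)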
lib/file_utils.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import importlib.util
 import sys
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, snapshot_download
 from typing import List, Optional
 
 def load_module_from_file(module_name: str, file_path: str):
@@ -35,19 +35,39 @@ def ensure_dir(path: str) -> None:
     """Ensure directory exists, create if it doesn't"""
     os.makedirs(path, exist_ok=True)
 
+def find_voice_directory(start_path: str) -> str:
+    """Recursively search for directory containing .pt files that don't have 'kokoro' in the name"""
+    for root, dirs, files in os.walk(start_path):
+        pt_files = [f for f in files if f.endswith('.pt') and 'kokoro' not in f.lower()]
+        if pt_files:
+            return root
+    return ""
+
 def list_voice_files(voices_dir: str) -> List[str]:
     """List available voice files in directory"""
     voices = []
     try:
-        if not os.path.exists(voices_dir):
-            print(f"Voices directory does not exist: {voices_dir}")
-            return voices
+        # First try the standard locations
+        if os.path.exists(os.path.join(voices_dir, 'voices')):
+            voice_path = os.path.join(voices_dir, 'voices')
+        else:
+            voice_path = voices_dir
 
-        files = os.listdir(voices_dir)
+        # If no voices found, try recursive search
+        if not os.path.exists(voice_path) or not any(f.endswith('.pt') for f in os.listdir(voice_path)):
+            found_dir = find_voice_directory(os.path.dirname(voices_dir))
+            if found_dir:
+                voice_path = found_dir
+                print(f"Found voices in: {voice_path}")
+            else:
+                print("No voice directory found")
+                return voices
+
+        files = os.listdir(voice_path)
         print(f"Found {len(files)} files in voices directory")
 
         for file in files:
-            if file.endswith(".pt"):
+            if file.endswith(".pt") and 'kokoro' not in file.lower():
                 voice_name = file[:-3]  # Remove .pt extension
                 print(f"Found voice: {voice_name}")
                 voices.append(voice_name)
@@ -62,40 +82,26 @@ def list_voice_files(voices_dir: str) -> List[str]:
 
     return sorted(voices)
 
-def download_voice_files(repo_id: str, voices: List[str], voices_dir: str) -> None:
-    """Download voice files from Hugging Face Hub"""
-    ensure_dir(voices_dir)
-
-    for voice in voices:
-        try:
-            voice_path = os.path.join(voices_dir, voice)
-            print(f"Attempting to download voice {voice} to {voice_path}")
-
-            try:
-                downloaded_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=f"voices/{voice}",
-                    local_dir=voices_dir,
-                    local_dir_use_symlinks=False,
-                    force_filename=voice
-                )
-                print(f"Download completed to: {downloaded_path}")
-
-                if not os.path.exists(voice_path):
-                    print(f"Warning: File not found at expected path {voice_path}")
-                    print(f"Checking download location: {downloaded_path}")
-                    if os.path.exists(downloaded_path):
-                        print(f"Moving file from {downloaded_path} to {voice_path}")
-                        os.rename(downloaded_path, voice_path)
-                else:
-                    print(f"Verified voice file exists: {voice_path}")
-
-            except Exception as e:
-                print(f"Error downloading voice {voice}: {str(e)}")
-                import traceback
-                traceback.print_exc()
-
-        except Exception as e:
-            print(f"Error downloading voice {voice}: {str(e)}")
-            import traceback
-            traceback.print_exc()
+def download_voice_files(repo_id: str, directory: str, local_dir: str) -> None:
+    """Download voice files from Hugging Face Hub
+
+    Args:
+        repo_id: The Hugging Face repository ID
+        directory: The directory in the repo to download (e.g. "voices")
+        local_dir: Local directory to save files to
+    """
+    ensure_dir(local_dir)
+    try:
+        print(f"Downloading voice files from {repo_id}/{directory} to {local_dir}")
+        downloaded_path = snapshot_download(
+            repo_id=repo_id,
+            repo_type="model",
+            local_dir=local_dir,
+            allow_patterns=[f"{directory}/*"],
+            local_dir_use_symlinks=False
+        )
+        print(f"Download completed to: {downloaded_path}")
+    except Exception as e:
+        print(f"Error downloading voice files: {str(e)}")
+        import traceback
+        traceback.print_exc()
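
Note: the rewritten download_voice_files fetches the whole voices/ folder in one snapshot_download call filtered by allow_patterns, instead of one hf_hub_download per file. A usage sketch under the repo layout this commit assumes (the hexgrad/Kokoro-82M repo with a "voices" subdirectory):

    from lib.file_utils import download_voice_files, list_voice_files

    download_voice_files("hexgrad/Kokoro-82M", "voices", "voices")
    print(list_voice_files("voices"))  # e.g. ['af_bella', 'af_sky', ...]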
lib/ui_content.py CHANGED
@@ -13,7 +13,7 @@ header_html = """
 
 <div style="text-align: center; margin-bottom: 1rem;">
     <h1 style="font-size: 1.75rem; font-weight: bold; color: #ffffff; margin-bottom: 0.5rem;">Kokoro TTS Demo</h1>
-    <p style="color: #d1d5db;">Convert text to natural-sounding speech using various voices.</p>
+    <p style="color: #d1d5db;">Rapidly convert text to natural speech using various and blended voices.</p>
 </div>
 
 <div style="display: flex; gap: 1rem;">
the_time_machine_hgwells.txt CHANGED
@@ -1,22 +1,3 @@
-The Time Traveller (for so it will be convenient to speak of him) was
-expounding a recondite matter to us. His pale grey eyes shone and
-twinkled, and his usually pale face was flushed and animated. The fire
-burnt brightly, and the soft radiance of the incandescent lights in the
-lilies of silver caught the bubbles that flashed and passed in our
-glasses. Our chairs, being his patents, embraced and caressed us rather
-than submitted to be sat upon, and there was that luxurious
-after-dinner atmosphere, when thought runs gracefully free of the
-trammels of precision. And he put it to us in this way—marking the
-points with a lean forefinger—as we sat and lazily admired his
-earnestness over this new paradox (as we thought it) and his fecundity.
-
-“You must follow me carefully. I shall have to controvert one or two
-ideas that are almost universally accepted. The geometry, for instance,
-they taught you at school is founded on a misconception.”
-
-“Is not that rather a large thing to expect us to begin upon?” said
-Filby, an argumentative person with red hair.
-
 “I do not mean to ask you to accept anything without reasonable ground
 for it. You will soon admit as much as I need from you. You know of
 course that a mathematical line, a line of thickness _nil_, has no real
tts_model.py CHANGED
@@ -16,6 +16,7 @@ from lib import (
     ensure_dir,
     concatenate_audio_chunks
 )
+import spaces
 
 class TTSModel:
     """GPU-accelerated TTS model manager"""
@@ -25,6 +26,7 @@ class TTSModel:
         self.voices_dir = "voices"
         self.model_repo = "hexgrad/Kokoro-82M"
         ensure_dir(self.voices_dir)
+        self.model_path = None
 
         # Load required modules
         py_modules = ["istftnet", "plbert", "models", "kokoro"]
@@ -48,14 +50,14 @@
                 self.model_repo,
                 ["kokoro-v0_19.pth", "config.json"]
             )
-            model_path = model_files[0]  # kokoro-v0_19.pth
-
-            # Build model directly on GPU
-            with torch.cuda.device(0):
-                torch.cuda.set_device(0)
-                self.model = self.build_model(model_path, 'cuda')
-                self._model_on_gpu = True
+            self.model_path = model_files[0]  # kokoro-v0_19.pth
+
+            # Download voice files
+            download_voice_files(self.model_repo, "voices", self.voices_dir)
+
+            # Get list of available voices
+            available_voices = self.list_voices()
 
             print("Model initialization complete")
             return True
@@ -66,7 +68,7 @@
     def ensure_voice_downloaded(self, voice_name: str) -> bool:
         """Ensure specific voice is downloaded"""
         try:
-            voice_path = os.path.join(self.voices_dir, f"{voice_name}.pt")
+            voice_path = os.path.join(self.voices_dir, "voices", f"{voice_name}.pt")
             if not os.path.exists(voice_path):
                 print(f"Downloading voice {voice_name}.pt...")
                 download_voice_files(self.model_repo, [f"{voice_name}.pt"], self.voices_dir)
@@ -77,43 +79,58 @@
 
     def list_voices(self) -> List[str]:
         """List available voices"""
-        return [
-            "af_bella", "af_nicole", "af_sarah", "af_sky", "af",
-            "am_adam", "am_michael", "bf_emma", "bf_isabella",
-            "bm_george", "bm_lewis"
-        ]
+        voices = []
+        voices_subdir = os.path.join(self.voices_dir, "voices")
+        if os.path.exists(voices_subdir):
+            for file in os.listdir(voices_subdir):
+                if file.endswith(".pt"):
+                    voice_name = file[:-3]
+                    voices.append(voice_name)
+        return voices
 
-    def _ensure_model_on_gpu(self) -> None:
-        """Ensure model is on GPU and stays there"""
-        if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
-            print("Moving model to GPU...")
-            with torch.cuda.device(0):
-                torch.cuda.set_device(0)
-                if hasattr(self.model, 'to'):
-                    self.model.to('cuda')
-                else:
-                    for name in self.model:
-                        if isinstance(self.model[name], torch.Tensor):
-                            self.model[name] = self.model[name].cuda()
-            self._model_on_gpu = True
+    # def _ensure_model_on_gpu(self) -> None:
+    #     """Ensure model is on GPU and stays there"""
+    #     if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
+    #         print("Moving model to GPU...")
+    #         with torch.cuda.device(0):
+    #             torch.cuda.set_device(0)
+    #             if hasattr(self.model, 'to'):
+    #                 self.model.to('cuda')
+    #             else:
+    #                 for name in self.model:
+    #                     if isinstance(self.model[name], torch.Tensor):
+    #                         self.model[name] = self.model[name].cuda()
+    #         self._model_on_gpu = True
 
     def _generate_audio(self, text: str, voicepack: torch.Tensor, lang: str, speed: float) -> np.ndarray:
         """GPU-accelerated audio generation"""
         try:
             with torch.cuda.device(0):
                 torch.cuda.set_device(0)
-
-                # Move everything to GPU in a single context
-                if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
-                    print("Moving model to GPU...")
-                    if hasattr(self.model, 'to'):
-                        self.model.to('cuda')
-                    else:
-                        for name in self.model:
-                            if isinstance(self.model[name], torch.Tensor):
-                                self.model[name] = self.model[name].cuda()
-                    self._model_on_gpu = True
-
+                try:
+                    # Build model if needed
+                    if self.model is None:
+                        print("Building model...")
+                        device = torch.device('cuda')
+                        self.model = self.build_model(self.model_path, device=device)
+                        if self.model is None:
+                            raise ValueError("Failed to build model")
+                        print("Model built successfully")
+
+                    # Move model to GPU if needed
+                    if not hasattr(self.model, '_on_gpu'):
+                        print("Moving model to GPU...")
+                        if hasattr(self.model, 'to'):
+                            self.model = self.model.to('cuda')
+                        else:
+                            for name in self.model:
+                                if isinstance(self.model[name], torch.Tensor):
+                                    self.model[name] = self.model[name].cuda()
+                        self.model._on_gpu = True
+                except Exception as e:
+                    print(f"Error building model: {str(e)}")
+                    print("Attempting to continue")
+                    raise e
                 # Move voicepack to GPU
                 voicepack = voicepack.cuda()
 
@@ -131,59 +148,73 @@
         except Exception as e:
             print(f"Error in audio generation: {str(e)}")
             raise e
-
-    def generate_speech(self, text: str, voice_name: str, speed: float = 1.0, progress_callback=None) -> Tuple[np.ndarray, float]:
+
+    @spaces.GPU(duration=None)  # Duration will be set by the UI
+    def generate_speech(self, text: str, voice_names: list[str], speed: float = 1.0, gpu_timeout: int = 60, progress_callback=None, progress_state=None, progress=None) -> Tuple[np.ndarray, float]:
         """Generate speech from text. Returns (audio_array, duration)
 
         Args:
             text: Input text to convert to speech
             voice_name: Name of voice to use
             speed: Speech speed multiplier
-            progress_callback: Optional callback function(chunk_num, total_chunks, tokens_per_sec, rtf)
+            progress_callback: Optional callback function(chunk_num, total_chunks, tokens_per_sec, rtf, progress_state, start_time, gpu_timeout, progress)
+            progress_state: Dictionary tracking generation progress metrics
+            progress: Progress callback from Gradio
         """
         try:
-            if not text or not voice_name:
-                raise ValueError("Text and voice name are required")
-
             start_time = time.time()
-
-            # Count tokens and normalize text
-            total_tokens = count_tokens(text)
-            text = normalize_text(text)
-            if not text:
-                raise ValueError("Text is empty after normalization")
-
-            # Load voice and process within GPU context
             with torch.cuda.device(0):
                 torch.cuda.set_device(0)
+                if not text or not voice_names:
+                    raise ValueError("Text and voice name are required")
+                # Build model directly on GPU
 
-                voice_path = os.path.join(self.voices_dir, f"{voice_name}.pt")
-
-                # Ensure voice is downloaded and load directly to GPU
-                if not self.ensure_voice_downloaded(voice_name):
-                    raise ValueError(f"Failed to download voice: {voice_name}")
-                voicepack = torch.load(voice_path, map_location='cuda', weights_only=True)
-
-                # Break text into chunks for better memory management
-                chunks = chunk_text(text)
-                print(f"Processing {len(chunks)} chunks...")
-
-                # Ensure model is initialized and on GPU
+                # Build model if needed
                 if self.model is None:
-                    print("Model not initialized, reinitializing...")
-                    if not self.initialize():
-                        raise ValueError("Failed to initialize model")
+                    print("Building model...")
+                    self.model = self.build_model(self.model_path, device='cuda')
+                    if self.model is None:
+                        raise ValueError("Failed to build model")
+                    print("Model built successfully")
 
                 # Move model to GPU if needed
-                if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
+                if not hasattr(self.model, '_on_gpu'):
                     print("Moving model to GPU...")
                     if hasattr(self.model, 'to'):
-                        self.model.to('cuda')
+                        self.model = self.model.to('cuda')
                     else:
                         for name in self.model:
                            if isinstance(self.model[name], torch.Tensor):
                                self.model[name] = self.model[name].cuda()
-                    self._model_on_gpu = True
+                    self.model._on_gpu = True
+
+                t_voices = []
+                if isinstance(voice_names, list) and len(voice_names) > 1:
+                    for voice in voice_names:
+                        try:
+                            voice_path = os.path.join(self.voices_dir, "voices", f"{voice}.pt")
+                            voicepack = torch.load(voice_path, weights_only=True)
+                            t_voices.append(voicepack)
+                        except Exception as e:
+                            print(f"Warning: Failed to load voice {voice}: {str(e)}")
+
+                    # Combine voices by taking mean
+                    voicepack = torch.mean(torch.stack(t_voices), dim=0)
+                    voice_name = "_".join(voice_names)
+                else:
+                    voice_name = voice_names[0]
+                    voice_path = os.path.join(self.voices_dir, "voices", f"{voice_name}.pt")
+                    voicepack = torch.load(voice_path, weights_only=True)
+
+                # Count tokens and normalize text
+                total_tokens = count_tokens(text)
+                text = normalize_text(text)
+                if not text:
+                    raise ValueError("Text is empty after normalization")
+
+                # Break text into chunks for better memory management
+                chunks = chunk_text(text)
+                print(f"Processing {len(chunks)} chunks...")
 
                 # Process all chunks within same GPU context
                 audio_chunks = []
@@ -202,11 +233,13 @@
                     )
                     chunk_time = time.time() - chunk_start
 
-                    # Update metrics
+                    # Calculate per-chunk metrics
                    chunk_tokens = count_tokens(chunk)
+                    chunk_tokens_per_sec = chunk_tokens / chunk_time
+
+                    # Update totals for overall stats
                    total_processed_tokens += chunk_tokens
                    total_processed_time += chunk_time
-                    current_tokens_per_sec = total_processed_tokens / total_processed_time
 
                     # Calculate processing speed metrics
                     chunk_duration = len(chunk_audio) / 24000  # audio duration in seconds
@@ -216,7 +249,7 @@
                     chunk_times.append(chunk_time)
                     chunk_sizes.append(len(chunk))
                     print(f"Chunk {i+1}/{len(chunks)} processed in {chunk_time:.2f}s")
-                    print(f"Current tokens/sec: {current_tokens_per_sec:.2f}")
+                    print(f"Current tokens/sec: {chunk_tokens_per_sec:.2f}")
                     print(f"Real-time factor: {rtf:.2f}x")
                     print(f"{times_faster:.1f}x faster than real-time")
 
@@ -224,109 +257,33 @@
 
                     # Call progress callback if provided
                     if progress_callback:
-                        progress_callback(i + 1, len(chunks), current_tokens_per_sec, rtf)
+                        progress_callback(
+                            i + 1,  # chunk_num
+                            len(chunks),  # total_chunks
+                            chunk_tokens_per_sec,  # Pass per-chunk rate instead of cumulative
+                            rtf,
+                            progress_state,  # Added
+                            start_time,  # Added
+                            gpu_timeout,  # Use the timeout value from UI
+                            progress  # Added
+                        )
 
             # Concatenate audio chunks
             audio = concatenate_audio_chunks(audio_chunks)
-
-            def setup_plot(fig, ax, title):
-                """Configure plot styling"""
-                # Improve grid
-                ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
-
-                # Set title and labels with better fonts and more padding
-                ax.set_title(title, pad=40, fontsize=16, fontweight="bold", color="#ffffff")
-                ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
-                ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
-
-                # Improve tick labels
-                ax.tick_params(labelsize=12, colors="#ffffff")
-
-                # Style spines
-                for spine in ax.spines.values():
-                    spine.set_color("#ffffff")
-                    spine.set_alpha(0.3)
-                    spine.set_linewidth(0.5)
-
-                # Set background colors
-                ax.set_facecolor("#1a1a2e")
-                fig.patch.set_facecolor("#1a1a2e")
-
-                return fig, ax
-
-            # Set dark style
-            plt.style.use("dark_background")
-
-            # Create figure with subplots
-            fig = plt.figure(figsize=(18, 16))
-            fig.patch.set_facecolor("#1a1a2e")
-
-            # Create subplot grid
-            gs = plt.GridSpec(2, 1, left=0.15, right=0.85, top=0.9, bottom=0.15, hspace=0.4)
-
-            # Processing times plot
-            ax1 = plt.subplot(gs[0])
-            chunks_x = list(range(1, len(chunks) + 1))
-            bars = ax1.bar(chunks_x, chunk_times, color='#ff2a6d', alpha=0.8)
-
-            # Add statistics lines
-            mean_time = mean(chunk_times)
-            median_time = median(chunk_times)
-            std_time = stdev(chunk_times) if len(chunk_times) > 1 else 0
-
-            ax1.axhline(y=mean_time, color='#05d9e8', linestyle='--',
-                        label=f'Mean: {mean_time:.2f}s')
-            ax1.axhline(y=median_time, color='#d1f7ff', linestyle=':',
-                        label=f'Median: {median_time:.2f}s')
-
-            # Add ±1 std dev range
-            if len(chunk_times) > 1:
-                ax1.axhspan(mean_time - std_time, mean_time + std_time,
-                            color='#8c1eff', alpha=0.2, label='±1 Std Dev')
-
-            # Add value labels on top of bars
-            for bar in bars:
-                height = bar.get_height()
-                ax1.text(bar.get_x() + bar.get_width() / 2.0,
-                         height,
-                         f'{height:.2f}s',
-                         ha='center',
-                         va='bottom',
-                         color='white',
-                         fontsize=10)
-
-            ax1.set_xlabel('Chunk Number')
-            ax1.set_ylabel('Processing Time (seconds)')
-            setup_plot(fig, ax1, 'Chunk Processing Times')
-            ax1.legend(facecolor="#1a1a2e", edgecolor="#ffffff")
-
-            # Chunk sizes plot
-            ax2 = plt.subplot(gs[1])
-            ax2.plot(chunks_x, chunk_sizes, color='#ff9e00', marker='o', linewidth=2)
-            ax2.set_xlabel('Chunk Number')
-            ax2.set_ylabel('Chunk Size (chars)')
-            setup_plot(fig, ax2, 'Chunk Sizes')
-
-            # Save plot
-            plt.savefig('chunk_times.png', format='png')
-            plt.close()
-
-            # Calculate metrics
-            total_time = time.time() - start_time
-            tokens_per_second = total_tokens / total_time
-
-            print(f"\nProcessing Metrics:")
-            print(f"Total tokens: {total_tokens}")
-            print(f"Total time: {total_time:.2f}s")
-            print(f"Tokens per second: {tokens_per_second:.2f}")
-            print(f"Mean chunk time: {mean_time:.2f}s")
-            print(f"Median chunk time: {median_time:.2f}s")
-            if len(chunk_times) > 1:
-                print(f"Std dev: {std_time:.2f}s")
-            print(f"\nChunk time plot saved as 'chunk_times.png'")
-
-            return audio, len(audio) / 24000  # Return audio array and duration
-
+
+            # Return audio and metrics
+            return (
+                audio,  # Audio array
+                len(audio) / 24000,  # Duration
+                {
+                    "chunk_times": chunk_times,
+                    "chunk_sizes": chunk_sizes,
+                    "tokens_per_sec": [float(x) for x in progress_state["tokens_per_sec"]],
+                    "rtf": [float(x) for x in progress_state["rtf"]],
+                    "total_tokens": total_tokens,
+                    "total_time": time.time() - start_time
+                }
+            )
         except Exception as e:
            print(f"Error generating speech: {str(e)}")
            raise
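
Note on the per-chunk metrics above: tokens/sec is now computed per chunk rather than cumulatively, and the real-time factor compares processing time against audio duration at the 24 kHz output rate used throughout this commit. A worked sketch with illustrative numbers:

    chunk_tokens = 120            # tokens in this chunk (illustrative)
    chunk_time = 0.8              # seconds spent generating it
    chunk_audio_samples = 96000   # samples produced

    chunk_tokens_per_sec = chunk_tokens / chunk_time   # 150.0
    chunk_duration = chunk_audio_samples / 24000       # 4.0 s of audio
    rtf = chunk_time / chunk_duration                  # 0.2
    times_faster = 1 / rtf                             # 5.0x faster than real time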