Remsky committed
Commit cb7f5d0 · 1 Parent(s): d21e919

Add audio and text utility modules, update requirements, and revise README

Files changed (8):
  1. README.md +26 -27
  2. app.py +114 -30
  3. lib/__init__.py +34 -0
  4. lib/audio_utils.py +23 -0
  5. lib/file_utils.py +101 -0
  6. lib/text_utils.py +56 -0
  7. requirements.txt +1 -1
  8. tts_model.py +213 -210
README.md CHANGED
@@ -1,47 +1,46 @@
 ---
-title: Kokoro TTS Zero
-emoji: 🎴
-colorFrom: gray
+title: Kokoro TTS Demo
+emoji: 🎙️
+colorFrom: blue
 colorTo: purple
 sdk: gradio
 sdk_version: 5.9.1
 app_file: app.py
-pinned: true
-license: apache-2.0
-short_description: A100 GPU Accelerated Inference applied to Kokoro-82M TTS
-models:
-- hexgrad/Kokoro-82M
+pinned: false
+license: mit
 ---
 
 # Kokoro TTS Demo Space
 
 A Zero GPU-optimized Hugging Face Space for the Kokoro TTS model.
-
 ## Overview
 
 This Space provides a Gradio interface for the Kokoro TTS model, allowing users to:
 - Convert text to speech using multiple voices
 - Adjust speech speed
-- Get instant audio playback
-
-## Technical Details
-
-- Zero GPU for efficient GPU resource management
-- Dynamically loads required modules from hexgrad/Kokoro-82M repository
-
-All dependencies are automatically handled:
-- Core modules (kokoro.py, models.py, etc.) are downloaded from hexgrad/Kokoro-82M
-- Model weights and voice files are cached in /data/.huggingface
-- System dependencies (espeak-ng) are installed via packages.txt
-
-## Environment
-
-- Python 3.10.13
+## Project Structure
+
+```
+.
+├── app.py              # Main Gradio interface
+├── tts_model.py        # GPU-accelerated TTS model manager
+├── lib/                # Utility modules
+│   ├── __init__.py     # Package exports
+│   ├── text_utils.py   # Text processing utilities
+│   ├── file_utils.py   # File operations
+│   └── audio_utils.py  # Audio processing
+└── requirements.txt    # Project dependencies
+```
+
+## Dependencies
+
+Main dependencies:
 - PyTorch 2.2.2
 - Gradio 5.9.1
-- A100 Zero GPU Enabled
-
-## Notes
-- Model warm-up takes some time; it shines at longer lengths.
+- Transformers 4.47.1
+- HuggingFace Hub ≥0.25.1
+
+For a complete list, see requirements.txt.
app.py CHANGED
@@ -1,8 +1,9 @@
 import os
 import gradio as gr
 import spaces
+import time
 from tts_model import TTSModel
-import numpy as np
+from lib import format_audio_output
 
 # Set HF_HOME for faster restarts with cached models/voices
 os.environ["HF_HOME"] = "/data/.huggingface"
@@ -22,81 +23,164 @@ def initialize_model():
 voice_list = initialize_model()
 
 @spaces.GPU(duration=120)  # Allow 2 minutes for processing
-def generate_speech_from_ui(text, voice_name, speed):
+def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_tqdm=False)):
     """Handle text-to-speech generation from the Gradio UI"""
     try:
-        audio_array, duration = model.generate_speech(text, voice_name, speed)
-        # Convert float array to int16 range (-32768 to 32767)
-        audio_array = np.array(audio_array, dtype=np.float32)
-        audio_array = (audio_array * 32767).astype(np.int16)
-        return (24000, audio_array), f"Audio Duration: {duration:.2f} seconds\nProcessing complete - check console for detailed metrics"
+        start_time = time.time()
+        gpu_timeout = 120  # seconds
+
+        # Create progress state
+        progress_state = {
+            "progress": 0.0,
+            "tokens_per_sec": 0.0,
+            "gpu_time_left": gpu_timeout
+        }
+
+        def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
+            progress_state["progress"] = chunk_num / total_chunks
+            progress_state["tokens_per_sec"] = tokens_per_sec
+
+            # Update GPU time remaining
+            elapsed = time.time() - start_time
+            gpu_time_left = max(0, gpu_timeout - elapsed)
+            progress_state["gpu_time_left"] = gpu_time_left
+
+            # Only update progress display during processing
+            progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
+
+        # Generate speech with progress tracking
+        audio_array, duration = model.generate_speech(
+            text,
+            voice_name,
+            speed,
+            progress_callback=update_progress
+        )
+
+        # Format output for Gradio
+        audio_output, duration_text = format_audio_output(audio_array)
+
+        # Calculate final metrics
+        total_time = time.time() - start_time
+        total_duration = len(audio_array) / 24000  # audio duration in seconds
+        final_rtf = total_time / total_duration if total_duration > 0 else 0
+
+        # Prepare final metrics display
+        metrics_text = (
+            f"Tokens/sec: {progress_state['tokens_per_sec']:.1f}\n" +
+            f"Real-time factor: {final_rtf:.2f}x (Processing Time / Audio Duration)\n" +
+            f"GPU Time Used: {int(total_time)}s of {gpu_timeout}s"
+        )
+
+        return (
+            audio_output,
+            metrics_text,
+            duration_text
+        )
     except Exception as e:
-        raise gr.Error(str(e))
+        raise gr.Error(f"Generation failed: {str(e)}")
 
 # Create Gradio interface
 with gr.Blocks(title="Kokoro TTS Demo") as demo:
     gr.HTML(
         """
+        <div style="display: flex; justify-content: flex-end; padding: 10px; gap: 10px;">
+            <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">
+                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-md-dark.svg" alt="Model on HF">
+            </a>
+            <a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="no-preference: light; light: light; dark: dark;" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Repo for Local Use</a>
+        </div>
         <div style="text-align: center; max-width: 800px; margin: 0 auto;">
            <h1>Kokoro TTS Demo</h1>
            <p>Convert text to natural-sounding speech using various voices.</p>
        </div>
+        <script async defer src="https://buttons.github.io/buttons.js"></script>
        """
     )
 
     with gr.Row():
-        with gr.Column(scale=3):
-            # Input components
+        # Column 1: Text Input
+        with gr.Column():
             text_input = gr.TextArea(
                 label="Text to speak",
-                placeholder="Enter text here...",
-                lines=3,
+                placeholder="Enter text here or upload a .txt file",
+                lines=10,
                 value=open("the_time_machine_hgwells.txt").read()[:1000]
             )
-            voice_dropdown = gr.Dropdown(
-                label="Voice",
-                choices=voice_list,
-                value=voice_list[0] if voice_list else None,
-                allow_custom_value=True  # Allow custom values to avoid warnings
+
+        # Column 2: Controls
+        with gr.Column():
+            file_input = gr.File(
+                label="Upload .txt file",
+                file_types=[".txt"],
+                type="binary"
             )
-            speed_slider = gr.Slider(
-                label="Speed",
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1
+
+            def load_text_from_file(file_bytes):
+                if file_bytes is None:
+                    return None
+                try:
+                    return file_bytes.decode('utf-8')
+                except Exception as e:
+                    raise gr.Error(f"Failed to read file: {str(e)}")
+
+            file_input.change(
+                fn=load_text_from_file,
+                inputs=[file_input],
+                outputs=[text_input]
             )
-            submit_btn = gr.Button("Generate Speech")
+
+            with gr.Group():
+                voice_dropdown = gr.Dropdown(
+                    label="Voice",
+                    choices=voice_list,
+                    value=voice_list[0] if voice_list else None,
+                    allow_custom_value=True
+                )
+                speed_slider = gr.Slider(
+                    label="Speed",
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1
+                )
+                submit_btn = gr.Button("Generate Speech", variant="primary")
 
-        with gr.Column(scale=2):
-            # Output components
+        # Column 3: Output
+        with gr.Column():
             audio_output = gr.Audio(
                 label="Generated Speech",
                 type="numpy",
                 format="wav",
                 autoplay=False
             )
+            progress_bar = gr.Progress(track_tqdm=False)
+            metrics_text = gr.Textbox(
+                label="Processing Metrics",
+                interactive=False,
+                lines=3
+            )
             duration_text = gr.Textbox(
                 label="Processing Info",
                 interactive=False,
-                lines=4
+                lines=2
             )
 
     # Set up event handler
     submit_btn.click(
         fn=generate_speech_from_ui,
         inputs=[text_input, voice_dropdown, speed_slider],
-        outputs=[audio_output, duration_text]
+        outputs=[audio_output, metrics_text, duration_text],
+        show_progress=True
     )
-
 
     # Add text analysis info
     with gr.Row():
         with gr.Column():
             gr.Markdown("""
             ### Demo Text Info
-            The preloaded text is from H.G. Wells' "The Time Machine" (Public Domain)
+            The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
             """)
+
 
 # Launch the app
 if __name__ == "__main__":
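
Note: the new `generate_speech_from_ui` hands a closure to `model.generate_speech` via the `progress_callback` keyword, and the model calls it back as `callback(chunk_num, total_chunks, tokens_per_sec, rtf)`. A minimal sketch of that contract outside Gradio — `fake_generate_speech` is a hypothetical stand-in for illustration only; just the callback signature matches the real code:

```python
import time

def fake_generate_speech(text, progress_callback=None):
    # Hypothetical stand-in: split into fixed-size "chunks" and pretend to synthesize
    chunks = [text[i:i + 10] for i in range(0, len(text), 10)]
    for i, chunk in enumerate(chunks):
        time.sleep(0.01)  # pretend to do GPU work
        if progress_callback:
            # tokens/sec and real-time factor are dummy values here
            progress_callback(i + 1, len(chunks), tokens_per_sec=42.0, rtf=0.1)

def print_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
    print(f"chunk {chunk_num}/{total_chunks} | {tokens_per_sec:.1f} tok/s | RTF {rtf:.2f}")

fake_generate_speech("some text to synthesize in chunks", print_progress)
```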
lib/__init__.py ADDED
@@ -0,0 +1,34 @@
+from .text_utils import normalize_text, chunk_text, count_tokens
+from .file_utils import (
+    load_module_from_file,
+    download_model_files,
+    list_voice_files,
+    download_voice_files,
+    ensure_dir
+)
+from .audio_utils import (
+    convert_float_to_int16,
+    get_audio_duration,
+    format_audio_output,
+    concatenate_audio_chunks
+)
+
+__all__ = [
+    # Text utilities
+    'normalize_text',
+    'chunk_text',
+    'count_tokens',
+
+    # File utilities
+    'load_module_from_file',
+    'download_model_files',
+    'list_voice_files',
+    'download_voice_files',
+    'ensure_dir',
+
+    # Audio utilities
+    'convert_float_to_int16',
+    'get_audio_duration',
+    'format_audio_output',
+    'concatenate_audio_chunks'
+]
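
Since `lib/__init__.py` re-exports all helpers, callers import from the package root. A small usage sketch, assuming the Space's root directory is on `sys.path` so `lib` is importable:

```python
import numpy as np
from lib import normalize_text, chunk_text, format_audio_output

text = normalize_text("  Hello world. This is a test!  ")
print(chunk_text(text, max_chars=20))  # ['Hello world.', 'This is a test!']

# format_audio_output expects float audio in [-1, 1] sampled at 24 kHz
silence = np.zeros(24000, dtype=np.float32)
(rate, pcm), info = format_audio_output(silence)
print(rate, pcm.dtype, info)  # 24000 int16 Audio Duration: 1.00 seconds
```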
lib/audio_utils.py ADDED
@@ -0,0 +1,23 @@
+import numpy as np
+from typing import Tuple
+
+def convert_float_to_int16(audio_array: np.ndarray) -> np.ndarray:
+    """Convert float audio array to int16 format"""
+    # Convert to float32 first to ensure proper scaling
+    audio_array = np.array(audio_array, dtype=np.float32)
+    # Scale to int16 range (-32768 to 32767)
+    return (audio_array * 32767).astype(np.int16)
+
+def get_audio_duration(audio_array: np.ndarray, sample_rate: int = 24000) -> float:
+    """Calculate duration of audio in seconds"""
+    return len(audio_array) / sample_rate
+
+def format_audio_output(audio_array: np.ndarray, sample_rate: int = 24000) -> Tuple[Tuple[int, np.ndarray], str]:
+    """Format audio array for Gradio output with duration info"""
+    audio_array = convert_float_to_int16(audio_array)
+    duration = get_audio_duration(audio_array, sample_rate)
+    return (sample_rate, audio_array), f"Audio Duration: {duration:.2f} seconds"
+
+def concatenate_audio_chunks(chunks: list[np.ndarray]) -> np.ndarray:
+    """Concatenate multiple audio chunks into a single array"""
+    return np.concatenate(chunks)
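
One caveat worth noting: `convert_float_to_int16` scales by 32767 without clipping, so samples outside [-1, 1] overflow the int16 range. A sketch demonstrating the issue and a clipped variant — `convert_float_to_int16_clipped` is a hypothetical addition, not part of this commit:

```python
import numpy as np

def convert_float_to_int16_clipped(audio_array: np.ndarray) -> np.ndarray:
    """Hypothetical clipped variant: clamp to [-1, 1] before scaling."""
    audio_array = np.clip(np.asarray(audio_array, dtype=np.float32), -1.0, 1.0)
    return (audio_array * 32767).astype(np.int16)

loud = np.array([0.5, 1.0, 1.2], dtype=np.float32)
print((loud * 32767).astype(np.int16))       # 1.2 overflows int16 (wraps on typical platforms)
print(convert_float_to_int16_clipped(loud))  # [16383 32767 32767]
```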
lib/file_utils.py ADDED
@@ -0,0 +1,101 @@
+import os
+import importlib.util
+import sys
+from huggingface_hub import hf_hub_download
+from typing import List, Optional
+
+def load_module_from_file(module_name: str, file_path: str):
+    """Load a Python module from file path"""
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+def download_model_files(repo_id: str, filenames: List[str], local_dir: Optional[str] = None) -> List[str]:
+    """Download multiple files from Hugging Face Hub"""
+    paths = []
+    for filename in filenames:
+        try:
+            path = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                local_dir=local_dir,
+                local_dir_use_symlinks=False
+            )
+            paths.append(path)
+        except Exception as e:
+            print(f"Error downloading {filename}: {str(e)}")
+            raise
+    return paths
+
+def ensure_dir(path: str) -> None:
+    """Ensure directory exists, create if it doesn't"""
+    os.makedirs(path, exist_ok=True)
+
+def list_voice_files(voices_dir: str) -> List[str]:
+    """List available voice files in directory"""
+    voices = []
+    try:
+        if not os.path.exists(voices_dir):
+            print(f"Voices directory does not exist: {voices_dir}")
+            return voices
+
+        files = os.listdir(voices_dir)
+        print(f"Found {len(files)} files in voices directory")
+
+        for file in files:
+            if file.endswith(".pt"):
+                voice_name = file[:-3]  # Remove .pt extension
+                print(f"Found voice: {voice_name}")
+                voices.append(voice_name)
+
+        if not voices:
+            print("No voice files found in voices directory")
+
+    except Exception as e:
+        print(f"Error listing voices: {str(e)}")
+        import traceback
+        traceback.print_exc()
+
+    return sorted(voices)
+
+def download_voice_files(repo_id: str, voices: List[str], voices_dir: str) -> None:
+    """Download voice files from Hugging Face Hub"""
+    ensure_dir(voices_dir)
+
+    for voice in voices:
+        try:
+            voice_path = os.path.join(voices_dir, voice)
+            print(f"Attempting to download voice {voice} to {voice_path}")
+
+            try:
+                downloaded_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=f"voices/{voice}",
+                    local_dir=voices_dir,
+                    local_dir_use_symlinks=False,
+                    force_filename=voice
+                )
+                print(f"Download completed to: {downloaded_path}")
+
+                if not os.path.exists(voice_path):
+                    print(f"Warning: File not found at expected path {voice_path}")
+                    print(f"Checking download location: {downloaded_path}")
+                    if os.path.exists(downloaded_path):
+                        print(f"Moving file from {downloaded_path} to {voice_path}")
+                        os.rename(downloaded_path, voice_path)
+                else:
+                    print(f"Verified voice file exists: {voice_path}")
+
+            except Exception as e:
+                print(f"Error downloading voice {voice}: {str(e)}")
+                import traceback
+                traceback.print_exc()
+
+        except Exception as e:
+            print(f"Error downloading voice {voice}: {str(e)}")
+            import traceback
+            traceback.print_exc()
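
`load_module_from_file` is the standard `importlib` recipe; registering the module in `sys.modules` lets the dynamically loaded files import each other by name. A self-contained sketch of the same pattern against a throwaway file:

```python
import importlib.util
import os
import sys
import tempfile

# Write a throwaway module to disk, then load it the same way
# load_module_from_file does above.
code = "def greet(name):\n    return f'hello {name}'\n"
path = os.path.join(tempfile.mkdtemp(), "demo_mod.py")
with open(path, "w") as f:
    f.write(code)

spec = importlib.util.spec_from_file_location("demo_mod", path)
module = importlib.util.module_from_spec(spec)
sys.modules["demo_mod"] = module  # register so later imports find it
spec.loader.exec_module(module)

print(module.greet("kokoro"))  # hello kokoro
```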
lib/text_utils.py ADDED
@@ -0,0 +1,56 @@
+import tiktoken
+
+def normalize_text(text: str) -> str:
+    """Normalize text for TTS processing"""
+    if not text:
+        return ""
+    # Basic normalization - can be expanded based on needs
+    return text.strip()
+
+def chunk_text(text: str, max_chars: int = 300) -> list[str]:
+    """Break text into chunks at natural boundaries"""
+    chunks = []
+    current_chunk = ""
+
+    # Split on sentence boundaries first
+    sentences = text.replace(".", ".|").replace("!", "!|").replace("?", "?|").replace(";", ";|").split("|")
+
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+
+        # If sentence is already too long, break on commas
+        if len(sentence) > max_chars:
+            parts = sentence.split(",")
+            for part in parts:
+                if len(current_chunk) + len(part) <= max_chars:
+                    current_chunk += part + ","
+                else:
+                    # If part is still too long, break on whitespace
+                    if len(part) > max_chars:
+                        words = part.split()
+                        for word in words:
+                            if len(current_chunk) + len(word) > max_chars:
+                                chunks.append(current_chunk.strip())
+                                current_chunk = word + " "
+                            else:
+                                current_chunk += word + " "
+                    else:
+                        chunks.append(current_chunk.strip())
+                        current_chunk = part + ","
+        else:
+            if len(current_chunk) + len(sentence) <= max_chars:
+                current_chunk += sentence
+            else:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+def count_tokens(text: str) -> int:
+    """Count tokens in text using tiktoken"""
+    enc = tiktoken.get_encoding("cl100k_base")
+    return len(enc.encode(text))
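
A quick sanity check of `chunk_text` on a short passage (assuming the `lib` package above is importable); chunk boundaries depend on the `max_chars` setting:

```python
from lib.text_utils import chunk_text

sample = (
    "The Time Traveller was expounding a recondite matter. "
    "His grey eyes shone and twinkled! And his usually pale face was flushed; "
    "the fire burned brightly."
)
for i, chunk in enumerate(chunk_text(sample, max_chars=80), start=1):
    print(f"{i}: ({len(chunk)} chars) {chunk}")
```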
requirements.txt CHANGED
@@ -9,4 +9,4 @@ regex==2024.11.6
 tiktoken==0.8.0
 transformers==4.47.1
 munch==4.0.0
-
+matplotlib==3.4.3
tts_model.py CHANGED
@@ -1,122 +1,61 @@
 import os
-import io
-import spaces
 import torch
 import numpy as np
 import time
-import tiktoken
-import scipy.io.wavfile as wavfile
-from huggingface_hub import hf_hub_download
-import importlib.util
-import sys
-
-def load_module_from_file(module_name, file_path):
-    """Load a Python module from file path"""
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Cannot load module {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)
-    return module
-
-# Download and load required Python modules
-py_modules = ["istftnet", "plbert", "models"]
-for py_module in py_modules:
-    path = hf_hub_download(repo_id="hexgrad/Kokoro-82M", filename=f"{py_module}.py")
-    load_module_from_file(py_module, path)
-
-# Load the kokoro module
-kokoro_path = hf_hub_download(repo_id="hexgrad/Kokoro-82M", filename="kokoro.py")
-kokoro = load_module_from_file("kokoro", kokoro_path)
-
-# Import required functions
-generate = kokoro.generate
-normalize_text = kokoro.normalize_text
-models = sys.modules['models']
-build_model = models.build_model
-
-# Set HF_HOME for faster restarts
-os.environ["HF_HOME"] = "/data/.huggingface"
+import matplotlib.pyplot as plt
+from typing import Tuple, List
+from statistics import mean, median, stdev
+from lib import (
+    normalize_text,
+    chunk_text,
+    count_tokens,
+    load_module_from_file,
+    download_model_files,
+    list_voice_files,
+    download_voice_files,
+    ensure_dir,
+    concatenate_audio_chunks
+)
 
 class TTSModel:
-    """Self-contained TTS model manager for Hugging Face Spaces"""
+    """GPU-accelerated TTS model manager"""
 
     def __init__(self):
         self.model = None
         self.voices_dir = "voices"
         self.model_repo = "hexgrad/Kokoro-82M"
-        os.makedirs(self.voices_dir, exist_ok=True)
+        ensure_dir(self.voices_dir)
 
-    def initialize(self):
+        # Load required modules
+        py_modules = ["istftnet", "plbert", "models", "kokoro"]
+        module_files = download_model_files(self.model_repo, [f"{m}.py" for m in py_modules])
+
+        for module_name, file_path in zip(py_modules, module_files):
+            load_module_from_file(module_name, file_path)
+
+        # Import required functions from kokoro module
+        kokoro = __import__("kokoro")
+        self.generate = kokoro.generate
+        self.build_model = __import__("models").build_model
+
+    def initialize(self) -> bool:
         """Initialize model and download voices"""
         try:
             print("Initializing model...")
 
-            # Download model and config
-            model_path = hf_hub_download(
-                repo_id=self.model_repo,
-                filename="kokoro-v0_19.pth"
-            )
-            config_path = hf_hub_download(
-                repo_id=self.model_repo,
-                filename="config.json"
+            # Download model files
+            model_files = download_model_files(
+                self.model_repo,
+                ["kokoro-v0_19.pth", "config.json"]
             )
+            model_path = model_files[0]  # kokoro-v0_19.pth
 
-            # Build model directly on GPU if available
+            # Build model directly on GPU
             with torch.cuda.device(0):
                 torch.cuda.set_device(0)
-                self.model = build_model(model_path, 'cuda')
+                self.model = self.build_model(model_path, 'cuda')
                 self._model_on_gpu = True
 
-            # Download all available voices
-            voices = [
-                "af_bella.pt", "af_nicole.pt", "af_sarah.pt", "af_sky.pt", "af.pt",
-                "am_adam.pt", "am_michael.pt",
-                "bf_emma.pt", "bf_isabella.pt",
-                "bm_george.pt", "bm_lewis.pt"
-            ]
-            for voice in voices:
-                try:
-                    # Create full destination path
-                    voice_path = os.path.join(self.voices_dir, voice)
-                    print(f"Attempting to download voice {voice} to {voice_path}")
-
-                    # Ensure directory exists
-                    os.makedirs(self.voices_dir, exist_ok=True)
-
-                    # Download with explicit destination
-                    try:
-                        downloaded_path = hf_hub_download(
-                            repo_id=self.model_repo,
-                            filename=f"voices/{voice}",
-                            local_dir=self.voices_dir,
-                            local_dir_use_symlinks=False,
-                            force_filename=voice
-                        )
-                        print(f"Download completed to: {downloaded_path}")
-
-                        # Verify file exists
-                        if not os.path.exists(voice_path):
-                            print(f"Warning: File not found at expected path {voice_path}")
-                            print(f"Checking download location: {downloaded_path}")
-                            if os.path.exists(downloaded_path):
-                                print(f"Moving file from {downloaded_path} to {voice_path}")
-                                os.rename(downloaded_path, voice_path)
-                        else:
-                            print(f"Verified voice file exists: {voice_path}")
-
-                    except Exception as e:
-                        print(f"Error downloading voice {voice}: {str(e)}")
-                        import traceback
-                        traceback.print_exc()
-
-                except Exception as e:
-                    print(f"Error downloading voice {voice}: {str(e)}")
-                    import traceback
-                    traceback.print_exc()
-
             print("Model initialization complete")
             return True
 
@@ -124,46 +63,35 @@ class TTSModel:
             print(f"Error initializing model: {str(e)}")
             return False
 
-    def list_voices(self):
-        """List available voices"""
-        voices = []
+    def ensure_voice_downloaded(self, voice_name: str) -> bool:
+        """Ensure specific voice is downloaded"""
         try:
-            # Verify voices directory exists
-            if not os.path.exists(self.voices_dir):
-                print(f"Voices directory does not exist: {self.voices_dir}")
-                return voices
-
-            # Get list of files
-            files = os.listdir(self.voices_dir)
-            print(f"Found {len(files)} files in voices directory")
-
-            # Filter for .pt files
-            for file in files:
-                if file.endswith(".pt"):
-                    voices.append(file[:-3])  # Remove .pt extension
-                    print(f"Found voice: {file[:-3]}")
-
-            if not voices:
-                print("No voice files found in voices directory")
-
+            voice_path = os.path.join(self.voices_dir, f"{voice_name}.pt")
+            if not os.path.exists(voice_path):
+                print(f"Downloading voice {voice_name}.pt...")
+                download_voice_files(self.model_repo, [f"{voice_name}.pt"], self.voices_dir)
+            return True
         except Exception as e:
-            print(f"Error listing voices: {str(e)}")
-            import traceback
-            traceback.print_exc()
-
-        return sorted(voices)
+            print(f"Error downloading voice {voice_name}: {str(e)}")
+            return False
+
+    def list_voices(self) -> List[str]:
+        """List available voices"""
+        return [
+            "af_bella", "af_nicole", "af_sarah", "af_sky", "af",
+            "am_adam", "am_michael", "bf_emma", "bf_isabella",
+            "bm_george", "bm_lewis"
+        ]
 
-    def _ensure_model_on_gpu(self):
+    def _ensure_model_on_gpu(self) -> None:
         """Ensure model is on GPU and stays there"""
         if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
             print("Moving model to GPU...")
             with torch.cuda.device(0):
                 torch.cuda.set_device(0)
-                # Move model to GPU using torch.nn.Module method
                 if hasattr(self.model, 'to'):
                     self.model.to('cuda')
                 else:
-                    # Fallback for Munch object - move parameters individually
                    for name in self.model:
                        if isinstance(self.model[name], torch.Tensor):
                            self.model[name] = self.model[name].cuda()
@@ -190,7 +118,7 @@ class TTSModel:
                 voicepack = voicepack.cuda()
 
             # Run generation with everything on GPU
-            audio, _ = generate(
+            audio, _ = self.generate(
                 self.model,
                 text,
                 voicepack,
@@ -203,63 +131,24 @@ class TTSModel:
         except Exception as e:
             print(f"Error in audio generation: {str(e)}")
             raise e
-
-    def chunk_text(self, text: str, max_chars: int = 300) -> list[str]:
-        """Break text into chunks at natural boundaries"""
-        chunks = []
-        current_chunk = ""
-
-        # Split on sentence boundaries first
-        sentences = text.replace(".", ".|").replace("!", "!|").replace("?", "?|").replace(";", ";|").split("|")
-
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-
-            # If sentence is already too long, break on commas
-            if len(sentence) > max_chars:
-                parts = sentence.split(",")
-                for part in parts:
-                    if len(current_chunk) + len(part) <= max_chars:
-                        current_chunk += part + ","
-                    else:
-                        # If part is still too long, break on whitespace
-                        if len(part) > max_chars:
-                            words = part.split()
-                            for word in words:
-                                if len(current_chunk) + len(word) > max_chars:
-                                    chunks.append(current_chunk.strip())
-                                    current_chunk = word + " "
-                                else:
-                                    current_chunk += word + " "
-                        else:
-                            chunks.append(current_chunk.strip())
-                            current_chunk = part + ","
-            else:
-                if len(current_chunk) + len(sentence) <= max_chars:
-                    current_chunk += sentence
-                else:
-                    chunks.append(current_chunk.strip())
-                    current_chunk = sentence
-
-        if current_chunk:
-            chunks.append(current_chunk.strip())
-
-        return chunks
 
-    def generate_speech(self, text: str, voice_name: str, speed: float = 1.0) -> tuple[np.ndarray, float]:
-        """Generate speech from text. Returns (audio_array, duration)"""
+    def generate_speech(self, text: str, voice_name: str, speed: float = 1.0, progress_callback=None) -> Tuple[np.ndarray, float]:
+        """Generate speech from text. Returns (audio_array, duration)
+
+        Args:
+            text: Input text to convert to speech
+            voice_name: Name of voice to use
+            speed: Speech speed multiplier
+            progress_callback: Optional callback function(chunk_num, total_chunks, tokens_per_sec, rtf)
+        """
         try:
             if not text or not voice_name:
                 raise ValueError("Text and voice name are required")
 
             start_time = time.time()
 
-            # Initialize tokenizer
-            enc = tiktoken.get_encoding("cl100k_base")
-            total_tokens = len(enc.encode(text))
-
-            # Normalize text
+            # Count tokens and normalize text
+            total_tokens = count_tokens(text)
             text = normalize_text(text)
             if not text:
                 raise ValueError("Text is empty after normalization")
@@ -269,49 +158,158 @@ class TTSModel:
                 torch.cuda.set_device(0)
 
                 voice_path = os.path.join(self.voices_dir, f"{voice_name}.pt")
-                if not os.path.exists(voice_path):
-                    raise ValueError(f"Voice not found: {voice_name}")
 
-                # Load voice directly to GPU
+                # Ensure voice is downloaded and load directly to GPU
+                if not self.ensure_voice_downloaded(voice_name):
+                    raise ValueError(f"Failed to download voice: {voice_name}")
                 voicepack = torch.load(voice_path, map_location='cuda', weights_only=True)
 
                 # Break text into chunks for better memory management
-                chunks = self.chunk_text(text)
+                chunks = chunk_text(text)
                 print(f"Processing {len(chunks)} chunks...")
 
-                # Ensure model is initialized and on GPU
-                if self.model is None:
-                    print("Model not initialized, reinitializing...")
-                    if not self.initialize():
-                        raise ValueError("Failed to initialize model")
+                # Ensure model is initialized and on GPU
+                if self.model is None:
+                    print("Model not initialized, reinitializing...")
+                    if not self.initialize():
+                        raise ValueError("Failed to initialize model")
 
-                # Move model to GPU if needed
-                if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
-                    print("Moving model to GPU...")
-                    if hasattr(self.model, 'to'):
-                        self.model.to('cuda')
-                    else:
-                        for name in self.model:
-                            if isinstance(self.model[name], torch.Tensor):
-                                self.model[name] = self.model[name].cuda()
-                    self._model_on_gpu = True
+                # Move model to GPU if needed
+                if not hasattr(self, '_model_on_gpu') or not self._model_on_gpu:
+                    print("Moving model to GPU...")
+                    if hasattr(self.model, 'to'):
+                        self.model.to('cuda')
+                    else:
+                        for name in self.model:
+                            if isinstance(self.model[name], torch.Tensor):
+                                self.model[name] = self.model[name].cuda()
+                    self._model_on_gpu = True
 
-                # Process all chunks within same GPU context
-                audio_chunks = []
-                for i, chunk in enumerate(chunks):
-                    chunk_start = time.time()
-                    chunk_audio = self._generate_audio(
-                        text=chunk,
-                        voicepack=voicepack,
-                        lang=voice_name[0],
-                        speed=speed
-                    )
-                    chunk_time = time.time() - chunk_start
-                    print(f"Chunk {i+1}/{len(chunks)} processed in {chunk_time:.2f}s")
-                    audio_chunks.append(chunk_audio)
+                # Process all chunks within same GPU context
+                audio_chunks = []
+                chunk_times = []
+                chunk_sizes = []  # Store chunk lengths
+                total_processed_tokens = 0
+                total_processed_time = 0
+
+                for i, chunk in enumerate(chunks):
+                    chunk_start = time.time()
+                    chunk_audio = self._generate_audio(
+                        text=chunk,
+                        voicepack=voicepack,
+                        lang=voice_name[0],
+                        speed=speed
+                    )
+                    chunk_time = time.time() - chunk_start
+
+                    # Update metrics
+                    chunk_tokens = count_tokens(chunk)
+                    total_processed_tokens += chunk_tokens
+                    total_processed_time += chunk_time
+                    current_tokens_per_sec = total_processed_tokens / total_processed_time
+
+                    # Calculate processing speed metrics
+                    chunk_duration = len(chunk_audio) / 24000  # audio duration in seconds
+                    rtf = chunk_time / chunk_duration
+                    times_faster = 1 / rtf
+
+                    chunk_times.append(chunk_time)
+                    chunk_sizes.append(len(chunk))
+                    print(f"Chunk {i+1}/{len(chunks)} processed in {chunk_time:.2f}s")
+                    print(f"Current tokens/sec: {current_tokens_per_sec:.2f}")
+                    print(f"Real-time factor: {rtf:.2f}x")
+                    print(f"{times_faster:.1f}x faster than real-time")
+
+                    audio_chunks.append(chunk_audio)
+
+                    # Call progress callback if provided
+                    if progress_callback:
+                        progress_callback(i + 1, len(chunks), current_tokens_per_sec, rtf)
 
                 # Concatenate audio chunks
-                audio = np.concatenate(audio_chunks)
+                audio = concatenate_audio_chunks(audio_chunks)
+
+                def setup_plot(fig, ax, title):
+                    """Configure plot styling"""
+                    # Improve grid
+                    ax.grid(True, linestyle="--", alpha=0.3, color="#ffffff")
+
+                    # Set title and labels with better fonts and more padding
+                    ax.set_title(title, pad=40, fontsize=16, fontweight="bold", color="#ffffff")
+                    ax.set_xlabel(ax.get_xlabel(), fontsize=14, fontweight="medium", color="#ffffff")
+                    ax.set_ylabel(ax.get_ylabel(), fontsize=14, fontweight="medium", color="#ffffff")
+
+                    # Improve tick labels
+                    ax.tick_params(labelsize=12, colors="#ffffff")
+
+                    # Style spines
+                    for spine in ax.spines.values():
+                        spine.set_color("#ffffff")
+                        spine.set_alpha(0.3)
+                        spine.set_linewidth(0.5)
+
+                    # Set background colors
+                    ax.set_facecolor("#1a1a2e")
+                    fig.patch.set_facecolor("#1a1a2e")
+
+                    return fig, ax
+
+                # Set dark style
+                plt.style.use("dark_background")
+
+                # Create figure with subplots
+                fig = plt.figure(figsize=(18, 16))
+                fig.patch.set_facecolor("#1a1a2e")
+
+                # Create subplot grid
+                gs = plt.GridSpec(2, 1, left=0.15, right=0.85, top=0.9, bottom=0.15, hspace=0.4)
+
+                # Processing times plot
+                ax1 = plt.subplot(gs[0])
+                chunks_x = list(range(1, len(chunks) + 1))
+                bars = ax1.bar(chunks_x, chunk_times, color='#ff2a6d', alpha=0.8)
+
+                # Add statistics lines
+                mean_time = mean(chunk_times)
+                median_time = median(chunk_times)
+                std_time = stdev(chunk_times) if len(chunk_times) > 1 else 0
+
+                ax1.axhline(y=mean_time, color='#05d9e8', linestyle='--',
+                            label=f'Mean: {mean_time:.2f}s')
+                ax1.axhline(y=median_time, color='#d1f7ff', linestyle=':',
+                            label=f'Median: {median_time:.2f}s')
+
+                # Add ±1 std dev range
+                if len(chunk_times) > 1:
+                    ax1.axhspan(mean_time - std_time, mean_time + std_time,
+                                color='#8c1eff', alpha=0.2, label='±1 Std Dev')
+
+                # Add value labels on top of bars
+                for bar in bars:
+                    height = bar.get_height()
+                    ax1.text(bar.get_x() + bar.get_width() / 2.0,
+                             height,
+                             f'{height:.2f}s',
+                             ha='center',
+                             va='bottom',
+                             color='white',
+                             fontsize=10)
+
+                ax1.set_xlabel('Chunk Number')
+                ax1.set_ylabel('Processing Time (seconds)')
+                setup_plot(fig, ax1, 'Chunk Processing Times')
+                ax1.legend(facecolor="#1a1a2e", edgecolor="#ffffff")
+
+                # Chunk sizes plot
+                ax2 = plt.subplot(gs[1])
+                ax2.plot(chunks_x, chunk_sizes, color='#ff9e00', marker='o', linewidth=2)
+                ax2.set_xlabel('Chunk Number')
+                ax2.set_ylabel('Chunk Size (chars)')
+                setup_plot(fig, ax2, 'Chunk Sizes')
+
+                # Save plot
+                plt.savefig('chunk_times.png')
+                plt.close()
 
                 # Calculate metrics
                 total_time = time.time() - start_time
@@ -321,6 +319,11 @@ class TTSModel:
                 print(f"Total tokens: {total_tokens}")
                 print(f"Total time: {total_time:.2f}s")
                 print(f"Tokens per second: {tokens_per_second:.2f}")
+                print(f"Mean chunk time: {mean_time:.2f}s")
+                print(f"Median chunk time: {median_time:.2f}s")
+                if len(chunk_times) > 1:
+                    print(f"Std dev: {std_time:.2f}s")
+                print(f"\nChunk time plot saved as 'chunk_times.png'")
 
                 return audio, len(audio) / 24000  # Return audio array and duration
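
For reference, the metrics printed above are simple ratios over wall-clock time and audio length; a worked example with made-up numbers:

```python
# Worked example of the metrics computed in generate_speech (values are made up).
chunk_time = 0.8             # seconds spent synthesizing one chunk
chunk_audio_samples = 48000  # samples produced at 24 kHz

chunk_duration = chunk_audio_samples / 24000  # 2.0 s of audio
rtf = chunk_time / chunk_duration             # 0.4: processing seconds per audio second
times_faster = 1 / rtf                        # 2.5x faster than real-time

print(f"RTF {rtf:.2f}x, {times_faster:.1f}x faster than real-time")
```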