Spaces:

aarishshahmohsin
/

iit_roorkee_tts_final

Sleeping

App Files Files Community

aarishshahmohsin committed on Oct 22, 2024

Commit

61db051

1 Parent(s): d2ac162

done

Browse files

Files changed (2) hide show

app.py +95 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import gradio as gr
+import librosa
+import numpy as np
+import torch
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+# Model configurations
+models = {
+    "Urdu Model": {
+        "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
+        "vocoder": "microsoft/speecht5_hifigan",
+        "processor": "aarishshahmohsin/urdu_processor_t5",
+    },
+    "Technical Model": {
+        "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
+        "vocoder": "microsoft/speecht5_hifigan",
+        "processor": "microsoft/speecht5_tts",  # Using same checkpoint for processor
+    }
+}
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+# Initialize all models at startup
+print("Loading models...")
+loaded_models = {}
+for model_name, config in models.items():
+    processor = SpeechT5Processor.from_pretrained(config["processor"])
+    model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
+    vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
+    loaded_models[model_name] = {
+        "processor": processor,
+        "model": model,
+        "vocoder": vocoder
+    }
+print("Models loaded successfully!")
+def predict(text, model_name):
+    if len(text.strip()) == 0:
+        return (16000, np.zeros(0).astype(np.int16))
+    model_components = loaded_models[model_name]
+    processor = model_components["processor"]
+    model = model_components["model"]
+    vocoder = model_components["vocoder"]
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+    speech = (speech.numpy() * 32767).astype(np.int16)
+    return (16000, speech)
+# UI Configuration
+title = "Multi-Model SpeechT5 Demo"
+examples = [
+    # Urdu Model Examples
+    ["میں نے آج بہت کام کیا۔", "Urdu Model"],
+    ["آپ کا دن کیسا گزرا؟", "Urdu Model"],
+    # Technical Model Examples
+    ["JSON response with HTTP status code 200.", "Technical Model"],
+    ["Nginx is the best", "Technical Model"],
+]
+description = """
+Select a model and enter text to generate speech.
+1. Regional Language(Urdu)
+2. Technical Speech
+"""
+# Create and launch the interface
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Text(label="Input Text"),
+        gr.Dropdown(
+            choices=list(models.keys()),
+            label="Select Model",
+            value="Technical Model"
+        )
+    ],
+    outputs=[
+        gr.Audio(label="Generated Speech", type="numpy"),
+    ],
+    title=title,
+    description=description,
+    examples=examples,  # Add examples to the interface
+    cache_examples=True,
+).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers
+datasets
+librosa
+torch
+numpy