# Multi-model SpeechT5 text-to-speech demo (Gradio app).
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# Model configurations.
# Maps a human-readable model name to the Hugging Face Hub repo ids for its
# text-to-speech checkpoint, vocoder, and text processor.
_HIFIGAN_VOCODER = "microsoft/speecht5_hifigan"

models = {
    "Urdu Model": {
        "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
        "vocoder": _HIFIGAN_VOCODER,
        "processor": "aarishshahmohsin/urdu_processor_t5",
    },
    "Technical Model": {
        "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
        "vocoder": _HIFIGAN_VOCODER,
        # Reuses the base microsoft/speecht5_tts processor rather than a
        # fine-tuned one.
        "processor": "microsoft/speecht5_tts",
    },
}
# Fixed speaker embedding used for every synthesis request: one x-vector from
# the CMU ARCTIC x-vector dataset, shaped (1, embedding_dim) after unsqueeze.
# NOTE(review): index 7306 presumably picks a specific speaker's voice from
# the validation split — confirm against the dataset card if the voice matters.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# Initialize all models at startup
print("Loading models...")

def _load_components(config):
    """Download/load the processor, acoustic model, and vocoder for one entry."""
    return {
        "processor": SpeechT5Processor.from_pretrained(config["processor"]),
        "model": SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"]),
        "vocoder": SpeechT5HifiGan.from_pretrained(config["vocoder"]),
    }

# One entry per configured model, keyed like `models`.
loaded_models = {name: _load_components(cfg) for name, cfg in models.items()}
print("Models loaded successfully!")
def predict(text, model_name):
    """Synthesize speech for *text* with the selected SpeechT5 model.

    Parameters
    ----------
    text : str or None
        Input text. Blank, whitespace-only, or None input yields empty audio.
    model_name : str
        Key into ``loaded_models`` (one of the ``models`` dict keys).

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(sample_rate, waveform)`` — 16 kHz int16 samples, the format
        expected by ``gr.Audio(type="numpy")``.
    """
    # Guard: Gradio may pass None or whitespace; skip inference entirely.
    if not text or not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    components = loaded_models[model_name]
    processor = components["processor"]
    model = components["model"]
    vocoder = components["vocoder"]

    inputs = processor(text=text, return_tensors="pt")
    # Inference only — avoid building an autograd graph.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embedding, vocoder=vocoder
        )

    # Clip to [-1, 1] before scaling so loud samples cannot overflow int16.
    waveform = np.clip(speech.cpu().numpy(), -1.0, 1.0)
    return (16000, (waveform * 32767).astype(np.int16))
# UI Configuration
title = "Multi-Model SpeechT5 Demo"

# Example rows for the interface: [input text, model dropdown choice].
_urdu_examples = [
    ["میں نے آج بہت کام کیا۔", "Urdu Model"],
    ["آپ کا دن کیسا گزرا؟", "Urdu Model"],
]
_technical_examples = [
    ["JSON response with HTTP status code 200.", "Technical Model"],
    ["Nginx is the best", "Technical Model"],
]
examples = _urdu_examples + _technical_examples

# Markdown shown beneath the title in the Gradio UI.
description = """
Select a model and enter text to generate speech.
1. Regional Language(Urdu)
2. Technical Speech
"""
# Create the interface.  (Original line ended with a stray "|" token — an
# extraction artifact that was a syntax error — removed here.)
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Dropdown(
            choices=list(models.keys()),
            label="Select Model",
            value="Technical Model",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    # NOTE(review): cache_examples=True runs every example through the models
    # at startup to pre-render audio; set False if launch time matters.
    cache_examples=True,
)

# Launch only when run as a script; hosts (e.g. HF Spaces) that import this
# module can serve `demo` directly.
if __name__ == "__main__":
    demo.launch()