import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
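
# This Gradio demo serves two fine-tuned SpeechT5 text-to-speech models (Urdu and
# technical English terms), sharing one HiFi-GAN vocoder and one speaker embedding.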

# Model configurations
models = {
    "Urdu Model": {
        "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
        "vocoder": "microsoft/speecht5_hifigan",
        "processor": "aarishshahmohsin/urdu_processor_t5",
    },
    "Technical Model": {
        "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
        "vocoder": "microsoft/speecht5_hifigan",
        "processor": "microsoft/speecht5_tts",  # reuse the base SpeechT5 processor (tokenizer + feature extractor)
    }
}

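# A single x-vector from the CMU ARCTIC dataset serves as the speaker embedding;
# SpeechT5 conditions the generated voice on this vector for every request.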
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


# Initialize all models at startup
print("Loading models...")
loaded_models = {}
for model_name, config in models.items():
    processor = SpeechT5Processor.from_pretrained(config["processor"])
    model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
    vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
    
    loaded_models[model_name] = {
        "processor": processor,
        "model": model,
        "vocoder": vocoder
    }
print("Models loaded successfully!")

def predict(text, model_name):
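    # Return an empty 16 kHz clip when no text is given so the Audio output stays valid.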
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))
    
    model_components = loaded_models[model_name]
    processor = model_components["processor"]
    model = model_components["model"]
    vocoder = model_components["vocoder"]

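    # Tokenize the text, let SpeechT5 generate a mel spectrogram, and have the
    # HiFi-GAN vocoder turn it into a 16 kHz waveform.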
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
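    # generate_speech returns a float waveform in [-1, 1]; scale it to 16-bit PCM
    # for the (sample_rate, samples) tuple returned to the Gradio Audio component.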
    speech = (speech.numpy() * 32767).astype(np.int16)
    
    return (16000, speech)

# UI Configuration
title = "Multi-Model SpeechT5 Demo"

examples = [
    # Urdu Model Examples
    ["میں نے آج بہت کام کیا۔", "Urdu Model"],
    ["آپ کا دن کیسا گزرا؟", "Urdu Model"],
    
    # Technical Model Examples
    ["JSON response with HTTP status code 200.", "Technical Model"],
    ["Nginx is the best", "Technical Model"],
]

description = """
Select a model and enter text to generate speech. 

1. Regional Language (Urdu)
2. Technical Speech

"""

# Create and launch the interface
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Dropdown(
            choices=list(models.keys()),
            label="Select Model",
            value="Technical Model"
        )
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,  # Add examples to the interface
    cache_examples=True, 
).launch()