aarishshahmohsin committed
Commit 61db051 · 1 Parent(s): d2ac162
Files changed (2)
  1. app.py +95 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,95 @@
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import torch
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # Model configurations
+ models = {
+     "Urdu Model": {
+         "checkpoint": "aarishshahmohsin/final_urdu_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "aarishshahmohsin/urdu_processor_t5",
+     },
+     "Technical Model": {
+         "checkpoint": "aarishshahmohsin/final_technical_terms_t5_finetuned",
+         "vocoder": "microsoft/speecht5_hifigan",
+         "processor": "microsoft/speecht5_tts",  # Base SpeechT5 processor, not the fine-tuned checkpoint
+     }
+ }
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+
+ # Initialize all models at startup
+ print("Loading models...")
+ loaded_models = {}
+ for model_name, config in models.items():
+     processor = SpeechT5Processor.from_pretrained(config["processor"])
+     model = SpeechT5ForTextToSpeech.from_pretrained(config["checkpoint"])
+     vocoder = SpeechT5HifiGan.from_pretrained(config["vocoder"])
+
+     loaded_models[model_name] = {
+         "processor": processor,
+         "model": model,
+         "vocoder": vocoder
+     }
+ print("Models loaded successfully!")
+
+ def predict(text, model_name):
+     if len(text.strip()) == 0:
+         return (16000, np.zeros(0).astype(np.int16))
+
+     model_components = loaded_models[model_name]
+     processor = model_components["processor"]
+     model = model_components["model"]
+     vocoder = model_components["vocoder"]
+
+     inputs = processor(text=text, return_tensors="pt")
+     speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
+     speech = (speech.numpy() * 32767).astype(np.int16)  # float waveform -> 16-bit PCM
+
+     return (16000, speech)
+
+ # UI Configuration
+ title = "Multi-Model SpeechT5 Demo"
+
+ examples = [
+     # Urdu Model Examples
+     ["میں نے آج بہت کام کیا۔", "Urdu Model"],  # "I did a lot of work today."
+     ["آپ کا دن کیسا گزرا؟", "Urdu Model"],  # "How was your day?"
+
+     # Technical Model Examples
+     ["JSON response with HTTP status code 200.", "Technical Model"],
+     ["Nginx is the best", "Technical Model"],
+ ]
+
+ description = """
+ Select a model and enter text to generate speech.
+
+ 1. Regional Language (Urdu)
+ 2. Technical Speech
+
+ """
+
+ # Create and launch the interface
+ gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Text(label="Input Text"),
+         gr.Dropdown(
+             choices=list(models.keys()),
+             label="Select Model",
+             value="Technical Model"
+         )
+     ],
+     outputs=[
+         gr.Audio(label="Generated Speech", type="numpy"),
+     ],
+     title=title,
+     description=description,
+     examples=examples,  # Add examples to the interface
+     cache_examples=True,
+ ).launch()
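
As a quick sanity check outside the Gradio UI, predict() can be called directly and its output written to a WAV file. This is a minimal sketch, assuming the function and the loaded models above are already in scope, and that soundfile is available as an extra dependency (it is not listed in requirements.txt):

    import soundfile as sf  # assumed extra dependency, not in requirements.txt

    # predict() returns (sample_rate, int16_waveform); the SpeechT5 HiFi-GAN vocoder runs at 16 kHz
    sample_rate, audio = predict("JSON response with HTTP status code 200.", "Technical Model")
    sf.write("technical_demo.wav", audio, sample_rate)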
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ datasets
+ librosa
+ torch
+ numpy
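
Note that gradio itself is not listed; a Hugging Face Space supplies it via the Space's SDK setting, but a local run needs it installed explicitly. A minimal local setup might look like this (assumed commands, plain pip environment):

    pip install gradio -r requirements.txt
    python app.py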