Tejas1206 committed on
Commit
049c446
·
1 Parent(s): 32e99af
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
+
8
+
9
# Hugging Face Hub ids of the three pretrained pieces the app needs.
# The acoustic model id ends in "_ta" -- presumably a Tamil fine-tune; the
# processor and vocoder are the stock Microsoft SpeechT5 checkpoints.
_PROCESSOR_CHECKPOINT = "microsoft/speecht5_tts"
_TTS_CHECKPOINT = "tejas1206/speecht5_tts_ta"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Loaded once at import time so every request reuses the same objects.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)


# Three-letter speaker code -> path of the saved speaker-embedding array
# (relative to the working directory). predict() reads these with np.load.
speaker_embeddings = dict(
    BDL="speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    CLB="speaker/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    KSP="speaker/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    RMS="speaker/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SLT="speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)
21
+
22
# Ordered (source, replacement) pairs applied one after another by
# convert_text(). The order is kept exactly as originally written.
_CHAR_REPLACEMENTS = (
    # --- ASCII / Latin-1 punctuation and accented Latin letters ---
    (' ', ' '),      # space (maps to itself)
    ('&', 'and'),    # ampersand
    ('_', '_'),      # underscore (maps to itself)
    ('`', '`'),      # backtick (maps to itself)
    ('·', '.'),      # middle dot
    ('á', 'a'),      # a with acute
    ('ô', 'o'),      # o with circumflex
    ('š', 's'),      # s with caron
    # --- Tamil aytham and independent vowels ---
    ('ஃ', 'akh'),    # aytham (visarga)
    ('அ', 'a'),      # A
    ('ஆ', 'aa'),     # AA
    ('இ', 'i'),      # I
    ('ஈ', 'ii'),     # II
    ('உ', 'u'),      # U
    ('ஊ', 'uu'),     # UU
    ('எ', 'e'),      # E
    ('ஏ', 'ee'),     # EE
    ('ஐ', 'ai'),     # AI
    ('ஒ', 'o'),      # O
    ('ஓ', 'oo'),     # OO
    ('ஔ', 'au'),     # AU
    # --- Tamil consonants (each written with its inherent "a") ---
    ('க', 'ka'),     # KA
    ('ங', 'nga'),    # NGA
    ('ச', 'cha'),    # CA
    ('ஜ', 'ja'),     # JA
    ('ஞ', 'nya'),    # NYA
    ('ட', 'ta'),     # TTA
    ('ண', 'na'),     # NNA
    ('த', 'tha'),    # TA
    ('ந', 'na'),     # NA
    ('ன', 'na'),     # NNNA
    ('ப', 'pa'),     # PA
    ('ம', 'ma'),     # MA
    ('ய', 'ya'),     # YA
    ('ர', 'ra'),     # RA
    ('ற', 'rra'),    # RRA
    ('ல', 'la'),     # LA
    ('ள', 'lla'),    # LLA
    ('ழ', 'zha'),    # LLLA (the "zh" sound)
    ('வ', 'va'),     # VA
    ('ஷ', 'sha'),    # SSA
    ('ஸ', 'sa'),     # SA
    ('ஹ', 'ha'),     # HA
    # --- Tamil dependent vowel signs and virama ---
    ('ா', 'aa'),     # vowel sign AA
    ('ி', 'i'),      # vowel sign I
    ('ீ', 'ii'),     # vowel sign II
    ('ு', 'u'),      # vowel sign U
    ('ூ', 'uu'),     # vowel sign UU
    ('ெ', 'e'),      # vowel sign E
    ('ே', 'ee'),     # vowel sign EE
    ('ை', 'ai'),     # vowel sign AI
    ('ொ', 'o'),      # vowel sign O
    ('ோ', 'oo'),     # vowel sign OO
    ('ௌ', 'au'),     # vowel sign AU
    ('்', ''),       # virama: simply deleted
    ('ௗ', 'au'),     # AU length mark
    # --- Stray Malayalam letter ---
    ('ഥ', 'tha'),    # Malayalam THA
    # --- Typographic punctuation and symbols ---
    ('–', '-'),      # en dash
    ('‘', "'"),      # left single quotation mark
    ('’', "'"),      # right single quotation mark
    ('‚', ','),      # single low-9 quotation mark
    ('“', '"'),      # left double quotation mark
    ('”', '"'),      # right double quotation mark
    ('•', '.'),      # bullet
    ('…', '...'),    # horizontal ellipsis
    ('′', "'"),      # prime
    ('″', '"'),      # double prime
    ('●', '.'),      # black circle
    ('◯', 'o'),      # large circle
)


def convert_text(sentence):
    """Return *sentence* with Tamil letters and special symbols romanized.

    Every pair in ``_CHAR_REPLACEMENTS`` is applied in turn with
    ``str.replace``; characters that are not listed pass through untouched.

    NOTE(review): the substitution is purely character-by-character. A
    consonant always contributes its inherent "a", a following vowel sign is
    appended after it, and the virama is just removed -- so "கா" becomes
    "kaaa" and "க்" becomes "ka". This is kept as-is because the TTS
    checkpoint was presumably trained on text produced the same way; confirm
    before changing the scheme.

    Args:
        sentence: Input text (``str``).

    Returns:
        The transliterated string.
    """
    romanized = sentence
    for original_char, latin_form in _CHAR_REPLACEMENTS:
        romanized = romanized.replace(original_char, latin_form)
    return romanized
99
+
100
+
101
def predict(text, speaker):
    """Synthesize speech for *text* in the voice selected by *speaker*.

    Args:
        text: The sentence to speak. It is passed through ``convert_text``
            before tokenization.
        speaker: Either the literal ``"Surprise Me!"`` or a label whose first
            three characters are a key of ``speaker_embeddings``
            (e.g. ``"BDL (male)"``).

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array. Blank input yields an empty
        array.

    Raises:
        KeyError: If *speaker* is not ``"Surprise Me!"`` and its first three
            characters are not a known speaker code.
    """
    sample_rate = 16000

    # Blank or whitespace-only input: return an empty clip without
    # touching the model.
    if not text.strip():
        return (sample_rate, np.zeros(0).astype(np.int16))

    encoded = processor(text=convert_text(text), return_tensors="pt")

    # Truncate the token sequence to the longest input the model accepts.
    token_ids = encoded["input_ids"][..., :model.config.max_text_positions]

    if speaker != "Surprise Me!":
        # Regular choice: the first three characters of the label are the key.
        voice = np.load(speaker_embeddings[speaker[:3]])
    else:
        # Invent a voice: start from one of the stored embeddings at random...
        codes = list(speaker_embeddings.keys())
        picked = codes[np.random.randint(len(speaker_embeddings))]
        voice = np.load(speaker_embeddings[picked])

        # ...scramble the order of its elements in place...
        np.random.shuffle(voice)

        # ...and flip the sign of roughly half of them.
        # (assumes the stored embedding broadcasts against 512 values)
        signs = np.where(np.random.rand(512) >= 0.5, 1.0, -1.0)
        voice *= signs

    # Add a leading batch dimension before handing it to the model.
    voice = torch.tensor(voice).unsqueeze(0)

    waveform = model.generate_speech(token_ids, voice, vocoder=vocoder)

    # Scale the float waveform to 16-bit PCM.
    pcm = (waveform.numpy() * 32767).astype(np.int16)
    return (sample_rate, pcm)
138
+
139
+
140
title = "Text-to-Speech App using SpeechT5"

# Labels offered in the speaker picker. predict() looks up the embedding by
# the first three characters of the chosen label, except for "Surprise Me!",
# which it handles as a special case.
speaker_choices = [
    "BDL (male)",
    "CLB (female)",
    "KSP (male)",
    "RMS (male)",
    "SLT (female)",
    "Surprise Me!",
]

# Input widgets: free text plus the speaker picker (first speaker preselected).
text_input = gr.Text(label="Input Text")
speaker_input = gr.Radio(
    label="Speaker",
    choices=speaker_choices,
    value="BDL (male)",
)

# Output widget: predict() returns a (sample_rate, numpy array) tuple.
audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input, speaker_input],
    outputs=[audio_output],
    title=title,
)

# Start the web app as soon as the module is executed.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==5.1.0
2
+ torch==2.4.0
3
+ git+https://github.com/huggingface/transformers.git
4
+ soundfile==0.12.1
5
+ sentencepiece==0.2.0
6
+ samplerate
7
+ librosa
8
+ resampy
speaker/cmu_us_awb_arctic-wav-arctic_a0002.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db7a684ab490f21cec1628e00d461a184e369fe4eafb1ee441a796faf4ab6ae
3
+ size 2176
speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215326eae3a428af8934c385fbe043b36c72849ca17d1d013adeb189e6bd6962
3
+ size 2176
speaker/cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf67b36c47edfb1851466a1dff081b436bc6809b5ebc12811d9df0c0d0f28d0e
3
+ size 2176
speaker/cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c5c2a38c2e400179019c560a74c4322f4ee13beda22ee601807545edee283e
3
+ size 2176
speaker/cmu_us_rms_arctic-wav-arctic_b0353.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49dac3e9c3a71a4dbca4c364233c7915ae6e0cb71b2ceaed97296231b95cb50
3
+ size 2176
speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f71ffadda3f3a4de079740a0b34963824dc644d9d5442283bd0a2b0d4f44ff0b
3
+ size 2176