Upload kokoro.py
Browse files
kokoro.py
CHANGED
@@ -86,18 +86,22 @@ VOCAB = get_vocab()
|
|
86 |
def tokenize(ps):
    """Convert a phoneme string into a list of vocabulary ids.

    Each character of ``ps`` is looked up with ``VOCAB.get``; characters
    for which the lookup yields ``None`` are skipped, so the result may be
    shorter than the input.
    """
    ids = []
    for symbol in ps:
        token_id = VOCAB.get(symbol)
        if token_id is not None:
            ids.append(token_id)
    return ids
|
88 |
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
91 |
if norm:
|
92 |
text = normalize_text(text)
|
93 |
-
ps =
|
94 |
ps = ps[0] if ps else ''
|
95 |
# https://en.wiktionary.org/wiki/kokoro#English
|
96 |
ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
|
97 |
ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
|
98 |
ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
|
99 |
ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
|
100 |
-
|
|
|
101 |
ps = ''.join(filter(lambda p: p in VOCAB, ps))
|
102 |
return ps.strip()
|
103 |
|
@@ -131,8 +135,8 @@ def forward(model, tokens, ref_s, speed):
|
|
131 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
132 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
133 |
|
134 |
-
def generate(model, text, voicepack,
|
135 |
-
ps =
|
136 |
tokens = tokenize(ps)
|
137 |
if not tokens:
|
138 |
return None
|
@@ -142,4 +146,4 @@ def generate(model, text, voicepack, speed=1, ps=None):
|
|
142 |
ref_s = voicepack[len(tokens)]
|
143 |
out = forward(model, tokens, ref_s, speed)
|
144 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
145 |
-
return out, ps
|
|
|
86 |
def tokenize(ps):
    """Convert a phoneme string into a list of vocabulary ids.

    Each character of ``ps`` is looked up with ``VOCAB.get``; characters
    for which the lookup yields ``None`` are skipped, so the result may be
    shorter than the input.
    """
    ids = []
    for symbol in ps:
        token_id = VOCAB.get(symbol)
        if token_id is not None:
            ids.append(token_id)
    return ids
|
88 |
|
89 |
+
# One espeak backend per language code: 'a' -> en-us, 'b' -> en-gb.
# Both keep punctuation and stress marks in their output.
phonemizers = {
    code: phonemizer.backend.EspeakBackend(
        language=language,
        preserve_punctuation=True,
        with_stress=True,
    )
    for code, language in (('a', 'en-us'), ('b', 'en-gb'))
}
|
93 |
+
def phonemize(text, lang, norm=True):
    """Turn ``text`` into a cleaned phoneme string.

    Args:
        text: Input text to phonemize.
        lang: Key used to select a backend from ``phonemizers``; the value
            ``'a'`` additionally enables one extra substitution (see below).
        norm: When truthy, ``text`` is first passed through
            ``normalize_text``.

    Returns:
        The phoneme string after the fixed substitutions below, with every
        character that is not in ``VOCAB`` removed and surrounding
        whitespace stripped.
    """
    if norm:
        text = normalize_text(text)

    results = phonemizers[lang].phonemize([text])
    ps = results[0] if results else ''

    # Plain substitutions, applied strictly in this order.
    substitutions = (
        # https://en.wiktionary.org/wiki/kokoro#English
        ('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ'),
        ('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ'),
        ('ʲ', 'j'),
        ('r', 'ɹ'),
        ('x', 'k'),
        ('ɬ', 'l'),
    )
    for old, new in substitutions:
        ps = ps.replace(old, new)

    # Insert a space before "hˈʌndɹɪd" when it directly follows a-z, ɹ or ː.
    ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
    # Drop the space in front of a "z" that is followed by punctuation,
    # a space, or the end of the string.
    ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
    if lang == 'a':
        # After "nˈaɪn", rewrite "ti" as "di" unless a "ː" follows.
        ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)

    kept = ''.join(ch for ch in ps if ch in VOCAB)
    return kept.strip()
|
107 |
|
|
|
135 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
136 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
137 |
|
138 |
+
def generate(model, text, voicepack, lang='a', speed=1):
|
139 |
+
ps = phonemize(text, lang)
|
140 |
tokens = tokenize(ps)
|
141 |
if not tokens:
|
142 |
return None
|
|
|
146 |
ref_s = voicepack[len(tokens)]
|
147 |
out = forward(model, tokens, ref_s, speed)
|
148 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
|
149 |
+
return out, ps
|