Spaces:
Runtime error
Runtime error
ORI-Muchim
committed on
Commit
·
bfc486e
1
Parent(s):
3388262
Upload 4 files
Browse files- text/LICENSE +19 -0
- text/__init__.py +32 -0
- text/cleaners.py +17 -0
- text/japanese.py +132 -0
text/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (c) 2017 Keith Ito
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
+
of this software and associated documentation files (the "Software"), to deal
|
5 |
+
in the Software without restriction, including without limitation the rights
|
6 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
+
copies of the Software, and to permit persons to whom the Software is
|
8 |
+
furnished to do so, subject to the following conditions:
|
9 |
+
|
10 |
+
The above copyright notice and this permission notice shall be included in
|
11 |
+
all copies or substantial portions of the Software.
|
12 |
+
|
13 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19 |
+
THE SOFTWARE.
|
text/__init__.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
from text import cleaners
|
3 |
+
|
4 |
+
|
5 |
+
def text_to_sequence(text, symbols, cleaner_names):
    '''Map a string of text to the list of symbol IDs it spells out.

    The text is first run through the named cleaners. Each character of the
    cleaned text that appears in ``symbols`` is replaced by its index there;
    characters that are not in ``symbols`` are dropped without error.

    Args:
      text: string to convert to a sequence
      symbols: ordered collection of symbols; a symbol's ID is its position
        (if a symbol occurs more than once, its last position is used)
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    id_of = {}
    for index, item in enumerate(symbols):
        id_of[item] = index

    cleaned = _clean_text(text, cleaner_names)
    return [id_of[char] for char in cleaned if char in id_of]
|
24 |
+
|
25 |
+
|
26 |
+
def _clean_text(text, cleaner_names):
    '''Run ``text`` through each named cleaner in order and return the result.

    Args:
      text: string to clean
      cleaner_names: iterable of attribute names looked up on the ``cleaners``
        module; each must name a callable that takes and returns the text

    Returns:
      The text after every cleaner has been applied (unchanged when
      ``cleaner_names`` is empty).

    Raises:
      Exception: if a name does not resolve to a cleaner.
    '''
    for name in cleaner_names:
        # A default is required: two-argument getattr raises AttributeError
        # for a missing name, which would bypass the check below entirely.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
text/cleaners.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
def japanese_cleaners(text):
    '''Romanize Japanese text and make sure the result ends in punctuation.

    The text is converted with ``japanese_to_romaji_with_accent``. A period is
    appended when the converted text is empty or its last character is an
    ASCII letter; otherwise the converted text is returned as is.
    '''
    # Function-scope import: text.japanese is only loaded once a Japanese
    # cleaner is actually called.
    from text.japanese import japanese_to_romaji_with_accent

    romaji = japanese_to_romaji_with_accent(text)
    if romaji and not re.match('[A-Za-z]', romaji[-1]):
        return romaji
    return romaji + '.'
|
9 |
+
|
10 |
+
|
11 |
+
def japanese_cleaners2(text):
    '''Variant of ``japanese_cleaners`` with extra symbol normalization.

    Before romanization, '・・・' becomes '…' and any remaining '・' becomes a
    space. After romanization, 'ts' becomes 'ʦ', '...' becomes '…', round,
    square and curly brackets are removed, and '*' becomes a space.
    '''
    # Replacements are applied strictly in the listed order.
    before = (('・・・', '…'), ('・', ' '))
    after = (
        ('ts', 'ʦ'), ('...', '…'),
        ('(', ''), (')', ''),
        ('[', ''), (']', ''),
        ('*', ' '), ('{', ''), ('}', ''),
    )

    for old, new in before:
        text = text.replace(old, new)
    text = japanese_cleaners(text)
    for old, new in after:
        text = text.replace(old, new)
    return text
|
text/japanese.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from unidecode import unidecode
|
3 |
+
import pyopenjtalk
|
4 |
+
|
5 |
+
|
6 |
+
# Character class shared by the two patterns below: ASCII letters, digits,
# and the Japanese code-point ranges treated as text rather than punctuation.
_JAPANESE_CHARSET = (
    r'A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff'
    r'\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d'
)

# Matches a single character that counts as Japanese text (no punctuation).
_japanese_characters = re.compile('[' + _JAPANESE_CHARSET + ']')

# Matches a single character outside that set: punctuation marks and any
# other non-Japanese character.
_japanese_marks = re.compile('[^' + _JAPANESE_CHARSET + ']')

# (compiled symbol pattern, Japanese replacement) pairs.
_symbols_to_japanese = [
    (re.compile(symbol), japanese)
    for symbol, japanese in (
        ('%', 'パーセント'),
    )
]
|
18 |
+
|
19 |
+
# List of (compiled romaji pattern, IPA replacement) pairs, applied in order
# by japanese_to_ipa.
#
# Matching is deliberately case-sensitive: the table has separate entries for
# lowercase 'u' and uppercase 'I' / 'U'. With re.IGNORECASE the 'u' rule
# would consume every 'U' before the 'U' rule could run, and the 'I' rule
# would rewrite every lowercase 'i'.
#
# The ellipsis entry is escaped so it matches three literal dots; an
# unescaped '...' is a regex for "any three characters" and would replace
# the whole text.
_romaji_to_ipa = [(re.compile(pattern), ipa) for pattern, ipa in [
    ('ts', 'ʦ'),
    ('u', 'ɯ'),
    (r'\.\.\.', '…'),
    ('j', 'ʥ'),
    ('y', 'j'),
    ('ni', 'n^i'),
    ('nj', 'n^'),
    ('hi', 'çi'),
    ('hj', 'ç'),
    ('f', 'ɸ'),
    ('I', 'i*'),
    ('U', 'ɯ*'),
    ('r', 'ɾ'),
]]
|
35 |
+
|
36 |
+
# Dictionary of (consonant, sokuon) pairs: keyed by the character that
# follows a 'Q' marker (see get_real_sokuon), giving the text that replaces
# the marker. Consonants sharing a replacement are grouped together.
_real_sokuon = {
    consonant: replacement
    for consonants, replacement in (
        ('kg', 'k#'),
        ('tdʦʧʥj', 't#'),
        ('sʃ', 's'),
        ('pb', 'p#'),
    )
    for consonant in consonants
}
|
51 |
+
|
52 |
+
# Dictionary of (consonant, hatsuon) pairs: keyed by the character that
# follows an 'N' marker (see get_real_hatsuon), giving the text that replaces
# the marker. Consonants sharing a replacement are grouped together.
_real_hatsuon = {
    consonant: replacement
    for consonants, replacement in (
        ('pbm', 'm'),
        ('tdn', 'n'),
        ('ʧʥ', 'n^'),
        ('kg', 'ŋ'),
    )
    for consonant in consonants
}
|
65 |
+
|
66 |
+
|
67 |
+
def symbols_to_japanese(text):
    '''Replace every symbol listed in ``_symbols_to_japanese`` with its Japanese text.

    The (pattern, replacement) pairs are applied one after another, each to
    the output of the previous substitution.
    '''
    converted = text
    for pattern, japanese in _symbols_to_japanese:
        converted = pattern.sub(japanese, converted)
    return converted
|
71 |
+
|
72 |
+
|
73 |
+
def japanese_to_romaji_with_accent(text):
    '''Convert Japanese text to romaji annotated with accent marks.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split on every character matched by ``_japanese_marks``. Each
    segment that begins with a character matched by ``_japanese_characters``
    is passed to ``pyopenjtalk.extract_fullcontext``; the phoneme of every
    label other than 'sil'/'pau' is appended, followed by ' ' (accent phrase
    boundary), '↓' (falling) or '↑' (rising) when the label's accent fields
    call for one. After each segment, the separator that followed it is
    appended after ``unidecode`` conversion with spaces removed.
    '''
    silent = ('sil', 'pau')

    text = symbols_to_japanese(text)
    segments = _japanese_marks.split(text)
    separators = _japanese_marks.findall(text)

    romaji = ''
    for position, segment in enumerate(segments):
        if _japanese_characters.match(segment):
            # Separate this segment's phonemes from anything already emitted.
            if romaji:
                romaji += ' '
            labels = pyopenjtalk.extract_fullcontext(segment)
            for index, label in enumerate(labels):
                # The phoneme sits between '-' and '+' in the label.
                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
                if phoneme in silent:
                    continue
                for old, new in (('ch', 'ʧ'), ('sh', 'ʃ'), ('cl', 'Q')):
                    phoneme = phoneme.replace(old, new)
                romaji += phoneme

                # Accent fields of the current label (presumably the A1/A2/A3
                # fields of the HTS full-context format used by the reference
                # recipe above - confirm against the label spec).
                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
                a3 = int(re.search(r"\+(\d+)/", label).group(1))

                # Look ahead one label; this relies on a further label
                # existing after every non-silent one.
                upcoming = labels[index + 1]
                if re.search(r'\-([^\+]*)\+', upcoming).group(1) in silent:
                    a2_next = -1
                else:
                    a2_next = int(re.search(r"\+(\d+)\+", upcoming).group(1))

                if a3 == 1 and a2_next == 1:
                    # Accent phrase boundary
                    romaji += ' '
                elif a1 == 0 and a2_next == a2 + 1:
                    # Falling
                    romaji += '↓'
                elif a2 == 1 and a2_next == 2:
                    # Rising
                    romaji += '↑'
        if position < len(separators):
            romaji += unidecode(separators[position]).replace(' ', '')
    return romaji
|
112 |
+
|
113 |
+
|
114 |
+
def get_real_sokuon(text):
    '''Replace each 'Q' marker according to the character that follows it.

    A match is 'Q', any run of '↑'/'↓', then one more character. When that
    character is a key of ``_real_sokuon``, the 'Q' is replaced by the mapped
    text and the rest of the match is kept; otherwise the match is left
    untouched.
    '''
    def realize(match):
        following = match.group(1)
        if following not in _real_sokuon:
            return match.group(0)
        # Drop the leading 'Q' and keep the accent marks and the consonant.
        return _real_sokuon[following] + match.group(0)[1:]

    return re.sub('Q[↑↓]*(.)', realize, text)
|
117 |
+
|
118 |
+
|
119 |
+
def get_real_hatsuon(text):
    '''Replace each 'N' marker according to the character that follows it.

    A match is 'N', any run of '↑'/'↓', then one more character. When that
    character is a key of ``_real_hatsuon``, the 'N' is replaced by the mapped
    text and the rest of the match is kept; otherwise the match is left
    untouched.
    '''
    def realize(match):
        following = match.group(1)
        if following not in _real_hatsuon:
            return match.group(0)
        # Drop the leading 'N' and keep the accent marks and the consonant.
        return _real_hatsuon[following] + match.group(0)[1:]

    return re.sub('N[↑↓]*(.)', realize, text)
|
122 |
+
|
123 |
+
|
124 |
+
def japanese_to_ipa(text):
    '''Convert Japanese text to an IPA-style transcription with accent marks.

    Steps, in order: romanize with ``japanese_to_romaji_with_accent``; apply
    every (pattern, replacement) pair of ``_romaji_to_ipa``; rewrite each run
    of a repeated letter as that letter followed by one 'ː' per extra
    repetition; then resolve the 'Q' and 'N' markers with
    ``get_real_sokuon`` and ``get_real_hatsuon``.
    '''
    def lengthen(match):
        run = match.group(0)
        return run[0] + 'ː' * (len(run) - 1)

    ipa = japanese_to_romaji_with_accent(text)
    for pattern, replacement in _romaji_to_ipa:
        ipa = pattern.sub(replacement, ipa)
    ipa = re.sub(r'([A-Za-zɯ])\1+', lengthen, ipa)
    return get_real_hatsuon(get_real_sokuon(ipa))
|