trysem committed
Commit a920b41 · verified · 1 Parent(s): cad0e81

Upload 14 files

src/inference.py ADDED
@@ -0,0 +1,233 @@
+ import io
+ import re
+ import base64
+ import numpy as np
+ import traceback
+ from typing import Optional, Tuple
+
+ from TTS.utils.synthesizer import Synthesizer
+ from aksharamukha.transliterate import process as aksharamukha_xlit
+ from scipy.io.wavfile import write as scipy_wav_write
+
+ import nltk
+ import pysbd
+
+ from .models.common import Language
+ from .models.request import TTSRequest
+ from .models.response import AudioFile, AudioConfig, TTSResponse, TTSFailureResponse
+ from .utils.text import TextNormalizer
+ from .utils.paragraph_handler import ParagraphHandler
+ from src.postprocessor import PostProcessor
+
+ class TextToSpeechEngine:
+     def __init__(
+         self,
+         models: dict,
+         allow_transliteration: bool = True,
+         enable_denoiser: bool = True,
+     ):
+         self.models = models
+         # TODO: Ability to instantiate models by accepting standard paths or auto-downloading
+
+         code_mixed_found = False
+         if allow_transliteration:
+             # Initialize Indic-Xlit models for the languages corresponding to the TTS models
+             from ai4bharat.transliteration import XlitEngine
+             xlit_langs = set()
+
+             for lang in list(models):
+                 if lang == 'en':
+                     continue  # No Indic-transliteration needed for English
+
+                 if '+' in lang:
+                     # For a code-mixed model like Hinglish, we need Hindi Xlit for the non-English words
+                     # TODO: Make it mandatory irrespective of the `allow_transliteration` boolean
+                     lang = lang.split('+')[1]
+                     code_mixed_found = True
+                 xlit_langs.add(lang)
+
+             self.xlit_engine = XlitEngine(xlit_langs, beam_width=6)
+         else:
+             self.xlit_engine = None
+
+         self.text_normalizer = TextNormalizer()
+         self.paragraph_handler = ParagraphHandler()
+         self.sent_seg = pysbd.Segmenter(language="en", clean=True)
+
+         self.orig_sr = 22050  # model.output_sample_rate
+         self.enable_denoiser = enable_denoiser
+         if enable_denoiser:
+             from src.postprocessor import Denoiser
+             self.target_sr = 16000
+             self.denoiser = Denoiser(self.orig_sr, self.target_sr)
+         else:
+             self.target_sr = self.orig_sr
+
+         self.post_processor = PostProcessor(self.target_sr)
+
+         if code_mixed_found:
+             # Dictionaries of English words
+             import enchant
+             from enchant.tokenize import get_tokenizer
+
+             self.enchant_dicts = {
+                 "en_US": enchant.Dict("en_US"),
+                 "en_GB": enchant.Dict("en_GB"),
+             }
+             self.enchant_tokenizer = get_tokenizer("en")
+
+     def concatenate_chunks(self, wav: np.ndarray, wav_chunk: np.ndarray):
+         # TODO: Move to utils
+         if not isinstance(wav_chunk, np.ndarray):
+             wav_chunk = np.array(wav_chunk)
+         if wav is None:
+             return wav_chunk
+         return np.concatenate([wav, wav_chunk])
+
+     def infer_from_request(
+         self,
+         request: TTSRequest,
+         transliterate_roman_to_native: bool = True
+     ) -> TTSResponse:
+
+         config = request.config
+         lang = config.language.sourceLanguage
+         gender = config.gender
+
+         # If there's no separate English model, use the Hinglish one
+         if lang == "en" and lang not in self.models and "en+hi" in self.models:
+             lang = "en+hi"
+
+         if lang not in self.models:
+             return TTSFailureResponse(status_text="Unsupported language!")
+
+         if lang == "brx" and gender == "male":
+             return TTSFailureResponse(status_text="Sorry, `male` speaker not supported for this language!")
+
+         output_list = []
+
+         for sentence in request.input:
+             raw_audio = self.infer_from_text(sentence.source, lang, gender, transliterate_roman_to_native=transliterate_roman_to_native)
+             # Convert PCM to WAV
+             byte_io = io.BytesIO()
+             scipy_wav_write(byte_io, self.target_sr, raw_audio)
+             # Encode the WAV buffer as base64 for transmission via JSON
+             # (getvalue() instead of read(), since the write leaves the cursor at the end)
+             encoded_bytes = base64.b64encode(byte_io.getvalue())
+             encoded_string = encoded_bytes.decode()
+             speech_response = AudioFile(audioContent=encoded_string)
+
+             output_list.append(speech_response)
+
+         audio_config = AudioConfig(language=Language(sourceLanguage=lang))
+         return TTSResponse(audio=output_list, config=audio_config)
+
+     def infer_from_text(
+         self,
+         input_text: str,
+         lang: str,
+         speaker_name: str,
+         transliterate_roman_to_native: bool = True
+     ) -> np.ndarray:
+
+         # If there's no separate English model, use the Hinglish one
+         if lang == "en" and lang not in self.models and "en+hi" in self.models:
+             lang = "en+hi"
+
+         input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)
+
+         wav = None
+         paragraphs = self.paragraph_handler.split_text(input_text)
+
+         for paragraph in paragraphs:
+             paragraph = self.handle_transliteration(paragraph, primary_lang, transliterate_roman_to_native)
+             # Keep only sentences that contain at least one word character
+             sentences = []
+             for sent in self.sent_seg.segment(paragraph):
+                 if sent.strip() and not re.match(r'^[_\W]+$', sent.strip()):
+                     sentences.append(sent.strip())
+             paragraph = " ".join(sentences)
+
+             # Run inference. TODO: Support batch inference
+             wav_chunk = self.models[lang].tts(paragraph, speaker_name=speaker_name, style_wav="")
+
+             wav_chunk = self.postprocess_audio(wav_chunk, primary_lang, speaker_name)
+             # Concatenate the current chunk with the previous audio outputs
+             wav = self.concatenate_chunks(wav, wav_chunk)
+         return wav
+
+     def parse_langs_normalise_text(self, input_text: str, lang: str) -> Tuple[str, str, Optional[str]]:
+         # If there's no separate English model, use the Hinglish one if present
+         if lang == "en" and lang not in self.models and "en+hi" in self.models:
+             lang = "en+hi"
+
+         if lang == "en+hi":  # Hinglish (English+Hindi code-mixed)
+             primary_lang, secondary_lang = lang.split('+')
+         else:
+             primary_lang = lang
+             secondary_lang = None
+
+         input_text = self.text_normalizer.normalize_text(input_text, primary_lang)
+         if secondary_lang:
+             # TODO: Write a proper `transliterate_native_words_using_eng_dictionary`
+             input_text = self.transliterate_native_words_using_spell_checker(input_text, secondary_lang)
+
+         return input_text, primary_lang, secondary_lang
+
+     def handle_transliteration(self, input_text: str, primary_lang: str, transliterate_roman_to_native: bool) -> str:
+         if transliterate_roman_to_native and primary_lang != 'en':
+             input_text = self.transliterate_sentence(input_text, primary_lang)
+
+             # The Manipuri model was trained on the Bengali (Eastern-Nagari) script,
+             # so convert words in the native Meetei-Mayek script to Eastern-Nagari
+             if primary_lang == "mni":
+                 # TODO: Delete explicit-schwa
+                 input_text = aksharamukha_xlit("MeeteiMayek", "Bengali", input_text)
+         return input_text
+
+     def preprocess_text(
+         self,
+         input_text: str,
+         lang: str,
+         # speaker_name: str,
+         transliterate_roman_to_native: bool = True
+     ) -> str:
+
+         input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)
+         input_text = self.handle_transliteration(input_text, primary_lang, transliterate_roman_to_native)
+         return input_text
+
+     def postprocess_audio(self, wav_chunk, primary_lang, speaker_name):
+         if self.enable_denoiser:
+             wav_chunk = self.denoiser.denoise(wav_chunk)
+         wav_chunk = self.post_processor.process(wav_chunk, primary_lang, speaker_name)
+         return wav_chunk
+
+     def transliterate_native_words_using_spell_checker(self, input_text, lang):
+         tokens = [result[0] for result in self.enchant_tokenizer(input_text)]
+         pos_tags = [result[1] for result in nltk.tag.pos_tag(tokens)]
+
+         # Transliterate non-English Roman words to Indic
+         for word, pos_tag in zip(tokens, pos_tags):
+             if pos_tag == "NNP" or pos_tag == "NNPS":
+                 # Enchant's dictionary contains many proper nouns as well,
+                 # so if it's a proper noun, always nativize it.
+                 # FIXME: NLTK's `averaged_perceptron_tagger` is not 100% accurate and has false positives
+                 pass
+             elif self.enchant_dicts["en_US"].check(word) or self.enchant_dicts["en_GB"].check(word):
+                 # TODO: Merge the British and American dictionaries into one somehow
+                 continue
+
+             # Convert "Ram's" -> "Ram". TODO: Think through the failure cases
+             word = word.split("'")[0]
+
+             transliterated_word = self.transliterate_sentence(word, lang)
+             input_text = input_text.replace(word, transliterated_word, 1)
+         return input_text
+
+     def transliterate_sentence(self, input_text, lang):
+         if not self.xlit_engine:
+             return input_text
+
+         if lang == "raj":
+             lang = "hi"  # Approximation
+
+         return self.xlit_engine.translit_sentence(input_text, lang)
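Not part of the commit, but for orientation: a minimal sketch of driving this class. It assumes the values in the `models` dict are Coqui-TTS `Synthesizer` instances (suggested by the import at the top of the file); the checkpoint paths are hypothetical placeholders.

    import soundfile as sf
    from TTS.utils.synthesizer import Synthesizer
    from src.inference import TextToSpeechEngine

    # Hypothetical checkpoint paths; substitute real acoustic-model/vocoder files
    models = {
        "hi": Synthesizer(
            tts_checkpoint="checkpoints/hi/fastpitch/best_model.pth",
            tts_config_path="checkpoints/hi/fastpitch/config.json",
            vocoder_checkpoint="checkpoints/hi/hifigan/best_model.pth",
            vocoder_config="checkpoints/hi/hifigan/config.json",
            use_cuda=False,
        ),
    }

    engine = TextToSpeechEngine(models, allow_transliteration=True, enable_denoiser=True)
    wav = engine.infer_from_text("नमस्ते दुनिया", lang="hi", speaker_name="female")
    sf.write("output.wav", wav, engine.target_sr)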
src/models/__init__.py ADDED
File without changes
src/models/common.py ADDED
@@ -0,0 +1,11 @@
+ from pydantic import BaseModel, validator
+
+
+ class Language(BaseModel):
+     sourceLanguage: str
+
+     # @validator('sourceLanguage', pre=True)
+     # def blank_string_in_language(cls, value, field):
+     #     if value == "":
+     #         raise ValueError('sourceLanguage cannot be empty')
+     #     return value
src/models/request.py ADDED
@@ -0,0 +1,41 @@
+ from typing import List
+
+ from pydantic import BaseModel, validator
+
+ from .common import Language
+
+ SUPPORTED_GENDERS = {'male', 'female'}
+
+
+ class Sentence(BaseModel):
+     source: str
+
+     # @validator('source', pre=True)
+     # def blank_string_in_source(cls, value, field):
+     #     if value == "":
+     #         raise ValueError('source cannot be empty')
+     #     return value
+
+
+ class TTSConfig(BaseModel):
+     language: Language
+     gender: str
+
+     # @validator('gender', pre=True)
+     # def blank_string_in_gender(cls, value, field):
+     #     if value == "":
+     #         raise ValueError('gender cannot be empty')
+     #     if value not in SUPPORTED_GENDERS:
+     #         raise ValueError('Unsupported gender value')
+     #     return value
+
+
+ class TTSRequest(BaseModel):
+     input: List[Sentence]
+     config: TTSConfig
+
+     # @validator('input', pre=True)
+     # def input_cannot_be_empty(cls, value, field):
+     #     if len(value) < 1:
+     #         raise ValueError('input cannot be empty')
+     #     return value
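For reference, a valid request under this schema can be built from a plain dict; pydantic coerces and validates the nested models. A minimal sketch:

    from src.models.request import TTSRequest

    payload = {
        "input": [{"source": "नमस्ते"}],
        "config": {"language": {"sourceLanguage": "hi"}, "gender": "female"},
    }
    request = TTSRequest(**payload)  # raises ValidationError on malformed input
    print(request.config.language.sourceLanguage)  # -> hi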
src/models/response.py ADDED
@@ -0,0 +1,26 @@
+ from typing import List
+
+ from pydantic import BaseModel
+
+ from .common import Language
+
+
+ class AudioFile(BaseModel):
+     audioContent: str
+
+
+ class AudioConfig(BaseModel):
+     language: Language
+     audioFormat: str = 'wav'
+     encoding: str = 'base64'
+     samplingRate: int = 22050
+
+
+ class TTSResponse(BaseModel):
+     audio: List[AudioFile]
+     config: AudioConfig
+
+
+ class TTSFailureResponse(BaseModel):
+     status: str = 'ERROR'
+     status_text: str
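On the receiving end, the base64 `audioContent` decodes straight back to WAV bytes. A small client-side sketch, assuming `response` is a parsed `TTSResponse`:

    import base64

    def save_first_audio(response, path="out.wav"):
        # `audioContent` is a base64-encoded WAV payload (see the AudioConfig defaults)
        wav_bytes = base64.b64decode(response.audio[0].audioContent)
        with open(path, "wb") as f:
            f.write(wav_bytes)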
src/postprocessor/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .postprocessor import PostProcessor
+ from .denoiser import Denoiser
+ from .vad import VoiceActivityDetection
src/postprocessor/denoiser.py ADDED
@@ -0,0 +1,24 @@
+ import torch
+ import librosa
+ import numpy as np
+
+ class Denoiser:
+
+     def __init__(self, orig_sr: int, target_sr: int):
+         self.orig_sr = orig_sr
+         self.target_sr = target_sr
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         from asteroid.models import BaseModel as AsteroidBaseModel
+         self.model = AsteroidBaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k").to(self.device)
+
+     def denoise(self, wav):
+         if not isinstance(wav, np.ndarray):
+             wav = np.array(wav)
+
+         if len(wav.shape) > 1:
+             wav = np.mean(wav, axis=1)
+         wav = librosa.resample(wav, orig_sr=self.orig_sr, target_sr=self.target_sr)
+         wav = torch.Tensor(wav.reshape(1, 1, wav.shape[0])).float().to(self.device)
+         wav = self.model.separate(wav)[0][0]  # (batch, channels, time) -> (time,)
+         return wav.cpu().detach().numpy()
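A standalone sketch of feeding the denoiser, assuming the 22,050 Hz synthesizer output used in `inference.py` (the random-noise input here is purely illustrative):

    import numpy as np
    from src.postprocessor import Denoiser

    denoiser = Denoiser(orig_sr=22050, target_sr=16000)
    noisy = np.random.randn(22050).astype(np.float32)  # 1 second of dummy audio
    clean = denoiser.denoise(noisy)  # denoised mono audio at 16 kHz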
src/postprocessor/postprocessor.py ADDED
@@ -0,0 +1,46 @@
+ import os
+ import ffmpeg
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import tempfile
+
+ from .vad import VoiceActivityDetection
+
+
+ class PostProcessor:
+
+     def __init__(self, target_sr: int):
+         self.target_sr = target_sr
+         self.vad = VoiceActivityDetection()
+
+     def set_tempo(self, wav: np.ndarray, atempo: str = '1'):
+         with tempfile.TemporaryDirectory() as tmpdirname:
+             inpath = os.path.join(tmpdirname, 'input.wav')
+             outpath = os.path.join(tmpdirname, 'output.wav')
+             sf.write(inpath, wav, self.target_sr)
+             in_stream = ffmpeg.input(inpath)
+             audio_stream = ffmpeg.filter_(in_stream, 'atempo', atempo)
+             audio_stream = audio_stream.output(outpath)
+             ffmpeg.run(audio_stream, overwrite_output=True)
+             wav, _ = librosa.load(outpath, sr=self.target_sr)
+         return wav
+
+     def trim_silence(self, wav: np.ndarray):
+         return self.vad.process(wav, sc_threshold=40)
+
+     def process(self, wav, lang: str, gender: str):
+         if not isinstance(wav, np.ndarray):
+             wav = np.array(wav)
+
+         if (lang == "te") and (gender == 'female'):  # Slow down the Telugu female speaker
+             wav = self.set_tempo(wav, '0.85')
+             wav = self.trim_silence(wav)
+         elif (lang == 'mr') and (gender == 'female'):  # Speed up the Marathi female speaker
+             wav = self.trim_silence(wav)
+             wav = self.set_tempo(wav, '1.15')
+         elif lang == 'gu':  # Speed up the Gujarati speakers
+             # wav = self.trim_silence(wav)
+             wav = self.set_tempo(wav, '1.20')
+
+         return wav
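The `atempo` values are per-speaker tempo corrections applied through ffmpeg, so the `ffmpeg` binary must be on PATH. A quick sketch of invoking the processor directly (the dummy input is illustrative):

    import numpy as np
    from src.postprocessor import PostProcessor

    post = PostProcessor(target_sr=16000)
    wav = np.random.randn(16000).astype(np.float32)  # dummy 1-second clip
    out = post.process(wav, lang="gu", gender="male")  # Gujarati: sped up via atempo=1.20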
src/postprocessor/vad.py ADDED
@@ -0,0 +1,87 @@
+ #! /usr/bin/env python
+ # encoding: utf-8
+ '''
+ MIT License
+
+ Copyright (c) 2018 Mauricio
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Adapted from https://github.com/mauriciovander/silence-removal/blob/master/vad.py
+ '''
+ import numpy
+
+ class VoiceActivityDetection:
+
+     def __init__(self):
+         self.__step = 160
+         self.__buffer_size = 160
+         self.__buffer = numpy.array([], dtype=numpy.int16)
+         self.__out_buffer = numpy.array([], dtype=numpy.int16)
+         self.__n = 0
+         self.__VADthd = 0.
+         self.__VADn = 0.
+         self.__silence_counter = 0
+
+     # Voice Activity Detection
+     # Adaptive threshold
+     def vad(self, _frame, sc_threshold=20):
+         frame = numpy.array(_frame) ** 2.
+         result = True
+         threshold = 0.2
+         thd = numpy.min(frame) + numpy.ptp(frame) * threshold
+         self.__VADthd = (self.__VADn * self.__VADthd + thd) / float(self.__VADn + 1.)
+         self.__VADn += 1.
+
+         if numpy.mean(frame) <= self.__VADthd:
+             self.__silence_counter += 1
+         else:
+             self.__silence_counter = 0
+         if self.__silence_counter > sc_threshold:
+             result = False
+         return result
+
+     # Push new audio samples into the buffer
+     def add_samples(self, data):
+         self.__buffer = numpy.append(self.__buffer, data)
+         result = len(self.__buffer) >= self.__buffer_size
+         # print('__buffer size %i' % self.__buffer.size)
+         return result
+
+     # Pull a portion of the buffer to process
+     # (pulled samples are deleted after being processed)
+     def get_frame(self):
+         window = self.__buffer[:self.__buffer_size]
+         self.__buffer = self.__buffer[self.__step:]
+         # print('__buffer size %i' % self.__buffer.size)
+         return window
+
+     # Adds new audio samples to the internal buffer and processes them
+     def process(self, data, sc_threshold):
+         self.__buffer = numpy.array([], dtype=numpy.int16)
+         self.__out_buffer = numpy.array([], dtype=numpy.int16)
+         if self.add_samples(data):
+             while len(self.__buffer) >= self.__buffer_size:
+                 # Framing
+                 window = self.get_frame()
+                 if self.vad(window, sc_threshold):  # speech frame
+                     self.__out_buffer = numpy.append(self.__out_buffer, window)
+         return self.__out_buffer
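A small sketch of using the VAD as a silence trimmer, mirroring `PostProcessor.trim_silence` (which passes `sc_threshold=40`); the synthetic signal is illustrative:

    import numpy as np
    from src.postprocessor.vad import VoiceActivityDetection

    vad = VoiceActivityDetection()
    signal = np.concatenate([np.zeros(4000), np.random.randn(8000), np.zeros(4000)])
    trimmed = vad.process(signal, sc_threshold=40)  # keeps frames classified as speech
    print(len(signal), len(trimmed))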
src/utils/alphabet2phone.json ADDED
@@ -0,0 +1 @@
+ {"a": "aey", "b": "bee", "c": "see", "d": "dee", "e": "eee", "f": "eff", "g": "jee", "h": "ech", "i": "aai", "j": "jay", "k": "kay", "l": "ell", "m": "em", "n": "en", "o": "oh", "p": "pee", "q": "kyuu", "r": "aar", "s": "es", "t": "tea", "u": "you", "v": "vee", "w": "doubleu", "x": "ex", "y": "why", "z": "zedd"}
src/utils/paragraph_handler.py ADDED
@@ -0,0 +1,43 @@
+ #! /usr/bin/env python
+ # encoding: utf-8
+
+ import re
+
+ non_chars_regex = re.compile(r'[^\w]')
+
+ class ParagraphHandler():
+
+     def __init__(self, max_text_len=512):
+         self.L = max_text_len
+
+     def split_text(self, text: str, delimiter='.'):
+         '''Splits text at the delimiter into paragraphs of max. length self.L'''
+         delimiter = ' ' if delimiter not in text else delimiter
+         if delimiter not in text:
+             return [text]
+
+         paragraphs = []
+         l_pos, r_pos = 0, 0
+         while r_pos < len(text):
+             r_pos = l_pos + self.L
+             if r_pos >= len(text):  # append the last paragraph
+                 paragraphs.append(text[l_pos:len(text)])
+                 break
+             while delimiter is not None and text[r_pos] != delimiter and r_pos > l_pos and r_pos > 0:  # find the nearest delimiter < r_pos to split the paragraph at
+                 r_pos -= 1
+             extracted_paragraph = text[l_pos:r_pos+1]
+             extracted_paragraph_without_special_chars = non_chars_regex.sub('', extracted_paragraph)
+             if extracted_paragraph_without_special_chars:
+                 paragraphs.append(extracted_paragraph)
+             l_pos = r_pos + 1  # handle the next paragraph
+         return paragraphs
+
+
+ if __name__ == '__main__':
+     text = "The following are quotes from A.P.J. Abdul Kalam. To succeed in your mission, you must have single-minded devotion to your goal. Look at the sky. We are not alone. The whole universe is friendly to us and conspires only to give the best to those who dream and work. The youth need to be enabled to become job generators from job seekers. If four things are followed - having a great aim, acquiring knowledge, hard work, and perseverance - then anything can be achieved. Where there is righteousness in the heart, there is beauty in the character. When there is beauty in the character, there is harmony in the home. When there is harmony in the home, there is order in the nation. When there is order in the nation, there is peace in the world. Great teachers emanate out of knowledge, passion and compassion. Let me define a leader. He must have vision and passion and not be afraid of any problem. Instead, he should know how to defeat it. Most importantly, he must work with integrity."
+     print('LENGTH: ', len(text))  # 988
+
+     paragraph_handler = ParagraphHandler()
+     paragraphs = paragraph_handler.split_text(text)
+     for p in paragraphs:
+         print(len(p), p)
src/utils/symbols.json ADDED
@@ -0,0 +1,155 @@
+ {
+     "₹": {
+         "as": "Rupees",
+         "bn": "রুপি",
+         "brx": "Rupees",
+         "en": "Rupees",
+         "gu": "રૂપિયા",
+         "hi": "रुपये",
+         "kn": "ರೂಪಾಯಿ",
+         "ml": "രൂപ",
+         "mni": "Rupees",
+         "mr": "रुपये",
+         "or": "ଟଙ୍କ।",
+         "pa": "ਰੁਪਏ",
+         "raj": "रीप्या",
+         "ta": "ரூபாய்",
+         "te": "రూపాయలు"
+     },
+     "@": {
+         "as": "আঁ ট",
+         "bn": "আট",
+         "brx": "at",
+         "en": "at",
+         "gu": "આત્‌",
+         "hi": "आट",
+         "kn": "ಅಟ್‌",
+         "ml": "ആറ്റ്‌",
+         "mni": "ꯑꯊ",
+         "mr": "आट",
+         "or": "ଆଟ",
+         "pa": "ਆਤ",
+         "raj": "आट",
+         "ta": "ஆட்‌",
+         "te": "ఆ ట"
+     },
+     ".": {
+         "as": "ডোট",
+         "bn": "ডোট",
+         "brx": "dot",
+         "en": "dot",
+         "gu": "ડોટ",
+         "hi": "डोट",
+         "kn": "dot",
+         "ml": "ഡോട്ട്‌",
+         "mni": "ꯗꯣꯇ",
+         "mr": "डॉट",
+         "or": "ଡୋଟ୍‌",
+         "pa": "ਡੋਟ",
+         "raj": "डॉट",
+         "ta": "டாட்‌",
+         "te": "డాట్‌"
+     },
+     "/": {
+         "as": "শ্লাচ",
+         "bn": "স্লাশ",
+         "brx": "स्लाश",
+         "en": "slash",
+         "gu": "સ્લેશ",
+         "hi": "सलाश",
+         "kn": "ಸ್ಲಾಶ್‌",
+         "ml": "സ്ലാഷ്‌",
+         "mni": "ꯁ꯭ꯂꯦꯁ",
+         "mr": "सलाश",
+         "or": "ସ୍ଲାଶ୍‌",
+         "pa": "slash",
+         "raj": "स्लाश",
+         "ta": "ஸ்லாஷ்‌",
+         "te": "స్లాష్‌"
+     },
+     ":": {
+         "as": "কোলন",
+         "bn": "কোলন",
+         "brx": "कोलन",
+         "en": "colon",
+         "gu": "કોલન",
+         "hi": "कोलन",
+         "kn": "ಕೋಲನ್‌",
+         "ml": "കോളൻ",
+         "mni": "ꯀꯣꯂꯦꯟ",
+         "mr": "कोलन",
+         "or": "କୋଲୋନ",
+         "pa": "ਕੌਲੋਨ",
+         "raj": "कोलन",
+         "ta": "கோலன்‌",
+         "te": "కోలన్‌"
+     },
+     "+": {
+         "as": "প্লাচ",
+         "bn": "প্লাস",
+         "brx": "प्लस",
+         "en": "plus",
+         "gu": "પ્લસ",
+         "hi": "प्लस",
+         "kn": "ಪ್ಲಸ್‌",
+         "ml": "പ്ലസ്‌",
+         "mni": "ꯄ꯭ꯂꯁ",
+         "mr": "प्लॅस",
+         "or": "ପ୍ଲସ୍‌",
+         "pa": "ਪਲੱਸ",
+         "raj": "प्लस",
+         "ta": "பிளஸ்‌",
+         "te": "ప్లస్‌"
+     },
+     "-": {
+         "as": "ডাছ",
+         "bn": "ডাছ",
+         "brx": "दाश",
+         "en": "dash",
+         "gu": "ડાશ",
+         "hi": "दाश",
+         "kn": "ದಾಶು",
+         "ml": "ദാശ",
+         "mni": "ꯗꯥꯁ",
+         "mr": "दाश",
+         "or": "ଦାଶ",
+         "pa": "ਮਾਈਨਉਸ",
+         "raj": "दाश",
+         "ta": "டாஷ்‌",
+         "te": "డాష్‌"
+     },
+     "www": {
+         "as": "ডাব্লিঅডাব্লিওডাব্লিও",
+         "bn": "ডাব্লিউডাব্লিউডাব্লিউ",
+         "brx": "डबलयु डबलयु डबलयु",
+         "en": "doubleyou doubleyou doubleyou",
+         "gu": "ડબલ્યું ડબલ્યું ડબલ્યું",
+         "hi": "डबल्यू डबल्यू डबल्यू",
+         "kn": "ಡುಬ್ಲ್ಯುಡುಬ್ಲ್ಯುಡುಬ್ಲ್ಯು",
+         "ml": "ഡബ്ലിയൂ ഡബ്ലിയൂ ഡബ്ലിയൂ",
+         "mni": "ꯗꯕꯜꯌꯨꯗꯕꯜꯌꯨꯗꯕꯜꯌꯨ",
+         "mr": "डब्ल्यू डब्ल्यू डब्ल्यू",
+         "or": "ଡବ୍ଲିୟୁଡବ୍ଲିୟୁଡବ୍ଲିୟୁ",
+         "pa": "ਡਬਲਿਊ ਡਬਲਿਊ ਡਬਲਿਊ",
+         "raj": "डब्ल्यू डब्ल्यू डब्ल्यू",
+         "ta": "டபிளியூ டபிளியூ டபிளியூ",
+         "te": "డబుల్యూడబుల్యూడబుల్యూ"
+     },
+     "%": {
+         "as": "শতাংশ",
+         "bn": "শতাংশ",
+         "brx": "percent",
+         "en": "percent",
+         "gu": "ટકા",
+         "hi": "प्रतिशत",
+         "kn": "ಶೇಕಡಾ",
+         "ml": "ശതമാനം",
+         "mni": "ꯆꯥꯗꯥ ꯆꯥꯗꯥ ꯴",
+         "mr": "टक्के",
+         "or": "ଶତକଡା",
+         "pa": "ਪ੍ਰਤੀਸ਼ਤ",
+         "raj": "percent",
+         "ta": "சதவீதம்",
+         "te": "శాతం"
+     }
+ }
src/utils/text.py ADDED
@@ -0,0 +1,212 @@
+ import os
+ PWD = os.path.dirname(__file__)
+ import re
+ import regex
+ import json
+ import traceback
+
+ from nemo_text_processing.text_normalization.normalize import Normalizer
+ from indic_numtowords import num2words, supported_langs
+ from .translator import GoogleTranslator
+
+ indic_acronym_matcher = regex.compile(r"([\p{L}\p{M}]+\.\s*){2,}")
+
+ # short_form_regex = re.compile(r'\b[A-Z\.]{2,}s?\b')
+ # def get_shortforms_from_string(text):
+ #     return short_form_regex.findall(text)
+
+ short_form_regex = re.compile(r"\b([A-Z][\.\s]+)+([A-Z])?\b")
+ eng_consonants_regex = re.compile(r"\b[BCDFGHJKLMNPQRSTVWXZbcdfghjklmnpqrstvwxz]+\b")
+ def get_shortforms_from_string(text):
+     dotted_shortforms = [m.group() for m in re.finditer(short_form_regex, text)]
+     non_dotted_shortforms = [m.group() for m in re.finditer(eng_consonants_regex, text)]
+     return dotted_shortforms + non_dotted_shortforms
+
+ decimal_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)")
+ def get_all_decimals_from_string(text):
+     return decimal_str_regex.findall(text)
+
+ num_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)?")
+ def get_all_numbers_from_string(text):
+     return num_str_regex.findall(text)
+
+ multiple_stops_regex = r'\.\.+'
+ def replace_multiple_stops(text):
+     return re.sub(multiple_stops_regex, '.', text)
+
+ date_generic_match_regex = re.compile(r"(?:[^0-9]\d*[./-]\d*[./-]\d*)")
+ # Match dates like dd/mm/yyyy, dd-mm-yy, yyyy.mm.dd or yy/mm/dd
+ date_str_regex = re.compile(r"(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4})|(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})")
+ def get_all_dates_from_string(text):
+     candidates = date_generic_match_regex.findall(text)
+     candidates = [c.replace(' ', '') for c in candidates]
+     candidates = [c for c in candidates if len(c) <= 10]  # Prune invalid dates
+     candidates = ' '.join(candidates)
+     return date_str_regex.findall(candidates)
+
+ def get_decimal_substitution(decimal):
+     decimal_parts = decimal.split('.')
+     l_part = decimal_parts[0]
+     r_part = ""
+     for part in decimal_parts[1:]:
+         r_part += ' '.join(list(part))  # space between every digit after the decimal point
+     decimal_sub = l_part + " point " + r_part
+     decimal_sub = decimal_sub.strip()
+     return decimal_sub
+
+ email_regex = r'[\w.+-]+@[\w-]+\.[\w.-]+'
+ url_regex = r'((?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*)|(\w*\.com/?[\w\.\?=#]*)'
+ currency_regex = r"₹\ ?[+-]?[0-9]{1,3}(?:,?[0-9])*(?:\.[0-9]{1,2})?"
+ phone_regex = r'\+?\d[ \d-]{6,12}\d'
+
+
+ class TextNormalizer:
+     def __init__(self):
+         self.translator = GoogleTranslator()
+         self.normalizer = Normalizer(input_case='cased', lang='en')
+         self.symbols2lang2word = json.load(open(os.path.join(PWD, "symbols.json"), encoding="utf-8"))
+         self.alphabet2phone = json.load(open(os.path.join(PWD, "alphabet2phone.json"), encoding="utf-8"))
+
+     def normalize_text(self, text, lang):
+         text = text.replace("।", ".").replace("|", ".").replace("꯫", ".").strip()
+         text = self.expand_shortforms(text, lang)
+         text = self.normalize_decimals(text, lang)
+         text = self.replace_punctuations(text, lang)
+         text = self.convert_dates_to_words(text, lang)
+         text = self.convert_symbols_to_words(text, lang)
+         text = self.convert_numbers_to_words(text, lang)
+         return text
+
+     def normalize_decimals(self, text, lang):
+         decimal_strs = get_all_decimals_from_string(text)
+         if not decimal_strs:
+             return text
+         decimals = [str(decimal_str.replace(',', '')) for decimal_str in decimal_strs]
+         decimal_substitutions = [get_decimal_substitution(decimal) for decimal in decimals]
+         for decimal_str, decimal_sub in zip(decimal_strs, decimal_substitutions):
+             text = text.replace(decimal_str, decimal_sub)
+         return text
+
+     def replace_punctuations(self, text, lang):
+         text = replace_multiple_stops(text)
+         if lang not in ['brx', 'or']:
+             text = text.replace('।', '.')
+             if text[-1] not in ['.', '!', '?', ',', ':', ';']:
+                 text = text + ' .'
+         else:
+             text = text.replace('.', '।')
+             text = text.replace('|', '.')
+         for bracket in ['(', ')', '{', '}', '[', ']']:
+             text = text.replace(bracket, ',')
+         # text = text.replace(':', ',').replace(';', ',')
+         text = text.replace(';', ',')
+         return text
+
+     def convert_numbers_to_words(self, text, lang):
+         num_strs = get_all_numbers_from_string(text)
+         if not num_strs:
+             return text
+
+         # TODO: If it is a large integer without commas (say >5 digits), spell it out numeral by numeral
+         # NOTE: Partially handled by the phone-number logic
+         numbers = [int(num_str.replace(',', '')) for num_str in num_strs]
+
+         if lang in supported_langs:
+             # print(lang, numbers)
+             num_words = [num2words(num, lang=lang) for num in numbers]
+         else:  # Fallback: convert to Indian-English words, followed by NMT
+             try:
+                 num_words = [num2words(num, lang="en") for num in numbers]
+                 translated_num_words = [self.translator(text=num_word, from_lang="en", to_lang=lang) for num_word in num_words]
+                 # TODO: Cache the results?
+                 num_words = translated_num_words
+             except:
+                 traceback.print_exc()
+
+         for num_str, num_word in zip(num_strs, num_words):
+             text = text.replace(num_str, ' ' + num_word + ' ', 1)
+         return text.replace('  ', ' ')
+
+     def convert_dates_to_words(self, text, lang):
+         date_strs = get_all_dates_from_string(text)
+         if not date_strs:
+             return text
+         for date_str in date_strs:
+             normalized_str = self.normalizer.normalize(date_str, verbose=False, punct_post_process=True)
+             if lang in ['brx', 'en']:  # No translation needed
+                 translated_str = normalized_str
+             else:
+                 translated_str = self.translator(text=normalized_str, from_lang="en", to_lang=lang)
+             text = text.replace(date_str, translated_str)
+         return text
+
+     def expand_phones(self, item):
+         return ' '.join(list(item))
+
+     def find_valid(self, regex_str, text):
+         items = re.findall(regex_str, text)
+         return_items = []
+         for item in items:
+             if isinstance(item, tuple):
+                 for subitem in item:
+                     if len(subitem) > 0:
+                         return_items.append(subitem)
+                         break  # choose the first valid sub-item
+             elif len(item) > 0:
+                 return_items.append(item)
+         return return_items
+
+     def convert_symbols_to_words(self, text, lang):
+         symbols = self.symbols2lang2word.keys()
+         emails = self.find_valid(email_regex, text)
+         # urls = re.findall(r'(?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*', text)
+         urls = self.find_valid(url_regex, text)
+         # print('URLS', urls)
+         for item in emails + urls:
+             item_norm = item
+             for symbol in symbols:
+                 item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
+             text = text.replace(item, item_norm)
+
+         currencies = self.find_valid(currency_regex, text)
+         for item in currencies:
+             item_norm = item.replace('₹', '') + '₹'  # Pronounce the currency after the numerals
+             for symbol in symbols:
+                 item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
+             text = text.replace(item, item_norm)
+
+         phones = self.find_valid(phone_regex, text)
+         for item in phones:
+             item_norm = item.replace('-', ' ')
+             for symbol in symbols:
+                 item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
+             item_norm = self.expand_phones(item_norm)
+             text = text.replace(item, item_norm)
+
+         # Percentage
+         text = text.replace('%', self.symbols2lang2word['%'][lang])
+
+         return text
+
+     def convert_char2phone(self, char):
+         return self.alphabet2phone[char.lower()] if char.lower() in self.alphabet2phone else ''
+
+     def expand_shortforms(self, text, lang):
+         if lang != 'en':
+             # Remove the dots, else each letter is spoken like a separate sentence
+             # Example: अई. अई. टी. -> अई अई टी
+             for match in regex.finditer(indic_acronym_matcher, text):
+                 match = match.group()
+                 match_without_dot = match.replace('.', ' ')
+                 text = text.replace(match, match_without_dot)
+             return text
+
+         shortforms = get_shortforms_from_string(text)
+         for shortform in shortforms:
+             shortform = shortform.strip()
+             if shortform == 'I' or shortform == "A":
+                 # Skip valid single-letter English words
+                 continue
+             expanded = ' '.join([self.convert_char2phone(char) for char in shortform])
+             text = text.replace(shortform, expanded, 1)
+         return text
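The regex helpers at the top of this file are pure functions and can be sanity-checked in isolation (importing the module does pull in the heavy NeMo dependency, though). A short sketch with expected outputs:

    from src.utils.text import (
        get_all_numbers_from_string,
        get_all_decimals_from_string,
        get_decimal_substitution,
    )

    print(get_all_numbers_from_string("Pay ₹1,23,456 by 2023"))  # ['1,23,456', '2023']
    print(get_all_decimals_from_string("pi is 3.14"))            # ['3.14']
    print(get_decimal_substitution("3.14"))                      # '3 point 1 4'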
src/utils/translator.py ADDED
@@ -0,0 +1,27 @@
+ class GoogleTranslator:
+     def __init__(self):
+         from translators.server import google, _google
+         self._translate = google
+
+         google("Testing...")  # Warm-up call; also ensures the language map is populated
+         self.supported_languages = set(_google.language_map['en'])
+         self.custom_lang_map = {
+             "mni": "mni-Mtei",
+             "raj": "hi",
+         }
+
+     def translate(self, text, from_lang, to_lang):
+         if from_lang in self.custom_lang_map:
+             from_lang = self.custom_lang_map[from_lang]
+         elif from_lang not in self.supported_languages:
+             return text
+
+         if to_lang in self.custom_lang_map:
+             to_lang = self.custom_lang_map[to_lang]
+         elif to_lang not in self.supported_languages:
+             return text
+
+         return self._translate(text, from_language=from_lang, to_language=to_lang)
+
+     def __call__(self, **kwargs):
+         return self.translate(**kwargs)
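A usage sketch; note this performs a live web request through the `translators` package, so it needs network access:

    from src.utils.translator import GoogleTranslator

    translator = GoogleTranslator()
    print(translator(text="twenty three", from_lang="en", to_lang="hi"))
    # Unsupported language codes fall through unchanged:
    print(translator(text="hello", from_lang="en", to_lang="xx"))  # -> 'hello'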