Upload 14 files
- src/inference.py +233 -0
- src/models/__init__.py +0 -0
- src/models/common.py +11 -0
- src/models/request.py +41 -0
- src/models/response.py +26 -0
- src/postprocessor/__init__.py +3 -0
- src/postprocessor/denoiser.py +24 -0
- src/postprocessor/postprocessor.py +46 -0
- src/postprocessor/vad.py +87 -0
- src/utils/alphabet2phone.json +1 -0
- src/utils/paragraph_handler.py +43 -0
- src/utils/symbols.json +155 -0
- src/utils/text.py +212 -0
- src/utils/translator.py +27 -0
src/inference.py
ADDED
@@ -0,0 +1,233 @@
import io
import re
import base64
import numpy as np
import traceback
from typing import Tuple

from TTS.utils.synthesizer import Synthesizer
from aksharamukha.transliterate import process as aksharamukha_xlit
from scipy.io.wavfile import write as scipy_wav_write

import nltk
import pysbd

from .models.common import Language
from .models.request import TTSRequest
from .models.response import AudioFile, AudioConfig, TTSResponse, TTSFailureResponse
from .utils.text import TextNormalizer
from .utils.paragraph_handler import ParagraphHandler
from src.postprocessor import PostProcessor

class TextToSpeechEngine:
    def __init__(
        self,
        models: dict,
        allow_transliteration: bool = True,
        enable_denoiser: bool = True,
    ):
        self.models = models
        # TODO: Ability to instantiate models by accepting standard paths or auto-downloading

        code_mixed_found = False
        if allow_transliteration:
            # Initialize Indic-Xlit models for the languages corresponding to the TTS models
            from ai4bharat.transliteration import XlitEngine
            xlit_langs = set()

            for lang in list(models):
                if lang == 'en':
                    continue  # No Indic-transliteration needed for English

                if '+' in lang:
                    # For a code-mixed model like Hinglish, we need the Hindi Xlit model for non-English words
                    # TODO: Make it mandatory irrespective of the `allow_transliteration` boolean
                    lang = lang.split('+')[1]
                    code_mixed_found = True
                xlit_langs.add(lang)

            self.xlit_engine = XlitEngine(xlit_langs, beam_width=6)
        else:
            self.xlit_engine = None

        self.text_normalizer = TextNormalizer()
        self.paragraph_handler = ParagraphHandler()
        self.sent_seg = pysbd.Segmenter(language="en", clean=True)

        self.orig_sr = 22050  # model.output_sample_rate
        self.enable_denoiser = enable_denoiser
        if enable_denoiser:
            from src.postprocessor import Denoiser
            self.target_sr = 16000
            self.denoiser = Denoiser(self.orig_sr, self.target_sr)
        else:
            self.target_sr = self.orig_sr

        self.post_processor = PostProcessor(self.target_sr)

        if code_mixed_found:
            # Dictionaries of English words
            import enchant
            from enchant.tokenize import get_tokenizer

            self.enchant_dicts = {
                "en_US": enchant.Dict("en_US"),
                "en_GB": enchant.Dict("en_GB"),
            }
            self.enchant_tokenizer = get_tokenizer("en")

    def concatenate_chunks(self, wav: np.ndarray, wav_chunk: np.ndarray):
        # TODO: Move to utils
        if not isinstance(wav_chunk, np.ndarray):
            wav_chunk = np.array(wav_chunk)
        if wav is None:
            return wav_chunk
        return np.concatenate([wav, wav_chunk])

    def infer_from_request(
        self,
        request: TTSRequest,
        transliterate_roman_to_native: bool = True
    ) -> TTSResponse:

        config = request.config
        lang = config.language.sourceLanguage
        gender = config.gender

        # If there's no separate English model, use the Hinglish one
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"

        if lang not in self.models:
            return TTSFailureResponse(status_text="Unsupported language!")

        if lang == "brx" and gender == "male":
            return TTSFailureResponse(status_text="Sorry, `male` speaker not supported for this language!")

        output_list = []

        for sentence in request.input:
            raw_audio = self.infer_from_text(sentence.source, lang, gender, transliterate_roman_to_native=transliterate_roman_to_native)
            # Convert PCM to WAV
            byte_io = io.BytesIO()
            scipy_wav_write(byte_io, self.target_sr, raw_audio)
            # Encode the WAV file-object as base64 for transmission via JSON
            # (use getvalue() instead of read(), since the stream position is at EOF after writing)
            encoded_bytes = base64.b64encode(byte_io.getvalue())
            encoded_string = encoded_bytes.decode()
            speech_response = AudioFile(audioContent=encoded_string)

            output_list.append(speech_response)

        audio_config = AudioConfig(language=Language(sourceLanguage=lang))
        return TTSResponse(audio=output_list, config=audio_config)

    def infer_from_text(
        self,
        input_text: str,
        lang: str,
        speaker_name: str,
        transliterate_roman_to_native: bool = True
    ) -> np.ndarray:

        # If there's no separate English model, use the Hinglish one
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"

        input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)

        wav = None
        paragraphs = self.paragraph_handler.split_text(input_text)

        for paragraph in paragraphs:
            paragraph = self.handle_transliteration(paragraph, primary_lang, transliterate_roman_to_native)
            # Drop empty sentences and sentences made up only of punctuation
            sents = []
            for sent in self.sent_seg.segment(paragraph):
                if sent.strip() and not re.match(r'^[_\W]+$', sent.strip()):
                    sents.append(sent.strip())
            paragraph = " ".join(sents)

            # Run inference. TODO: Support batch inference
            wav_chunk = self.models[lang].tts(paragraph, speaker_name=speaker_name, style_wav="")

            wav_chunk = self.postprocess_audio(wav_chunk, primary_lang, speaker_name)
            # Concatenate the current chunk with the previous audio outputs
            wav = self.concatenate_chunks(wav, wav_chunk)
        return wav

    def parse_langs_normalise_text(self, input_text: str, lang: str) -> Tuple[str, str, str]:
        # If there's no separate English model, use the Hinglish one if present
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"

        if lang == "en+hi":  # Hinglish (English+Hindi code-mixed)
            primary_lang, secondary_lang = lang.split('+')
        else:
            primary_lang = lang
            secondary_lang = None

        input_text = self.text_normalizer.normalize_text(input_text, primary_lang)
        if secondary_lang:
            # TODO: Write a proper `transliterate_native_words_using_eng_dictionary`
            input_text = self.transliterate_native_words_using_spell_checker(input_text, secondary_lang)

        return input_text, primary_lang, secondary_lang

    def handle_transliteration(self, input_text: str, primary_lang: str, transliterate_roman_to_native: bool) -> str:
        if transliterate_roman_to_native and primary_lang != 'en':
            input_text = self.transliterate_sentence(input_text, primary_lang)

            # Manipuri was trained using the Central govt's Bangla script,
            # so convert words in the native state-govt script (Meetei Mayek) to Eastern-Nagari
            if primary_lang == "mni":
                # TODO: Delete explicit-schwa
                input_text = aksharamukha_xlit("MeeteiMayek", "Bengali", input_text)
        return input_text

    def preprocess_text(
        self,
        input_text: str,
        lang: str,
        # speaker_name: str,
        transliterate_roman_to_native: bool = True
    ) -> str:

        input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)
        input_text = self.handle_transliteration(input_text, primary_lang, transliterate_roman_to_native)
        return input_text

    def postprocess_audio(self, wav_chunk, primary_lang, speaker_name):
        if self.enable_denoiser:
            wav_chunk = self.denoiser.denoise(wav_chunk)
        wav_chunk = self.post_processor.process(wav_chunk, primary_lang, speaker_name)
        return wav_chunk

    def transliterate_native_words_using_spell_checker(self, input_text, lang):
        tokens = [result[0] for result in self.enchant_tokenizer(input_text)]
        pos_tags = [result[1] for result in nltk.tag.pos_tag(tokens)]

        # Transliterate non-English Roman words into the Indic script
        for word, pos_tag in zip(tokens, pos_tags):
            if pos_tag == "NNP" or pos_tag == "NNPS":
                # Enchant has many proper nouns in its dictionary as well, for unclear reasons.
                # So if it's a proper noun, always nativize.
                # FIXME: NLTK's `averaged_perceptron_tagger` is not 100% accurate and has false positives
                pass
            elif self.enchant_dicts["en_US"].check(word) or self.enchant_dicts["en_GB"].check(word):
                # TODO: Merge the British and American dicts into one somehow
                continue

            # Convert "Ram's" -> "Ram". TODO: Think through the failure cases
            word = word.split("'")[0]

            transliterated_word = self.transliterate_sentence(word, lang)
            input_text = input_text.replace(word, transliterated_word, 1)
        return input_text

    def transliterate_sentence(self, input_text, lang):
        if not self.xlit_engine:
            return input_text

        if lang == "raj":
            lang = "hi"  # Approximation

        return self.xlit_engine.translit_sentence(input_text, lang)
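For context, a minimal usage sketch of the engine above. The checkpoint paths are hypothetical placeholders; any object exposing the Coqui-TTS `Synthesizer.tts()` interface can serve as a model.

    # Sketch: wiring a Coqui-TTS Synthesizer into TextToSpeechEngine.
    # All checkpoint paths below are hypothetical placeholders.
    from TTS.utils.synthesizer import Synthesizer
    from src.inference import TextToSpeechEngine

    models = {
        "hi": Synthesizer(
            tts_checkpoint="checkpoints/hi/fastpitch/best_model.pth",
            tts_config_path="checkpoints/hi/fastpitch/config.json",
            vocoder_checkpoint="checkpoints/hi/hifigan/best_model.pth",
            vocoder_config="checkpoints/hi/hifigan/config.json",
            use_cuda=True,
        ),
    }

    engine = TextToSpeechEngine(models, allow_transliteration=True, enable_denoiser=True)
    wav = engine.infer_from_text("भारत मेरा देश है", lang="hi", speaker_name="female")
    print(wav.shape)  # 1-D float array at engine.target_sr (16 kHz when the denoiser is enabled)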
src/models/__init__.py
ADDED
File without changes
src/models/common.py
ADDED
@@ -0,0 +1,11 @@
from pydantic import BaseModel, validator


class Language(BaseModel):
    sourceLanguage: str

    # @validator('sourceLanguage', pre=True)
    # def blank_string_in_language(cls, value, field):
    #     if value == "":
    #         raise ValueError('sourceLanguage cannot be empty')
    #     return value
src/models/request.py
ADDED
@@ -0,0 +1,41 @@
from typing import List

from pydantic import BaseModel, validator

from .common import Language

SUPPORTED_GENDERS = {'male', 'female'}


class Sentence(BaseModel):
    source: str

    # @validator('source', pre=True)
    # def blank_string_in_source(cls, value, field):
    #     if value == "":
    #         raise ValueError('source cannot be empty')
    #     return value


class TTSConfig(BaseModel):
    language: Language
    gender: str

    # @validator('gender', pre=True)
    # def blank_string_in_gender(cls, value, field):
    #     if value == "":
    #         raise ValueError('gender cannot be empty')
    #     if value not in SUPPORTED_GENDERS:
    #         raise ValueError('Unsupported gender value')
    #     return value


class TTSRequest(BaseModel):
    input: List[Sentence]
    config: TTSConfig

    # @validator('input', pre=True)
    # def input_cannot_be_empty(cls, value, field):
    #     if len(value) < 1:
    #         raise ValueError('input cannot be empty')
    #     return value
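As a sketch of the expected payload shape (inferred from the models above), a request can be built directly from a dict, and pydantic validates the nested structure:

    from src.models.request import TTSRequest

    payload = {
        "input": [{"source": "भारत मेरा देश है।"}],
        "config": {
            "language": {"sourceLanguage": "hi"},
            "gender": "female",
        },
    }
    request = TTSRequest(**payload)  # pydantic validates and builds the nested models
    print(request.config.language.sourceLanguage)  # "hi"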
src/models/response.py
ADDED
@@ -0,0 +1,26 @@
from typing import List

from pydantic import BaseModel

from .common import Language


class AudioFile(BaseModel):
    audioContent: str


class AudioConfig(BaseModel):
    language: Language
    audioFormat: str = 'wav'
    encoding: str = 'base64'
    samplingRate: int = 22050


class TTSResponse(BaseModel):
    audio: List[AudioFile]
    config: AudioConfig


class TTSFailureResponse(BaseModel):
    status: str = 'ERROR'
    status_text: str
src/postprocessor/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .postprocessor import PostProcessor
from .denoiser import Denoiser
from .vad import VoiceActivityDetection
src/postprocessor/denoiser.py
ADDED
@@ -0,0 +1,24 @@
import torch
import librosa
import numpy as np

class Denoiser:

    def __init__(self, orig_sr: int, target_sr: int):
        self.orig_sr = orig_sr
        self.target_sr = target_sr
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        from asteroid.models import BaseModel as AsteroidBaseModel
        self.model = AsteroidBaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k").to(self.device)

    def denoise(self, wav):
        if not isinstance(wav, np.ndarray):
            wav = np.array(wav)

        if len(wav.shape) > 1:
            wav = np.mean(wav, axis=1)  # Downmix multi-channel audio to mono
        wav = librosa.resample(wav, orig_sr=self.orig_sr, target_sr=self.target_sr)
        wav = torch.Tensor(wav.reshape(1, 1, wav.shape[0])).float().to(self.device)
        wav = self.model.separate(wav)[0][0]  # (batch, channels, time) -> (time)
        return wav.cpu().detach().numpy()
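A quick sanity-check sketch for the denoiser. The first call downloads the pretrained Asteroid model, and the input here is synthetic noise rather than real speech:

    import numpy as np
    from src.postprocessor import Denoiser

    denoiser = Denoiser(orig_sr=22050, target_sr=16000)
    noisy = np.random.randn(22050).astype(np.float32)  # 1 second of synthetic noise at 22.05 kHz
    clean = denoiser.denoise(noisy)
    print(clean.shape)  # roughly 16000 samples: mono, resampled to the target rate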
src/postprocessor/postprocessor.py
ADDED
@@ -0,0 +1,46 @@
import os
import ffmpeg
import librosa
import numpy as np
import soundfile as sf
import tempfile

from .vad import VoiceActivityDetection


class PostProcessor:

    def __init__(self, target_sr: int):
        self.target_sr = target_sr
        self.vad = VoiceActivityDetection()

    def set_tempo(self, wav: np.ndarray, atempo: str = '1'):
        with tempfile.TemporaryDirectory() as tmpdirname:
            inpath = os.path.join(tmpdirname, 'input.wav')
            outpath = os.path.join(tmpdirname, 'output.wav')
            sf.write(inpath, wav, self.target_sr)
            in_stream = ffmpeg.input(inpath)
            audio_stream = ffmpeg.filter_(in_stream, 'atempo', atempo)
            audio_stream = audio_stream.output(outpath)
            ffmpeg.run(audio_stream, overwrite_output=True)
            wav, _ = librosa.load(outpath, sr=self.target_sr)
        return wav

    def trim_silence(self, wav: np.ndarray):
        return self.vad.process(wav, sc_threshold=40)

    def process(self, wav, lang: str, gender: str):
        if not isinstance(wav, np.ndarray):
            wav = np.array(wav)

        if lang == "te" and gender == 'female':  # Slow down the Telugu female speaker
            wav = self.set_tempo(wav, '0.85')
            wav = self.trim_silence(wav)
        elif lang == 'mr' and gender == 'female':  # Speed up the Marathi female speaker
            wav = self.trim_silence(wav)
            wav = self.set_tempo(wav, '1.15')
        elif lang == 'gu':  # Speed up Gujarati speakers
            # wav = self.trim_silence(wav)
            wav = self.set_tempo(wav, '1.20')

        return wav
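`set_tempo` round-trips the audio through ffmpeg's `atempo` filter, which changes speed without altering pitch (roughly equivalent to `ffmpeg -i input.wav -af atempo=1.20 output.wav` on the CLI). A small usage sketch, assuming the `ffmpeg` binary is on the PATH:

    import numpy as np
    from src.postprocessor import PostProcessor

    post = PostProcessor(target_sr=16000)
    wav = np.random.randn(16000).astype(np.float32)  # placeholder audio, 1 second
    out = post.process(wav, lang='gu', gender='male')  # Gujarati branch: sped up to 1.20x
    print(len(wav), len(out))  # the output should be roughly 1/1.2 of the input length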
src/postprocessor/vad.py
ADDED
@@ -0,0 +1,87 @@
#! /usr/bin/env python
# encoding: utf-8
'''
MIT License

Copyright (c) 2018 Mauricio

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Adapted from https://github.com/mauriciovander/silence-removal/blob/master/vad.py
'''
import numpy

class VoiceActivityDetection:

    def __init__(self):
        self.__step = 160
        self.__buffer_size = 160
        self.__buffer = numpy.array([], dtype=numpy.int16)
        self.__out_buffer = numpy.array([], dtype=numpy.int16)
        self.__n = 0
        self.__VADthd = 0.
        self.__VADn = 0.
        self.__silence_counter = 0

    # Voice Activity Detection with an adaptive threshold
    def vad(self, _frame, sc_threshold=20):
        frame = numpy.array(_frame) ** 2.
        result = True
        threshold = 0.2
        thd = numpy.min(frame) + numpy.ptp(frame) * threshold
        self.__VADthd = (self.__VADn * self.__VADthd + thd) / float(self.__VADn + 1.)
        self.__VADn += 1.

        if numpy.mean(frame) <= self.__VADthd:
            self.__silence_counter += 1
        else:
            self.__silence_counter = 0
        if self.__silence_counter > sc_threshold:
            result = False
        return result

    # Push new audio samples into the buffer
    def add_samples(self, data):
        self.__buffer = numpy.append(self.__buffer, data)
        result = len(self.__buffer) >= self.__buffer_size
        # print('__buffer size %i' % self.__buffer.size)
        return result

    # Pull a portion of the buffer to process
    # (pulled samples are deleted after being processed)
    def get_frame(self):
        window = self.__buffer[:self.__buffer_size]
        self.__buffer = self.__buffer[self.__step:]
        # print('__buffer size %i' % self.__buffer.size)
        return window

    # Add new audio samples to the internal buffer and process them
    def process(self, data, sc_threshold):
        self.__buffer = numpy.array([], dtype=numpy.int16)
        self.__out_buffer = numpy.array([], dtype=numpy.int16)
        if self.add_samples(data):
            while len(self.__buffer) >= self.__buffer_size:
                # Framing
                window = self.get_frame()
                if self.vad(window, sc_threshold):  # speech frame
                    self.__out_buffer = numpy.append(self.__out_buffer, window)
        return self.__out_buffer
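A self-contained sketch of how the trimmer behaves: a tone padded with silence comes back shorter, since frames whose energy stays below the adaptive threshold for more than `sc_threshold` consecutive frames are dropped:

    import numpy as np
    from src.postprocessor.vad import VoiceActivityDetection

    sr = 16000
    tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of a 440 Hz tone
    silence = np.zeros(sr)                                     # 1 s of silence
    wav = np.concatenate([tone, silence]).astype(np.float32)

    vad = VoiceActivityDetection()
    trimmed = vad.process(wav, sc_threshold=40)
    print(len(wav), len(trimmed))  # the trimmed output should be noticeably shorter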
src/utils/alphabet2phone.json
ADDED
@@ -0,0 +1 @@
{"a": "aey", "b": "bee", "c": "see", "d": "dee", "e": "eee", "f": "eff", "g": "jee", "h": "ech", "i": "aai", "j": "jay", "k": "kay", "l": "ell", "m": "em", "n": "en", "o": "oh", "p": "pee", "q": "kyuu", "r": "aar", "s": "es", "t": "tea", "u": "you", "v": "vee", "w": "doubleu", "x": "ex", "y": "why", "z": "zedd"}
src/utils/paragraph_handler.py
ADDED
@@ -0,0 +1,43 @@
#! /usr/bin/env python
# encoding: utf-8

import re

non_chars_regex = re.compile(r'[^\w]')

class ParagraphHandler():

    def __init__(self, max_text_len=512):
        self.L = max_text_len

    def split_text(self, text: str, delimiter='.'):
        '''Splits text at the delimiter into paragraphs of max length self.L'''
        delimiter = ' ' if delimiter not in text else delimiter
        if delimiter not in text:
            return [text]

        paragraphs = []
        l_pos, r_pos = 0, 0
        while r_pos < len(text):
            r_pos = l_pos + self.L
            if r_pos >= len(text):  # Append the last paragraph
                paragraphs.append(text[l_pos:len(text)])
                break
            # Find the nearest delimiter before r_pos to split the paragraph at
            while delimiter is not None and text[r_pos] != delimiter and r_pos > l_pos and r_pos > 0:
                r_pos -= 1
            extracted_paragraph = text[l_pos:r_pos+1]
            extracted_paragraph_without_special_chars = non_chars_regex.sub('', extracted_paragraph)
            if extracted_paragraph_without_special_chars:
                paragraphs.append(extracted_paragraph)
            l_pos = r_pos + 1  # Handle the next paragraph
        return paragraphs


if __name__ == '__main__':
    text = "The following are quotes from A.P.J. Abdul Kalam. To succeed in your mission, you must have single-minded devotion to your goal. Look at the sky. We are not alone. The whole universe is friendly to us and conspires only to give the best to those who dream and work. The youth need to be enabled to become job generators from job seekers. If four things are followed - having a great aim, acquiring knowledge, hard work, and perseverance - then anything can be achieved. Where there is righteousness in the heart, there is beauty in the character. When there is beauty in the character, there is harmony in the home. When there is harmony in the home, there is order in the nation. When there is order in the nation, there is peace in the world. Great teachers emanate out of knowledge, passion and compassion. Let me define a leader. He must have vision and passion and not be afraid of any problem. Instead, he should know how to defeat it. Most importantly, he must work with integrity."
    print('LENGTH: ', len(text))  # 988

    paragraph_handler = ParagraphHandler()
    paragraphs = paragraph_handler.split_text(text)
    for p in paragraphs:
        print(len(p), p)
src/utils/symbols.json
ADDED
@@ -0,0 +1,155 @@
{
    "₹": {
        "as": "Rupees",
        "bn": "রুপি",
        "brx": "Rupees",
        "en": "Rupees",
        "gu": "રૂપિયા",
        "hi": "रुपये",
        "kn": "ರೂಪಾಯಿ",
        "ml": "രൂപ",
        "mni": "Rupees",
        "mr": "रुपये",
        "or": "ଟଙ୍କ।",
        "pa": "ਰੁਪਏ",
        "raj": "रीप्या",
        "ta": "ரூபாய்",
        "te": "రూపాయలు"
    },
    "@": {
        "as": "আঁ ট",
        "bn": "আট",
        "brx": "at",
        "en": "at",
        "gu": "આત્",
        "hi": "आट",
        "kn": "ಅಟ್",
        "ml": "ആറ്റ്",
        "mni": "ꯑꯊ",
        "mr": "आट",
        "or": "ଆଟ",
        "pa": "ਆਤ",
        "raj": "आट",
        "ta": "ஆட்",
        "te": "ఆ ట"
    },
    ".": {
        "as": "ডোট",
        "bn": "ডোট",
        "brx": "dot",
        "en": "dot",
        "gu": "ડોટ",
        "hi": "डोट",
        "kn": "dot",
        "ml": "ഡോട്ട്",
        "mni": "ꯗꯣꯇ",
        "mr": "डॉट",
        "or": "ଡୋଟ୍",
        "pa": "ਡੋਟ",
        "raj": "डॉट",
        "ta": "டாட்",
        "te": "డాట్"
    },
    "/": {
        "as": "শ্লাচ",
        "bn": "স্লাশ",
        "brx": "स्लाश",
        "en": "slash",
        "gu": "સ્લેશ",
        "hi": "सलाश",
        "kn": "ಸ್ಲಾಶ್",
        "ml": "സ്ലാഷ്",
        "mni": "ꯁ꯭ꯂꯦꯁ",
        "mr": "सलाश",
        "or": "ସ୍ଲାଶ୍",
        "pa": "slash",
        "raj": "स्लाश",
        "ta": "ஸ்லாஷ்",
        "te": "స్లాష్"
    },
    ":": {
        "as": "কোলন",
        "bn": "কোলন",
        "brx": "कोलन",
        "en": "colon",
        "gu": "કોલન",
        "hi": "कोलन",
        "kn": "ಕೋಲನ್",
        "ml": "കോളൻ",
        "mni": "ꯀꯣꯂꯦꯟ",
        "mr": "कोलन",
        "or": "କୋଲୋନ",
        "pa": "ਕੌਲੋਨ",
        "raj": "कोलन",
        "ta": "கோலன்",
        "te": "కోలన్"
    },
    "+": {
        "as": "প্লাচ",
        "bn": "প্লাস",
        "brx": "प्लस",
        "en": "plus",
        "gu": "પ્લસ",
        "hi": "प्लस",
        "kn": "ಪ್ಲಸ್",
        "ml": "പ്ലസ്",
        "mni": "ꯄ꯭ꯂꯁ",
        "mr": "प्लॅस",
        "or": "ପ୍ଲସ୍",
        "pa": "ਪਲੱਸ",
        "raj": "प्लस",
        "ta": "பிளஸ்",
        "te": "ప్లస్"
    },
    "-": {
        "as": "ডাছ",
        "bn": "ডাছ",
        "brx": "दाश",
        "en": "dash",
        "gu": "ડાશ",
        "hi": "दाश",
        "kn": "ದಾಶು",
        "ml": "ദാശ",
        "mni": "ꯗꯥꯁ",
        "mr": "दाश",
        "or": "ଦାଶ",
        "pa": "ਮਾਈਨਉਸ",
        "raj": "दाश",
        "ta": "டாஷ்",
        "te": "డాష్"
    },
    "www": {
        "as": "ডাব্লিঅডাব্লিওডাব্লিও",
        "bn": "ডাব্লিউডাব্লিউডাব্লিউ",
        "brx": "डबलयु डबलयु डबलयु",
        "en": "doubleyou doubleyou doubleyou",
        "gu": "ડબલ્યું ડબલ્યું ડબલ્યું",
        "hi": "डबल्यू डबल्यू डबल्यू",
        "kn": "ಡುಬ್ಲ್ಯುಡುಬ್ಲ್ಯುಡುಬ್ಲ್ಯು",
        "ml": "ഡബ്ലിയൂ ഡബ്ലിയൂ ഡബ്ലിയൂ",
        "mni": "ꯗꯕꯜꯌꯨꯗꯕꯜꯌꯨꯗꯕꯜꯌꯨ",
        "mr": "डब्ल्यू डब्ल्यू डब्ल्यू",
        "or": "ଡବ୍ଲିୟୁଡବ୍ଲିୟୁଡବ୍ଲିୟୁ",
        "pa": "ਡਬਲਿਊ ਡਬਲਿਊ ਡਬਲਿਊ",
        "raj": "डब्ल्यू डब्ल्यू डब्ल्यू",
        "ta": "டபிளியூ டபிளியூ டபிளியூ",
        "te": "డబుల్యూడబుల్యూడబుల్యూ"
    },
    "%": {
        "as": "শতাংশ",
        "bn": "শতাংশ",
        "brx": "percent",
        "en": "percent",
        "gu": "ટકા",
        "hi": "प्रतिशत",
        "kn": "ಶೇಕಡಾ",
        "ml": "ശതമാനം",
        "mni": "ꯆꯥꯗꯥ ꯆꯥꯗꯥ ꯴",
        "mr": "टक्के",
        "or": "ଶତକଡା",
        "pa": "ਪ੍ਰਤੀਸ਼ਤ",
        "raj": "percent",
        "ta": "சதவீதம்",
        "te": "శాతం"
    }
}
src/utils/text.py
ADDED
@@ -0,0 +1,212 @@
import os
PWD = os.path.dirname(__file__)
import re
import regex
import json
import traceback

from nemo_text_processing.text_normalization.normalize import Normalizer
from indic_numtowords import num2words, supported_langs
from .translator import GoogleTranslator

indic_acronym_matcher = regex.compile(r"([\p{L}\p{M}]+\.\s*){2,}")

# short_form_regex = re.compile(r'\b[A-Z\.]{2,}s?\b')
# def get_shortforms_from_string(text):
#     return short_form_regex.findall(text)

short_form_regex = re.compile(r"\b([A-Z][\.\s]+)+([A-Z])?\b")
eng_consonants_regex = re.compile(r"\b[BCDFGHJKLMNPQRSTVWXZbcdfghjklmnpqrstvwxz]+\b")
def get_shortforms_from_string(text):
    dotted_shortforms = [m.group() for m in re.finditer(short_form_regex, text)]
    non_dotted_shortforms = [m.group() for m in re.finditer(eng_consonants_regex, text)]
    return dotted_shortforms + non_dotted_shortforms

decimal_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)")
def get_all_decimals_from_string(text):
    return decimal_str_regex.findall(text)

num_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)?")
def get_all_numbers_from_string(text):
    return num_str_regex.findall(text)

multiple_stops_regex = r'\.\.+'
def replace_multiple_stops(text):
    return re.sub(multiple_stops_regex, '.', text)

date_generic_match_regex = re.compile(r"(?:[^0-9]\d*[./-]\d*[./-]\d*)")
# Match dates like dd/mm/yyyy, dd-mm-yy, yyyy.mm.dd or yy/mm/dd
date_str_regex = re.compile(r"(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4})|(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})")
def get_all_dates_from_string(text):
    candidates = date_generic_match_regex.findall(text)
    candidates = [c.replace(' ', '') for c in candidates]
    candidates = [c for c in candidates if len(c) <= 10]  # Prune invalid dates
    candidates = ' '.join(candidates)
    return date_str_regex.findall(candidates)

def get_decimal_substitution(decimal):
    decimal_parts = decimal.split('.')
    l_part = decimal_parts[0]
    r_part = ""
    for part in decimal_parts[1:]:
        r_part += ' '.join(list(part))  # Space between every digit after the decimal point
    decimal_sub = l_part + " point " + r_part
    decimal_sub = decimal_sub.strip()
    return decimal_sub

email_regex = r'[\w.+-]+@[\w-]+\.[\w.-]+'
url_regex = r'((?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*)|(\w*\.com/?[\w\.\?=#]*)'
currency_regex = r"\₹\ ?[+-]?[0-9]{1,3}(?:,?[0-9])*(?:\.[0-9]{1,2})?"
phone_regex = r'\+?\d[ \d-]{6,12}\d'


class TextNormalizer:
    def __init__(self):
        self.translator = GoogleTranslator()
        self.normalizer = Normalizer(input_case='cased', lang='en')
        self.symbols2lang2word = json.load(open(os.path.join(PWD, "symbols.json"), encoding="utf-8"))
        self.alphabet2phone = json.load(open(os.path.join(PWD, "alphabet2phone.json"), encoding="utf-8"))

    def normalize_text(self, text, lang):
        text = text.replace("।", ".").replace("|", ".").replace("꯫", ".").strip()
        text = self.expand_shortforms(text, lang)
        text = self.normalize_decimals(text, lang)
        text = self.replace_punctuations(text, lang)
        text = self.convert_dates_to_words(text, lang)
        text = self.convert_symbols_to_words(text, lang)
        text = self.convert_numbers_to_words(text, lang)
        return text

    def normalize_decimals(self, text, lang):
        decimal_strs = get_all_decimals_from_string(text)
        if not decimal_strs:
            return text
        decimals = [str(decimal_str.replace(',', '')) for decimal_str in decimal_strs]
        decimal_substitutions = [get_decimal_substitution(decimal) for decimal in decimals]
        for decimal_str, decimal_sub in zip(decimal_strs, decimal_substitutions):
            text = text.replace(decimal_str, decimal_sub)
        return text

    def replace_punctuations(self, text, lang):
        text = replace_multiple_stops(text)
        if lang not in ['brx', 'or']:
            text = text.replace('।', '.')
            if text[-1] not in ['.', '!', '?', ',', ':', ';']:
                text = text + ' .'
        else:
            text = text.replace('.', '।')
            text = text.replace('|', '.')
        for bracket in ['(', ')', '{', '}', '[', ']']:
            text = text.replace(bracket, ',')
        # text = text.replace(':', ',').replace(';', ',')
        text = text.replace(';', ',')
        return text

    def convert_numbers_to_words(self, text, lang):
        num_strs = get_all_numbers_from_string(text)
        if not num_strs:
            return text

        # TODO: If it is a large integer without commas (say >5 digits), spell it out numeral by numeral
        # NOTE: Partially handled by the phone-number logic
        numbers = [int(num_str.replace(',', '')) for num_str in num_strs]

        if lang in supported_langs:
            num_words = [num2words(num, lang=lang) for num in numbers]
        else:  # Fallback: convert to Indian-English, followed by NMT
            try:
                num_words = [num2words(num, lang="en") for num in numbers]
                translated_num_words = [self.translator(text=num_word, from_lang="en", to_lang=lang) for num_word in num_words]
                # TODO: Cache the results?
                num_words = translated_num_words
            except:
                traceback.print_exc()

        for num_str, num_word in zip(num_strs, num_words):
            text = text.replace(num_str, ' ' + num_word + ' ', 1)
        return text.replace("  ", ' ')  # Collapse the double spaces introduced above

    def convert_dates_to_words(self, text, lang):
        date_strs = get_all_dates_from_string(text)
        if not date_strs:
            return text
        for date_str in date_strs:
            normalized_str = self.normalizer.normalize(date_str, verbose=False, punct_post_process=True)
            if lang in ['brx', 'en']:  # No translation
                translated_str = normalized_str
            else:
                translated_str = self.translator(text=normalized_str, from_lang="en", to_lang=lang)
            text = text.replace(date_str, translated_str)
        return text

    def expand_phones(self, item):
        return ' '.join(list(item))

    def find_valid(self, regex_str, text):
        items = re.findall(regex_str, text)
        return_items = []
        for item in items:
            if isinstance(item, tuple):
                for subitem in item:
                    if len(subitem) > 0:
                        return_items.append(subitem)
                        break  # Choose the first valid sub-item
            elif len(item) > 0:
                return_items.append(item)
        return return_items

    def convert_symbols_to_words(self, text, lang):
        symbols = self.symbols2lang2word.keys()
        emails = self.find_valid(email_regex, text)
        # urls = re.findall(r'(?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*', text)
        urls = self.find_valid(url_regex, text)
        # print('URLS', urls)
        for item in emails + urls:
            item_norm = item
            for symbol in symbols:
                item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
            text = text.replace(item, item_norm)

        currencies = self.find_valid(currency_regex, text)
        for item in currencies:
            item_norm = item.replace('₹', '') + '₹'  # Pronounce the currency after the numerals
            for symbol in symbols:
                item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
            text = text.replace(item, item_norm)

        phones = self.find_valid(phone_regex, text)
        for item in phones:
            item_norm = item.replace('-', ' ')
            for symbol in symbols:
                item_norm = item_norm.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
            item_norm = self.expand_phones(item_norm)
            text = text.replace(item, item_norm)

        # Percentage
        text = text.replace('%', self.symbols2lang2word['%'][lang])

        return text

    def convert_char2phone(self, char):
        return self.alphabet2phone[char.lower()] if char.lower() in self.alphabet2phone else ''

    def expand_shortforms(self, text, lang):
        if lang != 'en':
            # Remove the dots, else each letter is spoken as if it were a separate sentence
            # Example: अई. अई. टी. -> अई अई टी
            for match in regex.finditer(indic_acronym_matcher, text):
                match = match.group()
                match_without_dot = match.replace('.', ' ')
                text = text.replace(match, match_without_dot)
            return text

        shortforms = get_shortforms_from_string(text)
        for shortform in shortforms:
            shortform = shortform.strip()
            if shortform == 'I' or shortform == "A":
                # Skip valid English words
                continue
            expanded = ' '.join([self.convert_char2phone(char) for char in shortform])
            text = text.replace(shortform, expanded, 1)
        return text
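A few self-contained examples of what the helpers above extract and produce (outputs traced by hand against the regexes, so treat them as illustrative):

    from src.utils.text import (
        get_all_numbers_from_string,
        get_decimal_substitution,
        get_all_dates_from_string,
    )

    print(get_all_numbers_from_string("Paid ₹1,23,456 for 2 tickets"))
    # ['1,23,456', '2']
    print(get_decimal_substitution("3.14"))
    # '3 point 1 4'
    print(get_all_dates_from_string("Due on 15/08/2022 at noon"))
    # ['15/08/2022']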
src/utils/translator.py
ADDED
@@ -0,0 +1,27 @@
class GoogleTranslator:
    def __init__(self):
        from translators.server import google, _google
        self._translate = google

        google("Testing...")  # Dummy call, presumably so the library initializes its language map
        self.supported_languages = set(_google.language_map['en'])
        self.custom_lang_map = {
            "mni": "mni-Mtei",
            "raj": "hi",  # Approximate Rajasthani with Hindi
        }

    def translate(self, text, from_lang, to_lang):
        if from_lang in self.custom_lang_map:
            from_lang = self.custom_lang_map[from_lang]
        elif from_lang not in self.supported_languages:
            return text

        if to_lang in self.custom_lang_map:
            to_lang = self.custom_lang_map[to_lang]
        elif to_lang not in self.supported_languages:
            return text

        return self._translate(text, from_language=from_lang, to_language=to_lang)

    def __call__(self, **kwargs):
        return self.translate(**kwargs)
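A short sketch of the fallback behaviour (live calls require network access to Google Translate): unsupported codes pass through unchanged, and the custom map reroutes, for example, `raj` to Hindi:

    from src.utils.translator import GoogleTranslator

    translator = GoogleTranslator()
    print(translator(text="two hundred", from_lang="en", to_lang="hi"))
    # the Hindi translation, e.g. "दो सौ"
    print(translator(text="two hundred", from_lang="en", to_lang="brx"))
    # "two hundred" -- returned unchanged, since Bodo is not in Google's language map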