import io
import re
import base64
import numpy as np
import traceback
from typing import Optional, Tuple

from TTS.utils.synthesizer import Synthesizer
from aksharamukha.transliterate import process as aksharamukha_xlit
from scipy.io.wavfile import write as scipy_wav_write

import nltk
import pysbd

from .models.common import Language
from .models.request import TTSRequest
from .models.response import AudioFile, AudioConfig, TTSResponse, TTSFailureResponse
from .utils.text import TextNormalizer
from .utils.paragraph_handler import ParagraphHandler
from src.postprocessor import PostProcessor

class TextToSpeechEngine:
    def __init__(
        self,
        models: dict,
        allow_transliteration: bool = True,
        enable_denoiser: bool = True,
    ):
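        """
        Args:
            models: mapping from language code (e.g. "hi", "en+hi") to a loaded TTS model.
            allow_transliteration: if True, load Indic-Xlit models for Roman-to-native transliteration.
            enable_denoiser: if True, denoise and resample the synthesized audio.
        """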
        self.models = models
        # TODO: Ability to instantiate models by accepting standard paths or auto-downloading
        
        code_mixed_found = False
        if allow_transliteration:
            # Initialize Indic-Xlit models for the languages corresponding to TTS models
            from ai4bharat.transliteration import XlitEngine
            xlit_langs = set()
            
            for lang in list(models):
                if lang == 'en':
                    continue  # No Indic transliteration needed for English
                
                if '+' in lang:
                    # If it's a code-mixed model like Hinglish, we need Hindi Xlit for non-English words
                    # TODO: Make it mandatory irrespective of `allow_transliteration` boolean
                    lang = lang.split('+')[1]
                    code_mixed_found = True
                xlit_langs.add(lang)
            
            self.xlit_engine = XlitEngine(xlit_langs, beam_width=6)
        else:
            self.xlit_engine = None

        self.text_normalizer = TextNormalizer()
        self.paragraph_handler = ParagraphHandler()
        self.sent_seg = pysbd.Segmenter(language="en", clean=True)

        self.orig_sr = 22050 # model.output_sample_rate
        self.enable_denoiser = enable_denoiser
        if enable_denoiser:
            from src.postprocessor import Denoiser
            self.target_sr = 16000
            self.denoiser = Denoiser(self.orig_sr, self.target_sr)
        else:
            self.target_sr = self.orig_sr
        
        self.post_processor = PostProcessor(self.target_sr)

        if code_mixed_found:
            # Dictionary of English words
            import enchant
            from enchant.tokenize import get_tokenizer

            self.enchant_dicts = {
                "en_US": enchant.Dict("en_US"),
                "en_GB": enchant.Dict("en_GB"),
            }
            self.enchant_tokenizer = get_tokenizer("en")

    def concatenate_chunks(self, wav: np.ndarray, wav_chunk: np.ndarray):
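        """Append `wav_chunk` to the running waveform `wav` (None on the first chunk)."""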
        # TODO: Move to utils
        if not isinstance(wav_chunk, np.ndarray):
            wav_chunk = np.array(wav_chunk)
        if wav is None:
            return wav_chunk
        return np.concatenate([wav, wav_chunk])

    def infer_from_request(
        self,
        request: TTSRequest,
        transliterate_roman_to_native: bool = True
    ) -> TTSResponse:
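        """Synthesize each input sentence and return base64-encoded WAV audio for each."""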

        config = request.config
        lang = config.language.sourceLanguage
        gender = config.gender

        # If there's no separate English model, use the Hinglish one
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"

        if lang not in self.models:
            return TTSFailureResponse(status_text="Unsupported language!")
        
        if lang == "brx" and gender == "male":
            return TTSFailureResponse(status_text="Sorry, `male` speaker not supported for this language!")
        
        output_list = []

        for sentence in request.input:
            raw_audio = self.infer_from_text(sentence.source, lang, gender, transliterate_roman_to_native=transliterate_roman_to_native)
            # Convert PCM to WAV
            byte_io = io.BytesIO()
            scipy_wav_write(byte_io, self.target_sr, raw_audio)
            # Encode the WAV file object as base64 for transmission via JSON.
            # Use getvalue() rather than read(): after writing, the buffer's
            # cursor is at EOF, so read() would return empty bytes.
            encoded_bytes = base64.b64encode(byte_io.getvalue())
            encoded_string = encoded_bytes.decode()
            speech_response = AudioFile(audioContent=encoded_string)
            
            output_list.append(speech_response)

        audio_config = AudioConfig(language=Language(sourceLanguage=lang))
        return TTSResponse(audio=output_list, config=audio_config)
    
    def infer_from_text(
        self,
        input_text: str,
        lang: str,
        speaker_name: str,
        transliterate_roman_to_native: bool = True
    ) -> np.ndarray:
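        """Synthesize `input_text` into a single waveform, processing paragraph by paragraph."""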
        
        # If there's no separate English model, use the Hinglish one
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"
        
        input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)

        wav = None
        paragraphs = self.paragraph_handler.split_text(input_text)

        for paragraph in paragraphs:
            paragraph = self.handle_transliteration(paragraph, primary_lang, transliterate_roman_to_native)
            # Drop empty and punctuation-only segments before synthesis
            sentences = []
            for sent in self.sent_seg.segment(paragraph):
                if sent.strip() and not re.match(r'^[_\W]+$', sent.strip()):
                    sentences.append(sent.strip())
            paragraph = " ".join(sentences)
            
            # Run Inference. TODO: Support for batch inference
            wav_chunk = self.models[lang].tts(paragraph, speaker_name=speaker_name, style_wav="")

            wav_chunk = self.postprocess_audio(wav_chunk, primary_lang, speaker_name)
            # Concatenate current chunk with previous audio outputs
            wav = self.concatenate_chunks(wav, wav_chunk)
        return wav
    
    def parse_langs_normalise_text(self, input_text: str, lang: str) -> Tuple[str, str, Optional[str]]:
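        """Resolve code-mixed language tags and normalize the input text.

        Returns (normalized_text, primary_lang, secondary_lang), where
        secondary_lang is None unless the model is code-mixed (e.g. "en+hi").
        """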
        # If there's no separate English model, use the Hinglish one if present
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            lang = "en+hi"

        if lang == "en+hi": # Hinglish (English+Hindi code-mixed)
            primary_lang, secondary_lang = lang.split('+')
        else:
            primary_lang = lang
            secondary_lang = None

        input_text = self.text_normalizer.normalize_text(input_text, primary_lang)
        if secondary_lang:
            # TODO: Write a proper `transliterate_native_words_using_eng_dictionary`
            input_text = self.transliterate_native_words_using_spell_checker(input_text, secondary_lang)

        return input_text, primary_lang, secondary_lang
    
    def handle_transliteration(self, input_text: str, primary_lang: str, transliterate_roman_to_native: bool) -> str:
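        """Transliterate Romanized input to the native script of `primary_lang`, if requested."""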
        if transliterate_roman_to_native and primary_lang != 'en':
            input_text = self.transliterate_sentence(input_text, primary_lang)

            # The Manipuri model was trained on the Central govt.'s Bangla script,
            # so convert words written in the state govt.'s native Meetei-Mayek
            # script to Eastern-Nagari
            if primary_lang == "mni":
                # TODO: Delete explicit-schwa
                input_text = aksharamukha_xlit("MeeteiMayek", "Bengali", input_text)
        return input_text
        
    def preprocess_text(
        self,
        input_text: str,
        lang: str,
        transliterate_roman_to_native: bool = True
    ) -> str:
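        """Run the text pipeline (normalization + transliteration) without synthesis."""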

        input_text, primary_lang, secondary_lang = self.parse_langs_normalise_text(input_text, lang)
        input_text = self.handle_transliteration(input_text, primary_lang, transliterate_roman_to_native)
        return input_text

    def postprocess_audio(self, wav_chunk, primary_lang, speaker_name):
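        """Optionally denoise, then apply language- and speaker-specific post-processing."""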
        if self.enable_denoiser:
            wav_chunk = self.denoiser.denoise(wav_chunk)
        wav_chunk = self.post_processor.process(wav_chunk, primary_lang, speaker_name)
        return wav_chunk

    def transliterate_native_words_using_spell_checker(self, input_text, lang):
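        """Nativize Roman-script words that are not found in the English dictionaries."""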
        tokens = [result[0] for result in self.enchant_tokenizer(input_text)]
        pos_tags = [result[1] for result in nltk.tag.pos_tag(tokens)]

        # Transliterate non-English Roman words to Indic
        for word, pos_tag in zip(tokens, pos_tags):
            if pos_tag == "NNP" or pos_tag == "NNPS":
                # Enchant has many proper-nouns as well in its dictionary, don't know why.
                # So if it's a proper-noun, always nativize
                # FIXME: But NLTK's `averaged_perceptron_tagger` does not seem to be 100% accurate, it has false positives 🤦‍♂️ 
                pass
            elif self.enchant_dicts["en_US"].check(word) or self.enchant_dicts["en_GB"].check(word):
                # TODO: Merge British and American dicts into 1 somehow
                continue
            
            # Convert "Ram's" -> "Ram". TODO: Think what are the failure cases
            word = word.split("'")[0]

            transliterated_word = self.transliterate_sentence(word, lang)
            input_text = input_text.replace(word, transliterated_word, 1)
        return input_text

    def transliterate_sentence(self, input_text, lang):
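        """Transliterate a Roman-script sentence into the native script of `lang`."""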
        if not self.xlit_engine:
            return input_text

        if lang == "raj":
            lang = "hi" # Approximate
        
        return self.xlit_engine.translit_sentence(input_text, lang)
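
if __name__ == "__main__":
    # Usage sketch (illustrative only): wire up a single Hindi model and
    # synthesize one sentence. The checkpoint paths and the speaker id below
    # are hypothetical placeholders; in practice the caller builds the
    # `models` dict from whatever checkpoints are available. Run this via
    # `python -m <package>.<module>` so the relative imports above resolve.
    models = {
        "hi": Synthesizer(
            tts_checkpoint="checkpoints/hi/model.pth",     # hypothetical path
            tts_config_path="checkpoints/hi/config.json",  # hypothetical path
        )
    }
    engine = TextToSpeechEngine(models, allow_transliteration=True)
    # Roman input exercises the Roman-to-native transliteration path
    wav = engine.infer_from_text(
        "namaste duniya", lang="hi", speaker_name="female"  # hypothetical speaker id
    )
    scipy_wav_write("output.wav", engine.target_sr, wav)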