Spaces:
Runtime error
Runtime error
#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#
# Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts.
#
# @author Anoop Kunchukuttan
#
import sys | |
from indicnlp.normalize import indic_normalize | |
from indicnlp.transliterate import unicode_transliterate | |
from indicnlp import loader | |
class AggressiveScriptUnifier():
    """Normalize text with per-language normalizers, then transliterate it.

    A normalizer is created for every language code handled in
    ``_init_normalizers``; ``transform`` runs the matching normalizer and then
    transliterates the result from ``lang`` to ``common_lang``.
    """

    def __init__(self,common_lang='hi',nasals_mode='to_nasal_consonants'):
        """Store the configuration and build the normalizers.

        Args:
            common_lang: language code whose script is the transliteration
                target.
            nasals_mode: forwarded unchanged to every normalizer as its
                ``nasals_mode`` option.
        """
        self.common_lang=common_lang
        self.nasals_mode=nasals_mode
        # These three switches are fixed for this unifier and are forwarded to
        # every normalizer it creates.
        self.do_normalize_chandras=True
        self.do_normalize_vowel_ending=True
        self.remove_nuktas=True
        self.normalizer_map={}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with one normalizer per language."""
        normalizer_factory=indic_normalize.IndicNormalizerFactory()

        # Options shared by every language; built once so the per-language
        # calls below cannot drift apart.
        common_params={
            'nasals_mode': self.nasals_mode,
            'do_normalize_chandras': self.do_normalize_chandras,
            'remove_nuktas': self.remove_nuktas,
            'do_normalize_vowel_ending': self.do_normalize_vowel_ending,
        }

        ## for languages with common parameters
        for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn']:
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, **common_params)

        ## for languages with language specific parameters
        language_specific_params={
            'pa': {
                'do_canonicalize_addak': True,
                'do_canonicalize_tippi': True,
                'do_replace_vowel_bases': True,
            },
            'or': {'do_remap_wa': True},
            'as': {'do_remap_assamese_chars': True},
            'ml': {
                'do_canonicalize_chillus': True,
                'do_correct_geminated_T': True,
            },
        }
        for lang, extra_params in language_specific_params.items():
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(
                lang, **common_params, **extra_params)

    def transform(self,text,lang):
        """Normalize ``text`` for ``lang`` and transliterate it to ``common_lang``.

        Args:
            text: the string to convert.
            lang: language code of ``text``.

        Returns:
            The value returned by ``UnicodeIndicTransliterator.transliterate``.
        """
        # A language without a configured normalizer skips normalization
        # instead of raising KeyError, matching BasicScriptUnifier.transform.
        if lang in self.normalizer_map:
            text=self.normalizer_map[lang].normalize(text)
        text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text
class BasicScriptUnifier():
    """Apply default per-language normalization, then transliterate.

    Every normalizer is created with only the ``nasals_mode`` option set.
    """

    def __init__(self, common_lang='hi', nasals_mode='do_nothing'):
        """Store the configuration and build the normalizers.

        Args:
            common_lang: language code whose script is the transliteration
                target.
            nasals_mode: forwarded unchanged to every normalizer as its
                ``nasals_mode`` option.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with one normalizer per language."""
        factory = indic_normalize.IndicNormalizerFactory()
        language_codes = (
            'hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu',
            'ta', 'te', 'kn', 'pa', 'or', 'as', 'ml',
        )
        for code in language_codes:
            normalizer = factory.get_normalizer(code, nasals_mode=self.nasals_mode)
            self.normalizer_map[code] = normalizer

    def transform(self, text, lang):
        """Normalize ``text`` when possible and transliterate it to ``common_lang``.

        Args:
            text: the string to convert.
            lang: language code of ``text``.

        Returns:
            The value returned by ``UnicodeIndicTransliterator.transliterate``.
        """
        # Languages without a normalizer go to the transliterator untouched.
        if lang not in self.normalizer_map:
            normalized = text
        else:
            normalized = self.normalizer_map[lang].normalize(text)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            normalized, lang, self.common_lang)
class NaiveScriptUnifier():
    """Transliterate text to a common script without any normalization."""

    def __init__(self, common_lang='hi'):
        """Remember the language code whose script is the transliteration target."""
        self.common_lang = common_lang

    def transform(self, text, lang):
        """Transliterate ``text`` from ``lang`` to ``common_lang``.

        Args:
            text: the string to convert.
            lang: language code of ``text``.

        Returns:
            The value returned by ``UnicodeIndicTransliterator.transliterate``.
        """
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
# Command-line entry point:
#   python script_unifier <command> <infile> <outfile> <language>
# Reads <infile> line by line, strips surrounding whitespace, transforms each
# line with the unifier selected by <command> and writes it to <outfile>.
if __name__ == '__main__':

    loader.load()

    if len(sys.argv)<=4:
        print("Usage: python script_unifier <command> <infile> <outfile> <language>")
        sys.exit(1)

    command, infname, outfname, language = sys.argv[1:5]

    # Each command maps to a zero-argument callable that builds its unifier,
    # so only the requested unifier is constructed.
    unifier_factories = {
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': BasicScriptUnifier,
        'naive': NaiveScriptUnifier,
    }

    # Reject unknown commands explicitly; previously they were silently
    # ignored and the script exited successfully without writing any output.
    if command not in unifier_factories:
        print("Unknown command: '{}' (expected one of: {})".format(
            command, ', '.join(unifier_factories)), file=sys.stderr)
        sys.exit(1)

    unifier = unifier_factories[command]()

    with open(infname,'r',encoding='utf-8') as ifile, \
         open(outfname,'w',encoding='utf-8') as ofile:
        # Iterate the file object directly so the input is streamed rather
        # than loaded into memory all at once.
        for line in ifile:
            transliterated_line = unifier.transform(line.strip(), language)
            ofile.write(transliterated_line+'\n')