masoudmzb committed on
Commit
f0ca535
·
1 Parent(s): 6b89177

Upload normalizer.py

Browse files
Files changed (1) hide show
  1. normalizer.py +203 -0
normalizer.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from parsivar import Normalizer
2
+
3
+ import num2fawords
4
+ import re
5
+ import string
6
+
7
+
8
# Shared parsivar ``Normalizer`` instance, created once at import time and
# reused by ``normalizer()`` whenever ``is_normalize`` is true.
# "\u200c" is the zero-width non-joiner (ZWNJ).
_normalizer = Normalizer(
    half_space_char="\u200c",
    statistical_space_correction=True,
)
9
# Characters deleted by ``remove_special_characters``. An entry may hold more
# than one character ('""', "'ٔ"); inside the regex character class built
# below every character counts on its own, and duplicates are harmless.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
    # These five used to be removed only by accident: the unescaped
    # "!", "-", ";" run formed the range U+0021-U+003B inside the character
    # class. They are listed explicitly so the effective set is unchanged now
    # that every entry is escaped.
    "$", "&", "*", "+", "/",
]
# ASCII lowercase letters and ASCII digits are stripped as well.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape each entry so "-" (and any future "]", "^" or "\") is taken literally
# instead of being interpreted as character-class syntax.
chars_to_ignore = "[" + "".join(map(re.escape, chars_to_ignore)) + "]"

# Zero-width non-joiner.
zwnj = "\u200c"
# Characters after which ``normalizer`` does NOT insert a ZWNJ in front of "آ"
# (presumably the letters that never join to the following letter, plus the
# ZWNJ itself and the space -- confirm).
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
19
+
20
+
21
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of ``chars_to_mapping`` in ``text``.

    Args:
        text: Value to process; it is converted with ``str()`` first.
        chars_to_mapping: Mapping of literal substrings to their replacements.
            Keys are escaped, so they are never interpreted as regex syntax.
            Where several keys could match at the same position, the one that
            comes first in the mapping's iteration order wins.

    Returns:
        The converted string. With an empty mapping the text is returned
        unchanged.
    """
    text = str(text)
    if not chars_to_mapping:
        # An empty alternation would match the empty string at every position
        # and the lookup of "" in the mapping would raise KeyError.
        return text

    pattern = "|".join(map(re.escape, chars_to_mapping))
    return re.sub(pattern, lambda match: chars_to_mapping[match.group()], text)
24
+
25
+
26
def remove_special_characters(text, chars_to_ignore_regex):
    """Strip unwanted characters from ``text``.

    Every match of ``chars_to_ignore_regex`` is deleted, the remainder is
    lowercased, and a single trailing space is appended.
    """
    cleaned = re.sub(chars_to_ignore_regex, "", text)
    return f"{cleaned.lower()} "
29
+
30
+
31
def convert_word_nums_to_text(word):
    """Spell out ``word`` via ``num2fawords`` when it is an integer.

    Args:
        word: A token. Anything ``int()`` accepts is treated as a number.

    Returns:
        The result of ``num2fawords.words`` for integer tokens (presumably the
        number written out in Persian words). The original ``word`` is
        returned unchanged when it is not an integer or when the conversion
        fails.
    """
    try:
        number = int(word)
    except (TypeError, ValueError):
        # Not a number: leave the token as it is.
        return word

    try:
        return num2fawords.words(number)
    except Exception:
        # Best effort: hand back the original token, never the intermediate
        # int, so callers that join the words keep receiving strings.
        return word
39
+
40
+
41
def normalizer_at_word_level(text):
    """Normalize ``text`` one whitespace-separated word at a time.

    Each word is passed through ``convert_word_nums_to_text`` and then looked
    up in ``fixator_dictionary``; a word without an entry is kept as is. The
    words are joined with single spaces and a trailing space is appended.
    """
    # NOTE(review): `fixator_dictionary` is neither defined nor imported in the
    # visible part of this file -- confirm where it is supposed to come from.
    fixed_words = [
        fixator_dictionary.get(converted, converted)
        for converted in map(convert_word_nums_to_text, text.split())
    ]
    return " ".join(fixed_words) + " "
52
+
53
+
54
def finder(ss, s, starter=False):
    """Locate every non-overlapping match of the regex ``ss`` in ``s``.

    Args:
        ss: Pattern handed to ``re.finditer`` as is (it is not escaped).
        s: String to search.
        starter: When true, return only the start offset of each match.

    Returns:
        A list of start offsets if ``starter`` is true, otherwise a list of
        ``(start, end)`` tuples, in the order the matches occur.
    """
    matches = re.finditer(ss, s)
    if starter:
        return [match.start() for match in matches]
    return [match.span() for match in matches]
63
+
64
+
65
def substring_replace(ss, s, start, end, stripped=True):
    """Replace ``s[start:end]`` with ``ss``.

    Args:
        ss: Replacement text.
        s: Source string.
        start: Start offset of the span to replace.
        end: End offset (exclusive) of the span to replace.
        stripped: When true, trailing whitespace of the part before ``start``
            is removed, so ``ss`` attaches directly to the preceding text.

    Returns:
        A ``(new_string, removed)`` tuple. ``removed`` is the number of
        whitespace characters that were stripped in front of the replacement
        (0 when ``stripped`` is false), so callers can keep their offsets in
        step with the modified string.
    """
    head, tail = s[:start], s[end:]

    removed = 0
    if stripped:
        trimmed = head.rstrip()
        # Count what rstrip() actually removed; it may be more than a single
        # space, or whitespace other than " " (tab, newline).
        removed = len(head) - len(trimmed)
        head = trimmed

    return head + ss + tail, removed
75
+
76
+
77
def _separate_connected_alef_madda(text):
    """Insert a ZWNJ between "آ" and a preceding character not in ``silent_chars``."""
    # Net change of len(text) so far; maps offsets found in the original
    # string onto the string as it is being rewritten.
    pointer = 0
    for found in finder("آ", text, True):
        index = found + pointer - 1  # the character right before this "آ"
        # No preceding character (-1 would wrap around to the end of the
        # string) or an offset outside the text: nothing to do.
        if index < 0 or index >= len(text):
            continue
        if text[index] in silent_chars:
            continue
        text, removed = substring_replace(
            f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
        pointer += 1 - removed
    return text


def _attach_with_zwnj(text, special, word_final_only=True):
    """Attach ``special`` to the text in front of it with a ZWNJ.

    Whitespace directly before the match is dropped by ``substring_replace``.
    With ``word_final_only`` only matches at the very end of ``text`` or
    directly followed by a space are rewritten; otherwise every match is.
    """
    # NOTE(review): a ZWNJ that already precedes the match is not removed
    # (str.rstrip only strips whitespace), so such input presumably ends up
    # with two ZWNJs -- confirm whether an earlier step rules this out.
    pointer = 0
    for match_start, match_end in finder(special, text, False):
        start, end = match_start + pointer, match_end + pointer
        if end > len(text):
            continue
        if word_final_only and end < len(text) and text[end] != " ":
            continue
        text, removed = substring_replace(
            f"{zwnj}{special}", text, start, end, stripped=True)
        pointer += 1 - removed
    return text


def normalizer(
        batch,
        is_normalize=True,
        return_dict=True,
        filter_trivials=False,
        remove_extra_space=False
):
    """Normalize the ``"sentence"`` entry of ``batch``.

    Steps, in order: lowercase and strip; optional parsivar normalization;
    ``dictionary_mapping`` replacements; removal of ``chars_to_ignore``;
    ZWNJ insertion before "آ"; ZWNJ attachment of the "ها"-family suffixes,
    of "افزار" and of "ترین"/"تر"; word-level fixes; whitespace clean-up.

    Args:
        batch: Mapping with a string under the key ``"sentence"``. It is
            updated in place when ``return_dict`` is true.
        is_normalize: Run the parsivar ``_normalizer`` first.
        return_dict: Return ``batch`` (with ``"sentence"`` replaced) instead
            of the bare text.
        filter_trivials: Replace a result of at most two characters by
            ``None``.
        remove_extra_space: Strip the result completely; otherwise it ends
            with exactly one trailing space.

    Returns:
        ``batch`` if ``return_dict`` is true, else the normalized text
        (or ``None`` when filtered as trivial).
    """
    text = batch["sentence"].lower().strip()

    # Parsivar normalizer
    if is_normalize:
        text = _normalizer.normalize(text)

    # Dictionary mapping
    # NOTE(review): `dictionary_mapping` is neither defined nor imported in the
    # visible part of this file -- confirm where it is supposed to come from.
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove specials
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Replace connected آ
    text = _separate_connected_alef_madda(text)

    # Replace connected ها (word-final only). Order matters: entries are
    # processed one after another in exactly this sequence.
    # Disabled entries: "ام", "ای", "است", "ایم", "اید", "اند"
    plural_suffixes = [
        "هایمان", "هایم", "هایت", "هایش",
        "هایتان", "هایشان", "هام", "هات",
        "هاتان", "هامون", "هامان", "هاش",
        "هاتون", "هاشان", "هاشون",
        "هایی", "های", "هاس", "ها",
    ]
    for special in plural_suffixes:
        text = _attach_with_zwnj(text, special)

    # Replace connected افزار (every occurrence, not just word-final ones)
    text = _attach_with_zwnj(text, "افزار", word_final_only=False)

    # Replace connected ترین / تر (word-final only)
    for special in ["ترین", "تر"]:
        text = _attach_with_zwnj(text, special)

    # Normalizer at word level
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    text = text.strip()
    if not remove_extra_space:
        text += " "

    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
203
+