DipsankarSinha committed on
Commit
af09f07
·
verified ·
1 Parent(s): bc13c9a

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +1 -1
  2. tokenizer_config.json +17 -1
  3. vocab.json +228 -228
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "</s>": 231,
3
- "[PAD]": 230
4
  }
 
1
  {
2
  "</s>": 231,
3
+ "<s>": 230
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,14 @@
1
  {
2
  "added_tokens_decoder": {
3
- "230": {
 
 
 
 
 
 
 
 
4
  "content": "[PAD]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,6 +16,14 @@
8
  "single_word": false,
9
  "special": false
10
  },
 
 
 
 
 
 
 
 
11
  "231": {
12
  "content": "</s>",
13
  "lstrip": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "228": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "229": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "230": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
  "231": {
28
  "content": "</s>",
29
  "lstrip": false,
vocab.json CHANGED
@@ -1,232 +1,232 @@
1
  {
2
- "[PAD]": 230,
3
- "[UNK]": 230,
4
- "|": 24,
5
- "ሀ": 12,
6
- "ሁ": 38,
7
- "ሂ": 197,
8
- "ሃ": 173,
9
- "ሄ": 128,
10
- "ህ": 98,
11
- "ሆ": 3,
12
- "ለ": 4,
13
- "ሉ": 224,
14
- "ሊ": 181,
15
- "ላ": 163,
16
- "ሌ": 211,
17
- "ል": 96,
18
- "ሎ": 167,
19
- "ሏ": 103,
20
- "ሐ": 85,
21
- "ሑ": 62,
22
- "ሓ": 82,
23
- "ሔ": 223,
24
- "ሕ": 161,
25
- "መ": 30,
26
- "ሙ": 58,
27
- "ሚ": 182,
28
- "ማ": 188,
29
- "ሜ": 194,
30
- "ም": 41,
31
- "ሞ": 185,
32
- "ሟ": 143,
33
- "ሠ": 205,
34
- "ሡ": 151,
35
- "ሣ": 171,
36
- "ሥ": 72,
37
- "ሦ": 70,
38
- "ረ": 123,
39
- "ሩ": 104,
40
- "ሪ": 206,
41
- "ራ": 152,
42
- "ሬ": 105,
43
- "ር": 73,
44
- "ሮ": 43,
45
- "ሯ": 226,
46
- "ሰ": 156,
47
- "ሱ": 121,
48
- "ሲ": 39,
49
- "ሳ": 64,
50
- "ሴ": 9,
51
- "ስ": 145,
52
- "ሶ": 180,
53
- "ሷ": 192,
54
- "ሸ": 8,
55
- "ሹ": 183,
56
- "ሺ": 27,
57
- "ሻ": 63,
58
- "ሼ": 150,
59
- "ሽ": 32,
60
- "ሾ": 76,
61
- "ቀ": 222,
62
- "ቁ": 2,
63
- "ቂ": 141,
64
- "ቃ": 129,
65
- "ቄ": 187,
66
- "ቅ": 120,
67
- "ቆ": 91,
68
- "ቋ": 202,
69
- "በ": 90,
70
- "ቡ": 175,
71
- "ቢ": 46,
72
- "ባ": 136,
73
- "ቤ": 190,
74
- "ብ": 80,
75
- "ቦ": 174,
76
- "ቧ": 138,
77
- "ቨ": 172,
78
- "ቪ": 15,
79
- "ቫ": 6,
80
- "ቭ": 155,
81
- "ቮ": 34,
82
- "ተ": 26,
83
- "ቱ": 164,
84
- "ቲ": 144,
85
- "ታ": 78,
86
- "ቴ": 93,
87
- "ት": 61,
88
- "ቶ": 148,
89
- "ቷ": 25,
90
- "ቸ": 68,
91
- "ቹ": 10,
92
- "ቺ": 86,
93
- "ቻ": 110,
94
- "ቼ": 195,
95
- "ች": 19,
96
- "ቾ": 74,
97
- "ቿ": 83,
98
- "ኀ": 139,
99
- "ኃ": 159,
100
- "ኅ": 65,
101
- "ኋ": 13,
102
- "ነ": 107,
103
- "ኑ": 184,
104
- "ኒ": 77,
105
- "ና": 218,
106
- "ኔ": 210,
107
- "ን": 108,
108
- "ኖ": 204,
109
- "ኗ": 51,
110
- "ኘ": 48,
111
- "ኙ": 52,
112
- "ኛ": 119,
113
- "ኝ": 45,
114
- "ኞ": 201,
115
- "ኟ": 177,
116
- "አ": 92,
117
- "ኡ": 142,
118
- "ኢ": 133,
119
- "ኤ": 115,
120
- "እ": 140,
121
- "ኦ": 36,
122
- "ከ": 17,
123
- "ኩ": 131,
124
- "ኪ": 102,
125
- "ካ": 178,
126
- "ኬ": 57,
127
- "ክ": 135,
128
- "ኮ": 54,
129
- "ኳ": 23,
130
- "ኸ": 21,
131
- "ኽ": 209,
132
- "ወ": 168,
133
- "ዊ": 153,
134
- "ዋ": 165,
135
- "ዌ": 169,
136
- "ው": 191,
137
- "ዎ": 198,
138
- "ዐ": 146,
139
- "ዑ": 95,
140
- "ዒ": 18,
141
- "ዓ": 217,
142
- "ዕ": 125,
143
- "ዖ": 67,
144
- "ዘ": 132,
145
- "ዙ": 227,
146
- "ዚ": 220,
147
- "ዛ": 126,
148
- "ዜ": 130,
149
- "ዝ": 117,
150
- "ዞ": 118,
151
- "ዟ": 157,
152
- "ዢ": 154,
153
- "ዣ": 106,
154
- "ዤ": 147,
155
- "ዥ": 79,
156
- "ዦ": 7,
157
- "የ": 200,
158
- "ዩ": 71,
159
- "ያ": 166,
160
- "ዬ": 127,
161
- "ይ": 179,
162
- "ዮ": 87,
163
- "ደ": 170,
164
- "ዱ": 89,
165
- "ዲ": 0,
166
- "ዳ": 122,
167
- "ዴ": 42,
168
- "ድ": 134,
169
- "ዶ": 207,
170
- "ዷ": 214,
171
- "ጀ": 14,
172
  "ጁ": 16,
173
- "ጂ": 44,
174
- "ጃ": 81,
175
- "ጄ": 33,
176
- "ጅ": 5,
177
- "ጆ": 113,
178
- "ገ": 212,
179
- "ጉ": 56,
180
- "ጊ": 160,
181
- "ጋ": 69,
182
- "ጌ": 203,
183
- "ግ": 35,
184
- "ጎ": 199,
185
- "ጓ": 40,
186
- "ጠ": 162,
187
- "ጡ": 49,
188
- "ጢ": 112,
189
- "ጣ": 99,
190
- "ጤ": 158,
191
- "ጥ": 37,
192
- "ጦ": 225,
193
- "ጧ": 189,
194
  "ጨ": 88,
195
- "ጩ": 53,
196
- "ጪ": 29,
197
- "ጫ": 100,
198
- "ጬ": 60,
199
- "ጭ": 219,
200
- "ጮ": 213,
201
- "ጲ": 101,
202
- "ጴ": 176,
203
- "ጵ": 221,
204
- "ጶ": 116,
205
- "ጸ": 193,
206
- "ጹ": 186,
207
- "ጺ": 11,
208
- "ጻ": 208,
209
- "ጽ": 109,
210
- "ጾ": 20,
211
- "ጿ": 196,
212
- "ፀ": 114,
213
- "ፁ": 55,
214
- "ፃ": 50,
215
- "ፅ": 28,
216
- "ፈ": 215,
217
- "ፉ": 94,
218
- "ፊ": 75,
219
- "ፋ": 22,
220
- "ፌ": 97,
221
- "ፍ": 84,
222
- "ፎ": 216,
223
- "ፏ": 31,
224
- "ፑ": 149,
225
- "ፒ": 1,
226
- "ፓ": 59,
227
- "ፔ": 47,
228
- "ፕ": 111,
229
- "ፖ": 137,
230
- "“": 124,
231
- "”": 66
232
  }
 
1
  {
2
+ "[PAD]": 229,
3
+ "[UNK]": 228,
4
+ "|": 144,
5
+ "ሀ": 78,
6
+ "ሁ": 165,
7
+ "ሂ": 120,
8
+ "ሃ": 77,
9
+ "ሄ": 200,
10
+ "ህ": 142,
11
+ "ሆ": 46,
12
+ "ለ": 150,
13
+ "ሉ": 44,
14
+ "ሊ": 79,
15
+ "ላ": 126,
16
+ "ሌ": 8,
17
+ "ል": 58,
18
+ "ሎ": 40,
19
+ "ሏ": 194,
20
+ "ሐ": 134,
21
+ "ሑ": 167,
22
+ "ሓ": 164,
23
+ "ሔ": 110,
24
+ "ሕ": 102,
25
+ "መ": 184,
26
+ "ሙ": 9,
27
+ "ሚ": 87,
28
+ "ማ": 148,
29
+ "ሜ": 24,
30
+ "ም": 7,
31
+ "ሞ": 172,
32
+ "ሟ": 218,
33
+ "ሠ": 220,
34
+ "ሡ": 176,
35
+ "ሣ": 141,
36
+ "ሥ": 27,
37
+ "ሦ": 192,
38
+ "ረ": 55,
39
+ "ሩ": 170,
40
+ "ሪ": 107,
41
+ "ራ": 22,
42
+ "ሬ": 5,
43
+ "ር": 136,
44
+ "ሮ": 81,
45
+ "ሯ": 157,
46
+ "ሰ": 94,
47
+ "ሱ": 13,
48
+ "ሲ": 1,
49
+ "ሳ": 2,
50
+ "ሴ": 208,
51
+ "ስ": 191,
52
+ "ሶ": 80,
53
+ "ሷ": 123,
54
+ "ሸ": 175,
55
+ "ሹ": 34,
56
+ "ሺ": 129,
57
+ "ሻ": 226,
58
+ "ሼ": 223,
59
+ "ሽ": 187,
60
+ "ሾ": 163,
61
+ "ቀ": 198,
62
+ "ቁ": 90,
63
+ "ቂ": 35,
64
+ "ቃ": 207,
65
+ "ቄ": 60,
66
+ "ቅ": 173,
67
+ "ቆ": 222,
68
+ "ቋ": 195,
69
+ "በ": 169,
70
+ "ቡ": 75,
71
+ "ቢ": 97,
72
+ "ባ": 186,
73
+ "ቤ": 202,
74
+ "ብ": 151,
75
+ "ቦ": 188,
76
+ "ቧ": 182,
77
+ "ቨ": 158,
78
+ "ቪ": 95,
79
+ "ቫ": 36,
80
+ "ቭ": 156,
81
+ "ቮ": 197,
82
+ "ተ": 114,
83
+ "ቱ": 43,
84
+ "ቲ": 139,
85
+ "ታ": 23,
86
+ "ቴ": 105,
87
+ "ት": 138,
88
+ "ቶ": 82,
89
+ "ቷ": 168,
90
+ "ቸ": 108,
91
+ "ቹ": 137,
92
+ "ቺ": 203,
93
+ "ቻ": 224,
94
+ "ቼ": 183,
95
+ "ች": 225,
96
+ "ቾ": 210,
97
+ "ቿ": 51,
98
+ "ኀ": 214,
99
+ "ኃ": 185,
100
+ "ኅ": 11,
101
+ "ኋ": 130,
102
+ "ነ": 177,
103
+ "ኑ": 204,
104
+ "ኒ": 30,
105
+ "ና": 49,
106
+ "ኔ": 45,
107
+ "ን": 14,
108
+ "ኖ": 112,
109
+ "ኗ": 145,
110
+ "ኘ": 70,
111
+ "ኙ": 174,
112
+ "ኛ": 64,
113
+ "ኝ": 59,
114
+ "ኞ": 38,
115
+ "ኟ": 180,
116
+ "አ": 19,
117
+ "ኡ": 53,
118
+ "ኢ": 29,
119
+ "ኤ": 213,
120
+ "እ": 101,
121
+ "ኦ": 153,
122
+ "ከ": 221,
123
+ "ኩ": 91,
124
+ "ኪ": 17,
125
+ "ካ": 12,
126
+ "ኬ": 106,
127
+ "ክ": 190,
128
+ "ኮ": 72,
129
+ "ኳ": 116,
130
+ "ኸ": 84,
131
+ "ኽ": 132,
132
+ "ወ": 85,
133
+ "ዊ": 122,
134
+ "ዋ": 50,
135
+ "ዌ": 201,
136
+ "ው": 0,
137
+ "ዎ": 121,
138
+ "ዐ": 73,
139
+ "ዑ": 155,
140
+ "ዒ": 178,
141
+ "ዓ": 26,
142
+ "ዕ": 69,
143
+ "ዖ": 152,
144
+ "ዘ": 118,
145
+ "ዙ": 146,
146
+ "ዚ": 76,
147
+ "ዛ": 206,
148
+ "ዜ": 28,
149
+ "ዝ": 127,
150
+ "ዞ": 115,
151
+ "ዟ": 104,
152
+ "ዢ": 83,
153
+ "ዣ": 219,
154
+ "ዤ": 125,
155
+ "ዥ": 140,
156
+ "ዦ": 99,
157
+ "የ": 57,
158
+ "ዩ": 62,
159
+ "ያ": 119,
160
+ "ዬ": 3,
161
+ "ይ": 211,
162
+ "ዮ": 33,
163
+ "ደ": 74,
164
+ "ዱ": 96,
165
+ "ዲ": 67,
166
+ "ዳ": 215,
167
+ "ዴ": 18,
168
+ "ድ": 103,
169
+ "ዶ": 10,
170
+ "ዷ": 154,
171
+ "ጀ": 117,
172
  "ጁ": 16,
173
+ "ጂ": 89,
174
+ "ጃ": 227,
175
+ "ጄ": 61,
176
+ "ጅ": 86,
177
+ "ጆ": 181,
178
+ "ገ": 39,
179
+ "ጉ": 113,
180
+ "ጊ": 161,
181
+ "ጋ": 52,
182
+ "ጌ": 196,
183
+ "ግ": 66,
184
+ "ጎ": 217,
185
+ "ጓ": 166,
186
+ "ጠ": 41,
187
+ "ጡ": 143,
188
+ "ጢ": 205,
189
+ "ጣ": 209,
190
+ "ጤ": 4,
191
+ "ጥ": 160,
192
+ "ጦ": 93,
193
+ "ጧ": 193,
194
  "ጨ": 88,
195
+ "ጩ": 54,
196
+ "ጪ": 56,
197
+ "ጫ": 199,
198
+ "ጬ": 162,
199
+ "ጭ": 6,
200
+ "ጮ": 32,
201
+ "ጲ": 21,
202
+ "ጴ": 128,
203
+ "ጵ": 149,
204
+ "ጶ": 68,
205
+ "ጸ": 47,
206
+ "ጹ": 31,
207
+ "ጺ": 179,
208
+ "ጻ": 98,
209
+ "ጽ": 37,
210
+ "ጾ": 42,
211
+ "ጿ": 189,
212
+ "ፀ": 65,
213
+ "ፁ": 100,
214
+ "ፃ": 216,
215
+ "ፅ": 212,
216
+ "ፈ": 48,
217
+ "ፉ": 15,
218
+ "ፊ": 135,
219
+ "ፋ": 111,
220
+ "ፌ": 71,
221
+ "ፍ": 147,
222
+ "ፎ": 109,
223
+ "ፏ": 159,
224
+ "ፑ": 124,
225
+ "ፒ": 25,
226
+ "ፓ": 133,
227
+ "ፔ": 131,
228
+ "ፕ": 171,
229
+ "ፖ": 92,
230
+ "“": 63,
231
+ "”": 20
232
  }