codebyzeb committed on
Commit
703b055
·
verified ·
1 Parent(s): c98c964

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +109 -225
  2. vocab.json +1 -1
tokenizer.json CHANGED
@@ -136,233 +136,117 @@
136
  "PAD": 1,
137
  "W": 2,
138
  "UTT_BOUNDARY": 3,
139
- "d": 4,
140
- "o": 5,
141
- "y": 6,
142
- "u": 7,
143
- "w": 8,
144
- "a": 9,
145
- "n": 10,
146
- "t": 11,
147
- "l": 12,
148
- "k": 13,
149
- "h": 14,
150
- "i": 15,
151
- "s": 16,
152
- "?": 17,
153
- "e": 18,
154
- ".": 19,
155
- "r": 20,
156
- "'": 21,
157
- "f": 22,
158
- "c": 23,
159
- "g": 24,
160
- "p": 25,
161
- "b": 26,
162
- "m": 27,
163
- "v": 28,
164
- "j": 29,
165
- "!": 30,
166
- "x": 31,
167
- "q": 32,
168
- "z": 33,
169
- "-": 34,
170
- "&": 35,
171
- ",": 36,
172
- "/": 37,
173
- "1": 38,
174
- "9": 39,
175
- "5": 40,
176
- "0": 41,
177
- ";": 42,
178
- "": 43,
179
- "": 44,
180
- "": 45,
181
- ":": 46,
182
- "+": 47,
183
- "8": 48,
184
- "3": 49,
185
- "7": 50,
186
- "4": 51,
187
- "6": 52,
188
- "2": 53,
189
- "=": 54,
190
  ")": 55,
191
- "(": 56,
192
- "_": 57,
193
- "*": 58,
194
- "£": 59,
195
- "": 60,
196
- "#": 61,
197
- "`": 62,
198
- "\"": 63,
199
- "æ": 64,
200
- "]": 65,
201
- "|": 66,
202
- "$": 67,
203
- "": 68,
204
- "": 69,
205
- "[": 70,
206
- "œ": 71,
207
- "{": 72,
208
- "}": 73,
209
- "": 74,
210
  "°": 75,
211
- "§": 76,
212
- ">": 77,
213
- "·": 78,
214
- "¢": 79,
215
- "%": 80,
216
- "^": 81,
217
- "½": 82,
218
- "": 83,
219
- "×": 84,
220
- "¼": 85,
221
- "¾": 86,
222
- "φ": 87,
223
- "<": 88,
224
- "´": 89,
225
- "¯": 90,
226
- "¦": 91,
227
- "†": 92,
228
- "«": 93,
229
- "¬": 94,
230
- "©": 95,
231
- "“": 96,
232
- "": 97,
233
- "¹": 98,
234
- "": 99,
235
- "²": 100,
236
- "": 101,
237
- "": 102,
238
- "δ": 103,
239
- "α": 104,
240
- "ο": 105,
241
- "υ": 106,
242
- "χ": 107,
243
- "ι": 108,
244
- "η": 109,
245
- "μ": 110,
246
- "ε": 111,
247
- "γ": 112,
248
- "λ": 113,
249
- "β": 114,
250
- "κ": 115,
251
- "ω": 116,
252
- "ν": 117,
253
- "ρ": 118,
254
- "σ": 119,
255
- "ς": 120,
256
- "τ": 121,
257
- "θ": 122,
258
- "π": 123,
259
- "′": 124,
260
- "³": 125,
261
- "⸺": 126,
262
- "―": 127,
263
- "±": 128,
264
- "~": 129,
265
- "ß": 130,
266
- "µ": 131,
267
- "„": 132,
268
- "@": 133,
269
- "\\": 134,
270
- "♪": 135,
271
- "¿": 136,
272
- "¡": 137,
273
- "​": 138,
274
- "─": 139,
275
- "–": 140,
276
- "ð": 141,
277
- "­": 142,
278
- "º": 143,
279
- "¸": 144,
280
- "þ": 145,
281
- "и": 146,
282
- "♫": 147,
283
- "¤": 148,
284
- "¨": 149,
285
- "ø": 150,
286
- "": 151,
287
- "ª": 152,
288
- "đ": 153,
289
- "€": 154,
290
- "™": 155,
291
- "€": 156,
292
- "fl": 157,
293
- "♬": 158,
294
- "’": 159,
295
- ",": 160,
296
- "": 161,
297
- "™": 162,
298
- "®": 163,
299
- "ı": 164,
300
- "с": 165,
301
- "в": 166,
302
- "‎": 167,
303
- "ц": 168,
304
- "ь": 169,
305
- "д": 170,
306
- "н": 171,
307
- "е": 172,
308
- "м": 173,
309
- "о": 174,
310
- "т": 175,
311
- "г": 176,
312
- "а": 177,
313
- "р": 178,
314
- "ч": 179,
315
- "п": 180,
316
- "я": 181,
317
- "б": 182,
318
- "•": 183,
319
- "ł": 184,
320
- "ร": 185,
321
- "ก": 186,
322
- "к": 187,
323
- "у": 188,
324
- "‚": 189,
325
- "ˆ": 190,
326
- "„": 191,
327
- "ƒ": 192,
328
- "‡": 193,
329
- "Š": 194,
330
- "‘": 195,
331
- "大": 196,
332
- "”": 197,
333
- "ا": 198,
334
- "л": 199,
335
- "з": 200,
336
- "‑": 201,
337
- "ʻ": 202,
338
- "ت": 203,
339
- "ו": 204,
340
- "י": 205,
341
- "ر": 206,
342
- "ي": 207,
343
- "ل": 208,
344
- "ه": 209,
345
- "ع": 210,
346
- "ن": 211,
347
- "ə": 212,
348
- "−": 213,
349
- "→": 214,
350
- "ы": 215,
351
- "ː": 216,
352
- "و": 217,
353
- "م": 218,
354
- "س": 219,
355
- "ح": 220,
356
- "د": 221,
357
- "ب": 222,
358
- "ی": 223,
359
- "»": 224,
360
- "น": 225,
361
- "า": 226,
362
- "ง": 227,
363
- "ม": 228,
364
- "ʼ": 229,
365
- "ˈ": 230
366
  },
367
  "unk_token": "UNK"
368
  }
 
136
  "PAD": 1,
137
  "W": 2,
138
  "UTT_BOUNDARY": 3,
139
+ "y": 4,
140
+ "e": 5,
141
+ "a": 6,
142
+ "h": 7,
143
+ ".": 8,
144
+ "c": 9,
145
+ "o": 10,
146
+ "m": 11,
147
+ "p": 12,
148
+ "u": 13,
149
+ "n": 14,
150
+ "d": 15,
151
+ "'": 16,
152
+ "s": 17,
153
+ "t": 18,
154
+ "i": 19,
155
+ "g": 20,
156
+ "l": 21,
157
+ "k": 22,
158
+ "x": 23,
159
+ ",": 24,
160
+ "r": 25,
161
+ "w": 26,
162
+ "v": 27,
163
+ "f": 28,
164
+ "b": 29,
165
+ "j": 30,
166
+ "?": 31,
167
+ "-": 32,
168
+ "q": 33,
169
+ ";": 34,
170
+ "2": 35,
171
+ "": 36,
172
+ "": 37,
173
+ "!": 38,
174
+ "/": 39,
175
+ "1": 40,
176
+ ":": 41,
177
+ "z": 42,
178
+ "3": 43,
179
+ "6": 44,
180
+ "9": 45,
181
+ "&": 46,
182
+ "4": 47,
183
+ "5": 48,
184
+ "0": 49,
185
+ "=": 50,
186
+ "8": 51,
187
+ "7": 52,
188
+ "£": 53,
189
+ "(": 54,
190
  ")": 55,
191
+ "": 56,
192
+ "*": 57,
193
+ "]": 58,
194
+ "[": 59,
195
+ "\"": 60,
196
+ "_": 61,
197
+ "%": 62,
198
+ "": 63,
199
+ "": 64,
200
+ "+": 65,
201
+ "$": 66,
202
+ "^": 67,
203
+ "#": 68,
204
+ "æ": 69,
205
+ "ʌ": 70,
206
+ "ɩ": 71,
207
+ "ə": 72,
208
+ "": 73,
209
+ "|": 74,
210
  "°": 75,
211
+ "ø": 76,
212
+ "~": 77,
213
+ "": 78,
214
+ "`": 79,
215
+ "": 80,
216
+ "": 81,
217
+ "@": 82,
218
+ "}": 83,
219
+ "{": 84,
220
+ "": 85,
221
+ "": 86,
222
+ "·": 87,
223
+ "": 88,
224
+ "¡": 89,
225
+ "÷": 90,
226
+ "\\": 91,
227
+ "": 92,
228
+ "ð": 93,
229
+ "¿": 94,
230
+ "­": 95,
231
+ "": 96,
232
+ "": 97,
233
+ "œ": 98,
234
+ "ł": 99,
235
+ "¦": 100,
236
+ "×": 101,
237
+ "™": 102,
238
+ "ß": 103,
239
+ "ˈ": 104,
240
+ "ı": 105,
241
+ "đ": 106,
242
+ "": 107,
243
+ "ː": 108,
244
+ "": 109,
245
+ "": 110,
246
+ "": 111,
247
+ "ŋ": 112,
248
+ "ʼ": 113,
249
+ "\t": 114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  },
251
  "unk_token": "UNK"
252
  }
vocab.json CHANGED
@@ -1 +1 @@
1
- {"UNK":0,"PAD":1,"W":2,"UTT_BOUNDARY":3,"d":4,"o":5,"y":6,"u":7,"w":8,"a":9,"n":10,"t":11,"l":12,"k":13,"h":14,"i":15,"s":16,"?":17,"e":18,".":19,"r":20,"'":21,"f":22,"c":23,"g":24,"p":25,"b":26,"m":27,"v":28,"j":29,"!":30,"x":31,"q":32,"z":33,"-":34,"&":35,",":36,"/":37,"1":38,"9":39,"5":40,"0":41,";":42,"":43,"":44,"":45,":":46,"+":47,"8":48,"3":49,"7":50,"4":51,"6":52,"2":53,"=":54,")":55,"(":56,"_":57,"*":58,"£":59,"":60,"#":61,"`":62,"\"":63,"æ":64,"]":65,"|":66,"$":67,"":68,"":69,"[":70,"œ":71,"{":72,"}":73,"":74,"°":75,"§":76,">":77,"·":78,"¢":79,"%":80,"^":81,"½":82,"":83,"×":84,"¼":85,"¾":86,"φ":87,"<":88,"´":89,"¯":90,"¦":91,"†":92,"«":93,"¬":94,"©":95,"“":96,"":97,"¹":98,"":99,"²":100,"":101,"":102,"δ":103,"α":104,"ο":105,"υ":106,"χ":107,"ι":108,"η":109,"μ":110,"ε":111,"γ":112,"λ":113,"β":114,"κ":115,"ω":116,"ν":117,"ρ":118,"σ":119,"ς":120,"τ":121,"θ":122,"π":123,"′":124,"³":125,"⸺":126,"―":127,"±":128,"~":129,"ß":130,"µ":131,"„":132,"@":133,"\\":134,"♪":135,"¿":136,"¡":137,"​":138,"─":139,"–":140,"ð":141,"­":142,"º":143,"¸":144,"þ":145,"и":146,"♫":147,"¤":148,"¨":149,"ø":150,"":151,"ª":152,"đ":153,"€":154,"™":155,"€":156,"fl":157,"♬":158,"’":159,",":160,"":161,"™":162,"®":163,"ı":164,"с":165,"в":166,"‎":167,"ц":168,"ь":169,"д":170,"н":171,"е":172,"м":173,"о":174,"т":175,"г":176,"а":177,"р":178,"ч":179,"п":180,"я":181,"б":182,"•":183,"ł":184,"ร":185,"ก":186,"к":187,"у":188,"‚":189,"ˆ":190,"„":191,"ƒ":192,"‡":193,"Š":194,"‘":195,"大":196,"”":197,"ا":198,"л":199,"з":200,"‑":201,"ʻ":202,"ت":203,"ו":204,"י":205,"ر":206,"ي":207,"ل":208,"ه":209,"ع":210,"ن":211,"ə":212,"−":213,"→":214,"ы":215,"ː":216,"و":217,"م":218,"س":219,"ح":220,"د":221,"ب":222,"ی":223,"»":224,"น":225,"า":226,"ง":227,"ม":228,"ʼ":229,"ˈ":230}
 
1
+ {"UNK":0,"PAD":1,"W":2,"UTT_BOUNDARY":3,"y":4,"e":5,"a":6,"h":7,".":8,"c":9,"o":10,"m":11,"p":12,"u":13,"n":14,"d":15,"'":16,"s":17,"t":18,"i":19,"g":20,"l":21,"k":22,"x":23,",":24,"r":25,"w":26,"v":27,"f":28,"b":29,"j":30,"?":31,"-":32,"q":33,";":34,"2":35,"":36,"":37,"!":38,"/":39,"1":40,":":41,"z":42,"3":43,"6":44,"9":45,"&":46,"4":47,"5":48,"0":49,"=":50,"8":51,"7":52,"£":53,"(":54,")":55,"":56,"*":57,"]":58,"[":59,"\"":60,"_":61,"%":62,"":63,"":64,"+":65,"$":66,"^":67,"#":68,"æ":69,"ʌ":70,"ɩ":71,"ə":72,"":73,"|":74,"°":75,"ø":76,"~":77,"":78,"`":79,"":80,"":81,"@":82,"}":83,"{":84,"":85,"":86,"·":87,"":88,"¡":89,"÷":90,"\\":91,"":92,"ð":93,"¿":94,"­":95,"":96,"":97,"œ":98,"ł":99,"¦":100,"×":101,"™":102,"ß":103,"ˈ":104,"ı":105,"đ":106,"":107,"ː":108,"":109,"":110,"":111,"ŋ":112,"ʼ":113,"\t":114}