english-char-roberta / tokenizer.json
{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "<unk>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": true,
      "special": true
    },
    {
      "id": 53,
      "content": "<s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": true,
      "special": true
    },
    {
      "id": 54,
      "content": "</s>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": true,
      "special": true
    },
    {
      "id": 55,
      "content": "<pad>",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": true,
      "special": true
    },
    {
      "id": 56,
      "content": "<mask>",
      "single_word": false,
      "lstrip": true,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": null,
  "pre_tokenizer": {
    "type": "ByteLevel",
    "add_prefix_space": false,
    "trim_offsets": true,
    "use_regex": true
  },
  "post_processor": {
    "type": "RobertaProcessing",
    "sep": [
      "</s>",
      54
    ],
    "cls": [
      "<s>",
      53
    ],
    "trim_offsets": true,
    "add_prefix_space": false
  },
  "decoder": {
    "type": "ByteLevel",
    "add_prefix_space": true,
    "trim_offsets": true,
    "use_regex": true
  },
  "model": {
    "type": "BPE",
    "dropout": null,
    "unk_token": null,
    "continuing_subword_prefix": "",
    "end_of_word_suffix": "",
    "fuse_unk": false,
    "byte_fallback": false,
    "vocab": {
      "<unk>": 0,
      "a": 1,
      "b": 2,
      "c": 3,
      "d": 4,
      "e": 5,
      "f": 6,
      "g": 7,
      "h": 8,
      "i": 9,
      "j": 10,
      "k": 11,
      "l": 12,
      "m": 13,
      "n": 14,
      "o": 15,
      "p": 16,
      "q": 17,
      "r": 18,
      "s": 19,
      "t": 20,
      "u": 21,
      "v": 22,
      "w": 23,
      "x": 24,
      "y": 25,
      "z": 26,
      "x</w>": 27,
      "v</w>": 28,
      "d</w>": 29,
      "r</w>": 30,
      "s</w>": 31,
      "w</w>": 32,
      "c</w>": 33,
      "k</w>": 34,
      "n</w>": 35,
      "y</w>": 36,
      "p</w>": 37,
      "j</w>": 38,
      "g</w>": 39,
      "f</w>": 40,
      "t</w>": 41,
      "z</w>": 42,
      "a</w>": 43,
      "e</w>": 44,
      "o</w>": 45,
      "q</w>": 46,
      "i</w>": 47,
      "u</w>": 48,
      "m</w>": 49,
      "h</w>": 50,
      "l</w>": 51,
      "b</w>": 52
    },
    "merges": []
  }
}
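
Usage note: the sketch below is an assumption about how a file in this format is typically consumed, via the Hugging Face tokenizers Python library; it is not part of this repository's documented interface. Because "merges" is empty, the BPE model can only emit single-character tokens, and the RobertaProcessing post-processor wraps every encoded sequence in <s> (id 53) and </s> (id 54).

# Minimal sketch (assumed usage, not confirmed by this repo): load the
# config above with the Hugging Face tokenizers library and encode a word.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# With "merges": [], BPE never combines characters, so each letter maps to
# its own vocab id; RobertaProcessing then adds <s>/</s> around the sequence.
encoding = tokenizer.encode("hello")
print(encoding.tokens)  # expected: ['<s>', 'h', 'e', 'l', 'l', 'o', '</s>']
print(encoding.ids)     # expected: [53, 8, 5, 12, 12, 15, 54]

Single words are the safe test case here: the ByteLevel pre-tokenizer rewrites spaces into a byte-level space marker, and no such marker appears in this vocabulary, so behavior on multi-word input is not guaranteed by the config alone.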