# Wrap your own tokenizer so it can be used with the 🤗 Transformers API
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # you can load directly from the tokenizer file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
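
# Quick sanity check (a minimal sketch; the sample sentence is arbitrary):
# the wrapped tokenizer now exposes the standard 🤗 Transformers API.
encoding = wrapped_tokenizer("Let's test this tokenizer.")
print(encoding.tokens())      # sub-word tokens produced by the fast tokenizer
print(encoding["input_ids"])  # their integer ids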

# Finally, save your own pretrained tokenizer
wrapped_tokenizer.save_pretrained("my-tokenizer")
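
# The saved directory can later be reloaded with AutoTokenizer (a minimal
# sketch, assuming a recent version of 🤗 Transformers; "my-tokenizer" is
# the local directory created above).
from transformers import AutoTokenizer

reloaded_tokenizer = AutoTokenizer.from_pretrained("my-tokenizer")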