# Wrap your own tokenizer

To use your trained tokenizer with the Transformers library, wrap it in a `PreTrainedTokenizerFast` and declare which of its tokens are the special ones:

```python
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # load from the saved tokenizer file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
```
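Alternatively, if the trained `tokenizers.Tokenizer` object is still in memory, it can be passed in directly via the `tokenizer_object` keyword instead of going through a file. A minimal sketch, assuming your trained object is named `tokenizer`:

```python
# alternative: wrap the in-memory Tokenizer object directly
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,  # assumes `tokenizer` is your trained tokenizers.Tokenizer
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
```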
Finally, save your own pretrained tokenizer so it can be reloaded later:

```python
# writes the tokenizer file and its config to a directory
wrapped_tokenizer.save_pretrained("my-tokenizer")
```
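As a quick check, the saved tokenizer can be reloaded and used like any other fast tokenizer. A minimal sketch, where the directory name `my-tokenizer` comes from the `save_pretrained` call above and the sample sentence is just a placeholder:

```python
from transformers import AutoTokenizer

# reload the tokenizer from the directory created by save_pretrained()
tokenizer = AutoTokenizer.from_pretrained("my-tokenizer")

# tokenize a sample sentence and inspect the output
encoding = tokenizer("Let's test this tokenizer.")
print(encoding.tokens())
print(encoding["input_ids"])
```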