# Wrap your own tokenizer
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",  # you can load directly from the tokenizer file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Finally, save your own pretrained tokenizer
wrapped_tokenizer.save_pretrained("my-tokenizer")
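# A minimal usage sketch: reload the tokenizer from the directory saved above
# and encode some text. The input sentence is just an illustration.
reloaded_tokenizer = PreTrainedTokenizerFast.from_pretrained("my-tokenizer")
encoding = reloaded_tokenizer("Hello, world!")
print(encoding.input_ids)  # token ids produced by the wrapped tokenizer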