from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from utils import batch_text_iterator
from pretrain_datasets import tokenizer_datasets
#
# special_tokens
#
bos_token = '<|begin_of_text|>'
eos_token = '<|end_of_text|>'

# Special tokens prepended to the trainer vocabulary, in priority order:
# chat structure, role names, tool-calling keys, JSON-Schema keywords,
# byte-fallback tokens, and reserved slots.
#
# NOTE(review): the original list contained ~60 empty-string entries ('')
# and one bare '\n' literal in the tool-calling / misc / qa / thought /
# reasoning / reflection / graph sections — apparently angle-bracketed
# token names stripped by a sanitizer.  Empty strings are invalid
# (duplicate) special tokens, so they were removed here.
# TODO(review): restore the intended token names from the upstream source.
special_tokens = [
    bos_token,
    eos_token,
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|eom_id|>',
    '<|eot_id|>',
    'system',
    'user',
    'assistant',
    # tool/function calling
    '"name"',
    '"arguments"',
    #
    # JSON Schema
    #
    # General Metadata Keywords
    '"$schema"',
    '"$id"',
    '"$ref"',
    '"$defs"',
    '"$anchor"',
    '"$dynamicAnchor"',
    '"$dynamicRef"',
    '"$vocabulary"',
    '"$comment"',
    # Data Types
    '"null"',
    '"boolean"',
    '"object"',
    '"array"',
    '"number"',
    '"string"',
    '"integer"',
    # Validation Keywords
    '"type"',
    '"enum"',
    '"const"',
    '"multipleOf"',
    '"maximum"',
    '"exclusiveMaximum"',
    '"minimum"',
    '"exclusiveMinimum"',
    '"maxLength"',
    '"minLength"',
    '"pattern"',
    '"additionalItems"',
    '"items"',
    '"prefixItems"',
    '"contains"',
    '"maxItems"',
    '"minItems"',
    '"uniqueItems"',
    '"maxProperties"',
    '"minProperties"',
    '"required"',
    '"properties"',
    '"patternProperties"',
    '"additionalProperties"',
    '"dependentRequired"',
    '"dependentSchemas"',
    '"propertyNames"',
    # Conditional Keywords
    '"if"',
    '"then"',
    '"else"',
    '"allOf"',
    '"anyOf"',
    '"oneOf"',
    '"not"',
    # Additional Keywords for Evaluation Control
    '"unevaluatedItems"',
    '"unevaluatedProperties"',
    # Informational Keywords
    '"title"',
    '"description"',
    '"default"',
    '"deprecated"',
    '"readOnly"',
    '"writeOnly"',
    '"examples"',
    # Content-Related Keywords
    '"contentEncoding"',
    '"contentMediaType"',
    '"contentSchema"',
    # Additional Keywords
    '"next"',   # Typically used in reference to linked or next items
    '"value"',  # Represents the value of a property or item
]

# Byte-fallback tokens <0x00>..<0xFF>: together with BPE(byte_fallback=True)
# these guarantee any byte sequence is encodable without an <unk> token.
for i in range(256):
    special_tokens.append(f'<0x{i:02X}>')

# Reserved placeholder tokens, so new special tokens can be assigned later
# without retraining the tokenizer (Llama-3-style convention).
for i in range(64):
    special_tokens.append(f'<|reserved_special_token_{i}|>')

# Guard: special tokens handed to the trainer must be unique and non-empty.
# dict.fromkeys preserves first-occurrence order while deduplicating.
special_tokens = [t for t in dict.fromkeys(special_tokens) if t]
#
# BPE Tokenizer
#
# Byte-level BPE model.  unk_token=None is safe here because
# byte_fallback=True lets the model fall back to the <0xNN> byte tokens
# for anything not covered by learned merges.
bpe = BPE(unk_token=None, byte_fallback=True)
tokenizer = Tokenizer(bpe)
# normalizer
# Intentionally no normalization (no NFC/lowercase/strip): input text
# reaches the model byte-exact.
tokenizer.normalizer = None
# pre-tokenizer
# NOTE(review): add_prefix_space differs between the pre-tokenizer (False)
# and the post-processor/decoder (True), and trim_offsets also varies per
# stage — presumably deliberate for ByteLevel offset handling, but worth
# confirming against the tokenizers ByteLevel documentation.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
# post-processor
tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)
# decoder
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
#
# BPE Trainer
#
trainer = BpeTrainer(
vocab_size=65536, # 64 * 1024
min_frequency=3, # a pair must occur at least 3 times to be merged
special_tokens=special_tokens,
max_token_length=24, # cap learned-token length to avoid pathological merges
)
# Train on the project corpus: batch_text_iterator streams batches of raw
# text from the datasets configured in pretrain_datasets.
tokenizer.train_from_iterator(
batch_text_iterator(tokenizer_datasets),
trainer,
)
# Persist the raw tokenizers artifacts: tokenizer.json plus the model's
# vocab/merges files in the parent directory.
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')
#
# PreTrainedTokenizerFast
#
# Jinja chat template (Llama-3-style headers).  Each message is rendered as
# <|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|> with no
# newline separators — turns are delimited purely by special tokens.
# With add_generation_prompt=True an open assistant header is appended for
# the model to continue; otherwise the transcript is closed with eos_token.
CHAT_TEMPLATE = (
"{{ bos_token }}"
"{% for message in messages %}"
"{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ '<|start_header_id|>assistant<|end_header_id|>' }}"
"{% else %}"
"{{ eos_token }}"
"{% endif %}"
)
# Wrap the freshly trained tokenizer for the transformers API; this writes
# tokenizer_config.json / special_tokens_map.json next to tokenizer.json.
fast_tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,
chat_template=CHAT_TEMPLATE,
bos_token=bos_token,
eos_token=eos_token,
clean_up_tokenization_spaces=False, # keep decoded text byte-exact
)
fast_tokenizer.save_pretrained('../')