from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

from utils import batch_text_iterator
from pretrain_datasets import tokenizer_datasets

#
# special_tokens
#
bos_token = '<|begin_of_text|>'
eos_token = '<|end_of_text|>'

special_tokens = [
    bos_token,
    eos_token,
    '<|start_header_id|>',
    '<|end_header_id|>',
    '<|eom_id|>',
    '<|eot_id|>',
    'system',
    'user',
    'assistant',

    # tool/function calling
    '', '', '', '', '', '', '', '',
    '"name"',
    '"arguments"',

    #
    # JSON Schema
    #
    # General Metadata Keywords
    '"$schema"', '"$id"', '"$ref"', '"$defs"', '"$anchor"',
    '"$dynamicAnchor"', '"$dynamicRef"', '"$vocabulary"', '"$comment"',

    # Data Types
    '"null"', '"boolean"', '"object"', '"array"', '"number"', '"string"', '"integer"',

    # Validation Keywords
    '"type"', '"enum"', '"const"', '"multipleOf"', '"maximum"', '"exclusiveMaximum"',
    '"minimum"', '"exclusiveMinimum"', '"maxLength"', '"minLength"', '"pattern"',
    '"additionalItems"', '"items"', '"prefixItems"', '"contains"', '"maxItems"',
    '"minItems"', '"uniqueItems"', '"maxProperties"', '"minProperties"', '"required"',
    '"properties"', '"patternProperties"', '"additionalProperties"',
    '"dependentRequired"', '"dependentSchemas"', '"propertyNames"',

    # Conditional Keywords
    '"if"', '"then"', '"else"', '"allOf"', '"anyOf"', '"oneOf"', '"not"',

    # Additional Keywords for Evaluation Control
    '"unevaluatedItems"', '"unevaluatedProperties"',

    # Informational Keywords
    '"title"', '"description"', '"default"', '"deprecated"',
    '"readOnly"', '"writeOnly"', '"examples"',

    # Content-Related Keywords
    '"contentEncoding"', '"contentMediaType"', '"contentSchema"',

    # Additional Keywords
    '"next"',   # Typically used in reference to linked or next items
    '"value"',  # Represents the value of a property or item

    # misc
    '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '',

    # qa
    '', '', '', '',

    # thought
    '', '', '', '', '', '', '', '',

    # reasoning
    '', '', '', '', '', '', '', '', '', '',

    # reflection
    '', '', '', '', '', '',

    # graph
    '', '', '', '', '', '', '', '', '', '',

    # '',
    # '',
]

# for i in range(2, 25):
#     special_tokens.append(' ' * i)

# for i in range(2, 25):
#     special_tokens.append('\t' * i)

# for i in range(2, 25):
#     special_tokens.append('\n' * i)

# for i in range(2, 25):
#     special_tokens.append('\r' * i)

# for i in range(2, 25):
#     special_tokens.append('\r\n' * i)

for i in range(256):
    special_tokens.append(f'<0x{i:02X}>')

for i in range(64):
    special_tokens.append(f'<|reserved_special_token_{i}|>')

#
# BPE Tokenizer
#
bpe = BPE(unk_token=None, byte_fallback=True)
tokenizer = Tokenizer(bpe)

# normalizer
tokenizer.normalizer = None

# pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

# post-processor
tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)

# decoder
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

#
# BPE Trainer
#
trainer = BpeTrainer(
    vocab_size=65536,  # 64 * 1024
    min_frequency=3,
    special_tokens=special_tokens,
    max_token_length=24,
)

tokenizer.train_from_iterator(
    batch_text_iterator(tokenizer_datasets),
    trainer,
)

tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')
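
# Optional sanity check (a minimal sketch, not part of the original training flow):
# round-trip a sample string through the freshly trained tokenizer.
# The sample text below is illustrative only.
sample = '<|begin_of_text|>Hello, world!<|end_of_text|>'
encoding = tokenizer.encode(sample)
print(encoding.tokens)
print(encoding.ids)
print(tokenizer.decode(encoding.ids))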

#
# PreTrainedTokenizerFast
#
CHAT_TEMPLATE = (
    "{{ bos_token }}"
    "{% for message in messages %}"
    "{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + message['content'] + '<|eot_id|>'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|start_header_id|>assistant<|end_header_id|>' }}"
    "{% else %}"
    "{{ eos_token }}"
    "{% endif %}"
)

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=CHAT_TEMPLATE,
    bos_token=bos_token,
    eos_token=eos_token,
    clean_up_tokenization_spaces=False,
)

fast_tokenizer.save_pretrained('../')
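
#
# Usage example (sketch)
#
# A minimal sketch of how the exported tokenizer could be loaded back and used.
# It assumes the files saved above live in '../'; the example messages are
# purely illustrative.
loaded_tokenizer = PreTrainedTokenizerFast.from_pretrained('../')

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Write a haiku about tokenizers.'},
]

# Render the chat template into a prompt string, ending with an open
# assistant header so a model would generate the reply next.
prompt = loaded_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

# Tokenize the rendered prompt; bos/eos are already part of the template,
# so no extra special tokens are added here.
input_ids = loaded_tokenizer(prompt, add_special_tokens=False)['input_ids']
print(input_ids)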