tangled-llama-l-128k-v0.1 / scripts /prepare_contrain_datasets.py
mtasic85's picture
contrain instruct datasets
7e030a8
from functools import partial
from litgpt.tokenizer import Tokenizer
from litdata import optimize, TokensLoader, StreamingDataset
from transformers import AutoTokenizer
from utils import tokenize_chat_fn
from contrain_datasets import contrain_datasets
#
# optimize datasets
#
# The tokenizers do not depend on the block-size configuration, so they are
# loaded once here and shared by every `optimize` run below instead of being
# reloaded from disk on each loop iteration.
hf_tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
tokenizer = Tokenizer('..')

# Each entry is (block_size, subchunk_size). One optimized dataset is written
# per entry, into a directory whose name encodes the entry index, the block
# size and the resulting chunk size.
for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
    # A chunk holds `subchunk_size` blocks of `block_size` tokens:
    # 4097 * 4000 = 16,388,000 and 8193 * 2000 = 16,386,000 tokens.
    chunk_size = block_size * subchunk_size
    output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'

    # The return value of `optimize` is not read anywhere in this script, so
    # it is intentionally not bound to a name.
    optimize(
        fn=partial(
            tokenize_chat_fn,
            hf_tokenizer=hf_tokenizer,
            tokenizer=tokenizer,
        ),
        inputs=contrain_datasets,
        output_dir=output_dir,
        # Number of tokens to store per chunk. The original note estimates
        # this at roughly 64MB per chunk -- presumably assuming 4 bytes per
        # token; TODO confirm against the token dtype actually written.
        chunk_size=chunk_size,
        num_workers=32,
        reorder_files=False,
    )
#
# report len(dataset) and len(dataset) * block_size for each optimized dataset
#
# Re-open each dataset directory and print its length, both as reported by
# `len(dataset)` and multiplied by the block size.
for i, (block_size, blocks_per_chunk) in enumerate([(4097, 4000), (8193, 2000)]):
    # Recompute the chunk size with the same formula used to build the
    # directory name, so the path below resolves to the matching dataset.
    chunk_size = block_size * blocks_per_chunk

    loader = TokensLoader(block_size=block_size)
    dataset = StreamingDataset(
        input_dir=f'../contrain-data-{i}-{block_size}-{chunk_size}',
        item_loader=loader,
    )

    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')