from functools import partial

from litgpt.tokenizer import Tokenizer
from litdata import optimize, TokensLoader, StreamingDataset
from transformers import AutoTokenizer

from utils import tokenize_chat_fn
from contrain_datasets import contrain_datasets
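# Two packing configurations are produced: block sizes of 4097 and 8193 tokens
# (presumably a 4096/8192 context plus one extra token for the label shift).
# The first loop tokenizes and packs the data; the second loop streams each
# output directory back to report how many blocks and tokens it contains.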
# Tokenize the chat datasets and pack them into fixed-size token chunks,
# once per (block_size, subchunk_size) configuration.
for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
    # Each on-disk chunk holds subchunk_size samples of block_size tokens.
    chunk_size = block_size * subchunk_size
    output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'

    outputs = optimize(
        # Bind both tokenizers up front so every worker reuses them: the
        # Hugging Face tokenizer and the litgpt Tokenizer, both loaded from
        # the parent directory.
        fn=partial(
            tokenize_chat_fn,
            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
            tokenizer=Tokenizer('..'),
        ),
        inputs=contrain_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,
        num_workers=32,
        # Keep inputs in their given order instead of reordering files by size.
        reorder_files=False,
    )
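# tokenize_chat_fn lives in utils and is not shown here. Based on how it is
# called above, it receives one dataset entry plus the two bound tokenizers
# and yields token-id tensors for litdata to pack. A minimal sketch, assuming
# a `messages` field and the standard chat-template / litgpt encode APIs
# (the field name and details are assumptions, not the actual implementation):
#
#   def tokenize_chat_fn(entry, hf_tokenizer=None, tokenizer=None):
#       text = hf_tokenizer.apply_chat_template(entry['messages'], tokenize=False)
#       yield tokenizer.encode(text, eos=True)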
# Sanity-check each optimized directory: stream it back and report the number
# of blocks and the total token count.
for i, (block_size, subchunk_size) in enumerate([(4097, 4000), (8193, 2000)]):
    chunk_size = block_size * subchunk_size
    input_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'

    dataset = StreamingDataset(
        input_dir=input_dir,
        # TokensLoader serves contiguous block_size-token samples from the packed chunks.
        item_loader=TokensLoader(block_size=block_size),
    )

    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')
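# Downstream, these optimized directories are typically consumed with
# litdata's StreamingDataLoader rather than iterated directly; an illustrative
# sketch (the batch size and worker count here are arbitrary assumptions):
#
#   from litdata import StreamingDataLoader
#   train_dataloader = StreamingDataLoader(dataset, batch_size=8, num_workers=4)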
|