mtasic85 committed on
Commit
f6fa207
·
1 Parent(s): 82c786e

contrain instruct datasets

Browse files
scripts/prepare_contrain_datasets.py CHANGED
@@ -1,5 +1,6 @@
1
  from functools import partial
2
 
 
3
  from litdata import optimize, TokensLoader, StreamingDataset
4
  from transformers import AutoTokenizer
5
 
@@ -16,7 +17,11 @@ chunk_size = block_size * 4000
16
  output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
17
 
18
  outputs = optimize(
19
- fn=partial(tokenize_chat_fn, tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)),
 
 
 
 
20
  inputs=contrain_datasets,
21
  output_dir=output_dir,
22
  chunk_size=chunk_size, # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
 
1
  from functools import partial
2
 
3
+ from litgpt.tokenizer import Tokenizer
4
  from litdata import optimize, TokensLoader, StreamingDataset
5
  from transformers import AutoTokenizer
6
 
 
17
  output_dir = f'../contrain-data-{i}-{block_size}-{chunk_size}'
18
 
19
  outputs = optimize(
20
+ fn=partial(
21
+ tokenize_chat_fn,
22
+ hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
23
+ tokenizer=Tokenizer('..'),
24
+ ),
25
  inputs=contrain_datasets,
26
  output_dir=output_dir,
27
  chunk_size=chunk_size, # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
scripts/utils.py CHANGED
@@ -123,9 +123,12 @@ def tokenize_text_fn(dataset_config: list, tokenizer: Tokenizer, min_len: Option
123
  yield text_ids
124
 
125
 
126
- def tokenize_chat_fn(dataset_config: list, tokenizer: AutoTokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
127
  for messages in batch_chat_iterator(dataset_config):
128
- text_ids: torch.Tensor = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt")
 
 
 
129
 
130
  if min_len is None and max_len is None:
131
  yield text_ids
 
123
  yield text_ids
124
 
125
 
126
+ def tokenize_chat_fn(dataset_config: list, hf_tokenizer: AutoTokenizer, tokenizer: Tokenizer, min_len: Optional[int]=None, max_len: Optional[int]=None) -> Iterator[torch.Tensor]:
127
  for messages in batch_chat_iterator(dataset_config):
128
+ # text_ids: torch.Tensor = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors='pt')
129
+ # text_ids = text_ids.to(torch.int)
130
+ text: str = hf_tokenizer.apply_chat_template(messages, tokenize=False)
131
+ text_ids: torch.Tensor = tokenizer.encode(text, bos=False, eos=False)
132
 
133
  if min_len is None and max_len is None:
134
  yield text_ids