mtasic85 committed on
Commit
27cd301
·
1 Parent(s): c1e1c58

tokenizer training

Browse files
Files changed (1) hide show
  1. scripts/pretrain_datasets.py +4 -0
scripts/pretrain_datasets.py CHANGED
@@ -42,6 +42,10 @@ pretrain_datasets = [
42
  # stem
43
  #
44
  # 1.44 GB, 63,357
 
 
 
 
45
  *[
46
  {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
47
  for i in range(0, 100, 5)
 
42
  # stem
43
  #
44
  # 1.44 GB, 63,357
45
+ *[
46
+ {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
47
+ for i in range(0, 100, 5)
48
+ ],
49
  *[
50
  {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
51
  for i in range(0, 100, 5)