mtasic85 committed
Commit 6527e94 · 1 Parent(s): ef3fdf7

pretrain dataset

scripts/prepare_contrain_dataset.py CHANGED
@@ -44,6 +44,7 @@ def batch_dict_iterator(path: Optional[str]=None,
         text: list[str] | str = []
 
         for m in n:
+            # ???
             fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
             text.append(fm)
 
@@ -173,12 +174,34 @@ datasets_configs = [
         for i in range(0, 100, 20)
     ],
 
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    # meta-math/MetaMathQA 395,000
+    # openbmb/UltraInteract_sft 288,579
+    # HuggingFaceH4/ultrachat_200k 207,865
+    # microsoft/orca-math-word-problems-200k 200,035
+    # HuggingFaceH4/ultrafeedback_binarized 187,405
+    # theblackcat102/evol-codealpaca-v1 111,272
+    # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+    # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+    [
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+
     #
     # math
     #
-    # 6.07 GB, 11,402,286
+    ## 6.07 GB, 11,402,286
+    # [
+    #     {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+    #     for i in range(0, 100, 10)
+    # ],
+    # 912 MB, 2,570,505
     [
-        {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+        {'path': 'ai2-adapt-dev/openmath-2-gsm8k', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
         for i in range(0, 100, 10)
     ],
 
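
Note: the two hunks above touch the ChatML-style formatting loop and the conversation configs. The sketch below (not part of the commit) illustrates how the ShareGPT-style 'transform' and the '<im_start>' template are expected to behave; the exact contents of roles_map and the final join between turns are assumptions, since neither is shown in this diff.

# Minimal sketch, not part of the commit.
def sharegpt_to_chat(msgs: list[dict]) -> list[dict]:
    # Mirrors the 'transform' lambda on mlabonne/open-perfectblend:
    # {'from': 'human', 'value': '2+2?'} -> {'role': 'user', 'content': '2+2?'}
    # roles_map contents are assumed here; the script defines its own mapping.
    roles_map = {'system': 'system', 'human': 'user', 'gpt': 'assistant'}
    return [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]

def render_chatml(messages: list[dict]) -> str:
    # Mirrors the loop in batch_dict_iterator: each message becomes
    # '<im_start>{role}\n{content}<im_end>'. Joining turns with '\n' is an
    # assumption; the diff does not show how the buffer is flattened.
    text: list[str] = []
    for m in messages:
        fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
        text.append(fm)
    return '\n'.join(text)

if __name__ == '__main__':
    raw = [{'from': 'human', 'value': 'Hi'}, {'from': 'gpt', 'value': 'Hello!'}]
    print(render_chatml(sharegpt_to_chat(raw)))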
 
scripts/prepare_pretrain_dataset.py CHANGED
@@ -97,7 +97,7 @@ datasets_configs = [
         # ~3 GB, 4,976,850
         {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
         for name in [
-            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+            # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
             'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
         ]
     ],
@@ -110,11 +110,11 @@ datasets_configs = [
     #     {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     #     for i in range(0, 100, 20)
     # ],
-    # 2.89 GB, 430,000, English September of 2017
-    [
-        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
-        for i in range(0, 100, 20)
-    ],
+    ## 2.89 GB, 430,000, English September of 2017
+    # [
+    #     {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
+    #     for i in range(0, 100, 20)
+    # ],
     # 65.1 MB, 7,819
     {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
 
@@ -127,16 +127,12 @@ datasets_configs = [
     #
     # math
     #
+    # 2.87 GB, 552,000 - images/text - we use only latex text, top 10%
+    {'path': 'OleehyO/latex-formulas', 'name': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
     # 12.2 MB, 500,000
-    [
-        {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]+test', 'format': '{instruction} = {output}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
-    [
-        {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]', 'format': '{expression} = {result}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
     ## 3.49 GB, 22,259,474
     # [
     #     {'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 20}%]+validation+test', 'format': '{instruction} . {output}'}
@@ -147,18 +143,20 @@ datasets_configs = [
     #     {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     #     for i in range(0, 100, 20)
     # ],
-    # 12.6 GB, 21,972,791 - we use 1M subset
+    # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
     [
         {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 20}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
         for i in range(0, 100, 20)
     ],
 
+    #
     # stem
-    # 1.44 GB, 63,357
-    [
-        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
-        for i in range(0, 100, 20)
-    ],
+    #
+    ## 1.44 GB, 63,357
+    # [
+    #     {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
+    #     for i in range(0, 100, 20)
+    # ],
 
     #
     # code
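
Note: for reference, the sketch below (not part of the commit) shows how a single datasets_configs entry could be consumed, assuming the standard Hugging Face datasets API (percentage slices in 'split', a 'refs/convert/parquet' revision) and that a str 'format' is filled with the row's columns while a callable 'format' receives the whole row. The helper name iter_config_texts is hypothetical; the real scripts use batch_dict_iterator, whose body is only partially visible in this diff.

# Minimal sketch, not part of the commit.
from typing import Any, Callable, Iterator

from datasets import load_dataset


def iter_config_texts(config: dict[str, Any]) -> Iterator[str]:
    # Hypothetical helper. 'split' supports HF percentage slicing such as
    # 'train_1M[0%:20%]', and 'revision' may point at 'refs/convert/parquet'.
    ds = load_dataset(
        config['path'],
        name=config.get('name'),
        data_dir=config.get('data_dir'),
        revision=config.get('revision'),
        split=config.get('split', 'train'),
    )

    fmt: str | Callable[[dict], str] = config['format']

    for row in ds:
        # Assumption: a str 'format' is filled from the row's columns,
        # a callable 'format' maps the row directly to text.
        yield fmt(row) if callable(fmt) else fmt.format(**row)


if __name__ == '__main__':
    cfg = {
        'path': 'nvidia/OpenMathInstruct-2',
        'split': 'train_1M[0%:1%]',
        'format': '{problem} {generated_solution} {expected_answer}',
    }
    for i, text in enumerate(iter_config_texts(cfg)):
        print(text[:120])
        if i >= 2:
            break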