pretrain dataset
scripts/prepare_contrain_dataset.py
CHANGED
@@ -44,6 +44,7 @@ def batch_dict_iterator(path: Optional[str]=None,
     text: list[str] | str = []
 
     for m in n:
+        # ???
         fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
         text.append(fm)
 
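For context, a minimal sketch of the turn formatting this hunk touches. Canonical ChatML delimiters are `<|im_start|>` and `<|im_end|>` (with pipes), while the script writes `<im_start>`/`<im_end>`; the new `# ???` marker may be flagging exactly that discrepancy. The helper below is illustrative, not the script's actual code:

```python
# Illustrative only: ChatML-style turn formatting with the canonical
# pipe-delimited tokens; the script itself uses '<im_start>'/'<im_end>'.
def format_turns(messages: list[dict]) -> str:
    return '\n'.join(
        f'<|im_start|>{m["role"]}\n{m["content"]}<|im_end|>'
        for m in messages
    )

print(format_turns([
    {'role': 'user', 'content': 'What is 2 + 2?'},
    {'role': 'assistant', 'content': '4'},
]))
```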
@@ -173,12 +174,34 @@ datasets_configs = [
         for i in range(0, 100, 20)
     ],
 
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    # meta-math/MetaMathQA 395,000
+    # openbmb/UltraInteract_sft 288,579
+    # HuggingFaceH4/ultrachat_200k 207,865
+    # microsoft/orca-math-word-problems-200k 200,035
+    # HuggingFaceH4/ultrafeedback_binarized 187,405
+    # theblackcat102/evol-codealpaca-v1 111,272
+    # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+    # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+    [
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+
     #
     # math
     #
-    # 6.07 GB, 11,402,286
+    ## 6.07 GB, 11,402,286
+    # [
+    #     {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+    #     for i in range(0, 100, 10)
+    # ],
+    # 912 MB, 2,570,505
     [
-        {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+        {'path': 'ai2-adapt-dev/openmath-2-gsm8k', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
         for i in range(0, 100, 10)
     ],
 
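The new `open-perfectblend` entry relies on a `roles_map` defined elsewhere in the script to convert ShareGPT-style `{'from': ..., 'value': ...}` turns into `{'role': ..., 'content': ...}` chat messages. A sketch of the idea, with the mapping itself an assumption (the script's own `roles_map` may differ):

```python
# Assumed mapping; the script defines its own roles_map elsewhere.
roles_map = {
    'system': 'system',
    'human': 'user',
    'gpt': 'assistant',
}

def to_chat(msgs: list[dict]) -> list[dict]:
    # ShareGPT rows store turns as {'from': ..., 'value': ...};
    # remap them to the {'role': ..., 'content': ...} chat schema.
    return [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]

print(to_chat([
    {'from': 'human', 'value': 'Hi!'},
    {'from': 'gpt', 'value': 'Hello, how can I help?'},
]))
```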
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -97,7 +97,7 @@ datasets_configs = [
     # ~3 GB, 4,976,850
     {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
     for name in [
-        'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+        # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
         'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
     ]
 ],
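Each config's `'format'` is either a `str.format` template (as in the taco-datasets entry above) or a callable that picks fields out of a row (as in the wikipedia entries below). The scripts' real consumption logic lives in `batch_dict_iterator`; the helper here is a hypothetical sketch of how either form might be applied:

```python
from typing import Callable, Union

def apply_format(fmt: Union[str, Callable], row: dict) -> str:
    # Callables receive the raw row; template strings are filled from
    # the row's columns, and str.format(**row) ignores extra columns.
    if callable(fmt):
        return fmt(row)
    return fmt.format(**row)

row = {'instruction': 'Add 2 and 3.', 'input': '', 'output': '5'}
print(apply_format('{instruction} {input} {output}', row))
print(apply_format(lambda n: n['output'], row))
```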
@@ -110,11 +110,11 @@ datasets_configs = [
     # {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     # for i in range(0, 100, 20)
     # ],
-    # 2.89 GB, 430,000, English September of 2017
-    [
-        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
-        for i in range(0, 100, 20)
-    ],
+    ## 2.89 GB, 430,000, English September of 2017
+    # [
+    #     {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
+    #     for i in range(0, 100, 20)
+    # ],
     # 65.1 MB, 7,819
     {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
 
@@ -127,16 +127,12 @@ datasets_configs = [
     #
     # math
     #
+    # 2.87 GB, 552,000 - images/text - we use only latex text, top 10%
+    {'path': 'OleehyO/latex-formulas', 'name': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
     # 12.2 MB, 500,000
-    [
-        {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]+test', 'format': '{instruction} = {output}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
-    [
-        {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]', 'format': '{expression} = {result}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
     ## 3.49 GB, 22,259,474
     # [
     #     {'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 20}%]+validation+test', 'format': '{instruction} . {output}'}
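The `'revision': 'refs/convert/parquet'` entries point the `datasets` library at the Hub's auto-converted Parquet branch rather than the repo's default revision, which avoids running a dataset's own loading script. A small illustrative load, matching the new simple-math entry:

```python
from datasets import load_dataset

# Load the Hub's auto-converted Parquet branch of fblgit/simple-math;
# 'train+test' concatenates both splits, as in the config entry above.
ds = load_dataset('fblgit/simple-math', revision='refs/convert/parquet', split='train+test')
print(ds[0])
```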
@@ -147,18 +143,20 @@ datasets_configs = [
     # {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     # for i in range(0, 100, 20)
     # ],
-    # 12.6 GB, 21,972,791 - we use 1M subset
+    # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
     [
         {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 20}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
         for i in range(0, 100, 20)
     ],
 
+    #
     # stem
-    #
-    [
-        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
-        for i in range(0, 100, 20)
-    ],
+    #
+    ## 1.44 GB, 63,357
+    # [
+    #     {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
+    #     for i in range(0, 100, 20)
+    # ],
 
     #
     # code
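Throughout both scripts, large datasets are carved into 10% or 20% shards with the `datasets` percent-slicing split syntax, presumably so each shard can be loaded and tokenized independently. A sketch using the OpenMathInstruct-2 entry this commit keeps:

```python
from datasets import load_dataset

# 20% shards of the 1M-row subset via percent slicing, mirroring
# f'train_1M[{i}%:{i + 20}%]' in the config above.
for i in range(0, 100, 20):
    ds = load_dataset('nvidia/OpenMathInstruct-2', split=f'train_1M[{i}%:{i + 20}%]')
    row = ds[0]
    print('{problem} {generated_solution} {expected_answer}'.format(**row)[:80])
```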