pretrain dataset
scripts/prepare_contrain_dataset.py
CHANGED
@@ -44,6 +44,7 @@ def batch_dict_iterator(path: Optional[str]=None,
     text: list[str] | str = []
 
     for m in n:
+        # ???
         fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
         text.append(fm)
 
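For context, a minimal sketch of the turn formatting this hunk touches. Canonical ChatML delimiters are `<|im_start|>` and `<|im_end|>` (with pipes), while the script writes `<im_start>`/`<im_end>`; the new `# ???` marker may be flagging exactly that discrepancy. The helper below is illustrative, not the script's actual code:

```python
# Illustrative only: ChatML-style turn formatting with the canonical
# pipe-delimited tokens; the script itself uses '<im_start>'/'<im_end>'.
def format_turns(messages: list[dict]) -> str:
    return '\n'.join(
        f'<|im_start|>{m["role"]}\n{m["content"]}<|im_end|>'
        for m in messages
    )

print(format_turns([
    {'role': 'user', 'content': 'What is 2 + 2?'},
    {'role': 'assistant', 'content': '4'},
]))
```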
@@ -173,12 +174,34 @@ datasets_configs = [
         for i in range(0, 100, 20)
     ],
 
+    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
+    # meta-math/MetaMathQA 395,000
+    # openbmb/UltraInteract_sft 288,579
+    # HuggingFaceH4/ultrachat_200k 207,865
+    # microsoft/orca-math-word-problems-200k 200,035
+    # HuggingFaceH4/ultrafeedback_binarized 187,405
+    # theblackcat102/evol-codealpaca-v1 111,272
+    # Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
+    # mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
+    [
+        {'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
+            {'role': roles_map[m['from']], 'content': m['value']}
+            for m in msgs
+        ]}
+        for i in range(0, 100, 20)
+    ],
+
     #
     # math
     #
-    # 6.07 GB, 11,402,286
+    ## 6.07 GB, 11,402,286
+    # [
+    #     {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+    #     for i in range(0, 100, 10)
+    # ],
+    # 912 MB, 2,570,505
     [
-        {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
+        {'path': 'ai2-adapt-dev/openmath-2-gsm8k', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
         for i in range(0, 100, 10)
     ],
 
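The new `open-perfectblend` entry relies on a `roles_map` defined elsewhere in the script to convert ShareGPT-style `{'from': ..., 'value': ...}` turns into `{'role': ..., 'content': ...}` chat messages. A sketch of the idea, with the mapping itself an assumption (the script's own `roles_map` may differ):

```python
# Assumed mapping; the script defines its own roles_map elsewhere.
roles_map = {
    'system': 'system',
    'human': 'user',
    'gpt': 'assistant',
}

def to_chat(msgs: list[dict]) -> list[dict]:
    # ShareGPT rows store turns as {'from': ..., 'value': ...};
    # remap them to the {'role': ..., 'content': ...} chat schema.
    return [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]

print(to_chat([
    {'from': 'human', 'value': 'Hi!'},
    {'from': 'gpt', 'value': 'Hello, how can I help?'},
]))
```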
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -97,7 +97,7 @@ datasets_configs = [
     # ~3 GB, 4,976,850
     {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
     for name in [
-        'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+        # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
         'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
     ]
 ],
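Each config's `'format'` is either a `str.format` template (as in the taco-datasets entry above) or a callable that picks fields out of a row (as in the wikipedia entries below). The scripts' real consumption logic lives in `batch_dict_iterator`; the helper here is a hypothetical sketch of how either form might be applied:

```python
from typing import Callable, Union

def apply_format(fmt: Union[str, Callable], row: dict) -> str:
    # Callables receive the raw row; template strings are filled from
    # the row's columns, and str.format(**row) ignores extra columns.
    if callable(fmt):
        return fmt(row)
    return fmt.format(**row)

row = {'instruction': 'Add 2 and 3.', 'input': '', 'output': '5'}
print(apply_format('{instruction} {input} {output}', row))
print(apply_format(lambda n: n['output'], row))
```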
@@ -110,11 +110,11 @@ datasets_configs = [
     # {'path': 'wikimedia/wikipedia', 'name': '20231101.en', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     # for i in range(0, 100, 20)
     # ],
-    # 2.89 GB, 430,000, English September of 2017
-    [
-        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
-        for i in range(0, 100, 20)
-    ],
+    ## 2.89 GB, 430,000, English September of 2017
+    # [
+    #     {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['summary']}
+    #     for i in range(0, 100, 20)
+    # ],
     # 65.1 MB, 7,819
     {'path': 'Sketched33/Cities_Wikipedia_Information', 'format': lambda n: n['wikipedia_content']},
 
@@ -127,16 +127,12 @@ datasets_configs = [
     #
     # math
     #
+    # 2.87 GB, 552,000 - images/text - we use only latex text, top 10%
+    {'path': 'OleehyO/latex-formulas', 'name': 'cleaned_formulas', 'split': 'train[:10%]', 'format': lambda n: n['latex_formula']},
     # 12.2 MB, 500,000
-    [
-        {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]+test', 'format': '{instruction} = {output}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
     # 125 MB, 1,000,000
-    [
-        {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': f'train[{i}%:{i + 20}%]', 'format': '{expression} = {result}'}
-        for i in range(0, 100, 20)
-    ],
+    {'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
     ## 3.49 GB, 22,259,474
     # [
     #     {'path': 'AtlasUnified/atlas-math-sets', 'split': f'train[{i}%:{i + 20}%]+validation+test', 'format': '{instruction} . {output}'}
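The `'revision': 'refs/convert/parquet'` entries point the `datasets` library at the Hub's auto-converted Parquet branch rather than the repo's default revision, which avoids running a dataset's own loading script. A small illustrative load, matching the new simple-math entry:

```python
from datasets import load_dataset

# Load the Hub's auto-converted Parquet branch of fblgit/simple-math;
# 'train+test' concatenates both splits, as in the config entry above.
ds = load_dataset('fblgit/simple-math', revision='refs/convert/parquet', split='train+test')
print(ds[0])
```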
@@ -147,18 +143,20 @@ datasets_configs = [
     # {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
     # for i in range(0, 100, 20)
     # ],
-    # 12.6 GB, 21,972,791 - we use 1M subset
+    # 12.6 GB, 21,972,791 - we use 1M subset - 639 MB, 1,000,000
     [
         {'path': 'nvidia/OpenMathInstruct-2', 'split': f'train_1M[{i}%:{i + 20}%]', 'format': '{problem} {generated_solution} {expected_answer}'}
         for i in range(0, 100, 20)
     ],
 
+    #
     # stem
-    #
-    [
-        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
-        for i in range(0, 100, 20)
-    ],
+    #
+    ## 1.44 GB, 63,357
+    # [
+    #     {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['markdown']}
+    #     for i in range(0, 100, 20)
+    # ],
 
     #
     # code
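Throughout both scripts, large datasets are carved into 10% or 20% shards with the `datasets` percent-slicing split syntax, presumably so each shard can be loaded and tokenized independently. A sketch using the OpenMathInstruct-2 entry this commit keeps:

```python
from datasets import load_dataset

# 20% shards of the 1M-row subset via percent slicing, mirroring
# f'train_1M[{i}%:{i + 20}%]' in the config above.
for i in range(0, 100, 20):
    ds = load_dataset('nvidia/OpenMathInstruct-2', split=f'train_1M[{i}%:{i + 20}%]')
    row = ds[0]
    print('{problem} {generated_solution} {expected_answer}'.format(**row)[:80])
```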