---
license: apache-2.0
base_model: tangledgroup/tangled-llama-33m-32k-base-v0.1
pipeline_tag: text-generation
library_name: transformers
language: ['en', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zu']
datasets: [
  'Replete-AI/Everything_Instruct_Multilingual',
  'HuggingFaceH4/ultrachat_200k',
  'HuggingFaceH4/no_robots',
  'datatab/ultrachat_200k_serbian',
  'datatab/ultrafeedback_binarized_serbian',
  'datatab/alpaca-cleaned-serbian-full',
  'datatab/orca_math_world_problem_200k_serbian',
  'datatab/open-orca-slim-serbian',
]
tags:
- litgpt
- litdata
---

# tangled-llama-33m-32k-instruct-v0.1

![logo](./misc/logo.png)

A pretrained language model based on the Llama architecture with about **33M** parameters. It has been trained on **4.2B** (`4,252,334,823`) tokens from more than **6.2M** (`6,271,145`) dataset rows.
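
The checkpoint loads like any other Llama-style causal language model via 🤗 Transformers. The snippet below is a minimal smoke-test sketch, assuming the repository id matches this card's name:

```python
# Minimal smoke test with Hugging Face Transformers.
# The repository id is assumed to match this model card's name.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = 'tangledgroup/tangled-llama-33m-32k-instruct-v0.1'  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Plain text-generation pipeline; quality will be limited, see the note below.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
print(generator('The capital of France is', max_new_tokens=32)[0]['generated_text'])
```

For chat-style prompting, apply the tokenizer's chat template (if one is bundled with the checkpoint) instead of passing raw text.
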
This model **isn't** designed for immediate use; it is intended as a base for continued pretraining and finetuning on a downstream task. It supports a context length of up to **32K** (`32,768`) tokens and was pretrained with sequences of **32K** (`32,768`) tokens.
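
As an illustration of that intended workflow, a continued-finetuning run with the Hugging Face `Trainer` might look roughly like the sketch below. The dataset choice (one of the instruction datasets listed in the metadata), the naive chat flattening, the sequence length, and the hyperparameters are all assumptions for this example; they are not the recipe used to train this model, which was trained with litgpt/litdata.

```python
# Illustrative sketch: continued finetuning with the Hugging Face Trainer.
# Dataset, preprocessing, and hyperparameters are assumptions, not this model's recipe.
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

model_id = 'tangledgroup/tangled-llama-33m-32k-instruct-v0.1'  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # needed for padded batches
model = AutoModelForCausalLM.from_pretrained(model_id)

# One of the datasets listed above; split/column names may differ, check the dataset card.
dataset = load_dataset('HuggingFaceH4/no_robots', split='train')

def flatten(example):
    # Naive chat flattening; replace with a proper chat template for real use.
    return {'text': '\n'.join(m['content'] for m in example['messages'])}

def tokenize(example):
    return tokenizer(example['text'], truncation=True, max_length=2048)

dataset = dataset.map(flatten)
dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='out/finetune', per_device_train_batch_size=4,
                           num_train_epochs=1, bf16=True, logging_steps=50),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```

For sequences approaching the full 32K context, gradient checkpointing and a much smaller batch size would likely be needed.
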
The objective is to streamline the model's cognitive and reasoning core while eliminating redundant knowledge.

Weights & Biases training reports:

- [loss, val_loss](https://api.wandb.ai/links/mtasic85/rx2cm1ip)
- [val_ppl](https://api.wandb.ai/links/mtasic85/okegm8vs)
- [epoch](https://api.wandb.ai/links/mtasic85/t5lojxa6)
- [learning_rate](https://api.wandb.ai/links/mtasic85/033xhutk)

## lm-evaluation-harness

```bash
litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-0/' --batch_size 4 --dtype 'bfloat16' out/contrain/final/
```

| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|-------------------------------------------------------------|-------|------|-----:|------------------------|---|-----:|---|------|
|leaderboard | N/A| | | | | | | |
| - leaderboard_bbh | N/A| | | | | | | |
|  - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm |↑ |0.4600|± |0.0316|
|  - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm |↑ |0.5187|± |0.0366|
|  - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm |↑ |0.1560|± |0.0230|
|  - leaderboard_bbh_disambiguation_qa | 1|none | 3|acc_norm |↑ |0.3000|± |0.0290|
|  - leaderboard_bbh_formal_fallacies | 1|none | 3|acc_norm |↑ |0.4680|± |0.0316|
|  - leaderboard_bbh_geometric_shapes | 1|none | 3|acc_norm |↑ |0.0920|± |0.0183|
|  - leaderboard_bbh_hyperbaton | 1|none | 3|acc_norm |↑ |0.5320|± |0.0316|
|  - leaderboard_bbh_logical_deduction_five_objects | 1|none | 3|acc_norm |↑ |0.2240|± |0.0264|
|  - leaderboard_bbh_logical_deduction_seven_objects | 1|none | 3|acc_norm |↑ |0.1600|± |0.0232|
|  - leaderboard_bbh_logical_deduction_three_objects | 1|none | 3|acc_norm |↑ |0.3360|± |0.0299|
|  - leaderboard_bbh_movie_recommendation | 1|none | 3|acc_norm |↑ |0.0640|± |0.0155|
|  - leaderboard_bbh_navigate | 1|none | 3|acc_norm |↑ |0.4200|± |0.0313|
|  - leaderboard_bbh_object_counting | 1|none | 3|acc_norm |↑ |0.0640|± |0.0155|
|  - leaderboard_bbh_penguins_in_a_table | 1|none | 3|acc_norm |↑ |0.2397|± |0.0355|
|  - leaderboard_bbh_reasoning_about_colored_objects | 1|none | 3|acc_norm |↑ |0.1520|± |0.0228|
|  - leaderboard_bbh_ruin_names | 1|none | 3|acc_norm |↑ |0.2720|± |0.0282|
|  - leaderboard_bbh_salient_translation_error_detection | 1|none | 3|acc_norm |↑ |0.1240|± |0.0209|
|  - leaderboard_bbh_snarks | 1|none | 3|acc_norm |↑ |0.5618|± |0.0373|
|  - leaderboard_bbh_sports_understanding | 1|none | 3|acc_norm |↑ |0.4600|± |0.0316|
|  - leaderboard_bbh_temporal_sequences | 1|none | 3|acc_norm |↑ |0.2920|± |0.0288|
|  - leaderboard_bbh_tracking_shuffled_objects_five_objects | 1|none | 3|acc_norm |↑ |0.2040|± |0.0255|
|  - leaderboard_bbh_tracking_shuffled_objects_seven_objects | 1|none | 3|acc_norm |↑ |0.1200|± |0.0206|
|  - leaderboard_bbh_tracking_shuffled_objects_three_objects | 1|none | 3|acc_norm |↑ |0.3160|± |0.0295|
|  - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm |↑ |0.4880|± |0.0317|
| - leaderboard_gpqa | N/A| | | | | | | |
|  - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.1919|± |0.0281|
|  - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2289|± |0.0180|
|  - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2768|± |0.0212|
| - leaderboard_ifeval | 3|none | 0|inst_level_loose_acc |↑ |0.2098|± | N/A|
| | |none | 0|inst_level_strict_acc |↑ |0.1966|± | N/A|
| | |none | 0|prompt_level_loose_acc |↑ |0.1109|± |0.0135|
| | |none | 0|prompt_level_strict_acc|↑ |0.1072|± |0.0133|
| - leaderboard_math_hard | N/A| | | | | | | |
|  - leaderboard_math_algebra_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_counting_and_prob_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_geometry_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_intermediate_algebra_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_num_theory_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_prealgebra_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
|  - leaderboard_math_precalculus_hard | 1|none | 4|exact_match |↑ |0.0000|± | 0|
| - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.1184|± |0.0029|
| - leaderboard_musr | N/A| | | | | | | |
|  - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5000|± |0.0317|
|  - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.2578|± |0.0274|
|  - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2600|± |0.0278|

```bash
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-1/' --batch_size 4 --dtype 'bfloat16' out/contrain/final/
```

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|----------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_challenge | 1|none | 0|acc |↑ |0.1962|± |0.0116|
| | |none | 0|acc_norm |↑ |0.2483|± |0.0126|
|gsm8k | 3|flexible-extract| 5|exact_match|↑ |0.0008|± |0.0008|
| | |strict-match | 5|exact_match|↑ |0.0000|± |0.0000|
|hellaswag | 1|none | 0|acc |↑ |0.2594|± |0.0044|
| | |none | 0|acc_norm |↑ |0.2564|± |0.0044|
|mmlu | 2|none | |acc |↑ |0.2575|± |0.0037|
| - humanities | 2|none | |acc |↑ |0.2410|± |0.0062|
|  - formal_logic | 1|none | 0|acc |↑ |0.3016|± |0.0410|
|  - high_school_european_history | 1|none | 0|acc |↑ |0.1879|± |0.0305|
|  - high_school_us_history | 1|none | 0|acc |↑ |0.2794|± |0.0315|
|  - high_school_world_history | 1|none | 0|acc |↑ |0.2363|± |0.0277|
|  - international_law | 1|none | 0|acc |↑ |0.1240|± |0.0301|
|  - jurisprudence | 1|none | 0|acc |↑ |0.2315|± |0.0408|
|  - logical_fallacies | 1|none | 0|acc |↑ |0.2270|± |0.0329|
|  - moral_disputes | 1|none | 0|acc |↑ |0.2139|± |0.0221|
|  - moral_scenarios | 1|none | 0|acc |↑ |0.2715|± |0.0149|
|  - philosophy | 1|none | 0|acc |↑ |0.2572|± |0.0248|
|  - prehistory | 1|none | 0|acc |↑ |0.2500|± |0.0241|
|  - professional_law | 1|none | 0|acc |↑ |0.2334|± |0.0108|
|  - world_religions | 1|none | 0|acc |↑ |0.2281|± |0.0322|
| - other | 2|none | |acc |↑ |0.2379|± |0.0076|
|  - business_ethics | 1|none | 0|acc |↑ |0.2800|± |0.0451|
|  - clinical_knowledge | 1|none | 0|acc |↑ |0.2943|± |0.0280|
|  - college_medicine | 1|none | 0|acc |↑ |0.2832|± |0.0344|
|  - global_facts | 1|none | 0|acc |↑ |0.1900|± |0.0394|
|  - human_aging | 1|none | 0|acc |↑ |0.1794|± |0.0257|
|  - management | 1|none | 0|acc |↑ |0.3398|± |0.0469|
|  - marketing | 1|none | 0|acc |↑ |0.1838|± |0.0254|
|  - medical_genetics | 1|none | 0|acc |↑ |0.1900|± |0.0394|
|  - miscellaneous | 1|none | 0|acc |↑ |0.2171|± |0.0147|
|  - nutrition | 1|none | 0|acc |↑ |0.2549|± |0.0250|
|  - professional_accounting | 1|none | 0|acc |↑ |0.2447|± |0.0256|
|  - professional_medicine | 1|none | 0|acc |↑ |0.3162|± |0.0282|
|  - virology | 1|none | 0|acc |↑ |0.1506|± |0.0278|
| - social sciences | 2|none | |acc |↑ |0.2899|± |0.0082|
|  - econometrics | 1|none | 0|acc |↑ |0.2807|± |0.0423|
|  - high_school_geography | 1|none | 0|acc |↑ |0.3030|± |0.0327|
|  - high_school_government_and_politics | 1|none | 0|acc |↑ |0.3109|± |0.0334|
|  - high_school_macroeconomics | 1|none | 0|acc |↑ |0.3103|± |0.0235|
|  - high_school_microeconomics | 1|none | 0|acc |↑ |0.2563|± |0.0284|
|  - high_school_psychology | 1|none | 0|acc |↑ |0.3193|± |0.0200|
|  - human_sexuality | 1|none | 0|acc |↑ |0.2824|± |0.0395|
|  - professional_psychology | 1|none | 0|acc |↑ |0.2451|± |0.0174|
|  - public_relations | 1|none | 0|acc |↑ |0.2818|± |0.0431|
|  - security_studies | 1|none | 0|acc |↑ |0.3592|± |0.0307|
|  - sociology | 1|none | 0|acc |↑ |0.2488|± |0.0306|
|  - us_foreign_policy | 1|none | 0|acc |↑ |0.2800|± |0.0451|
| - stem | 2|none | |acc |↑ |0.2699|± |0.0079|
|  - abstract_algebra | 1|none | 0|acc |↑ |0.1900|± |0.0394|
|  - anatomy | 1|none | 0|acc |↑ |0.2444|± |0.0371|
|  - astronomy | 1|none | 0|acc |↑ |0.2961|± |0.0372|
|  - college_biology | 1|none | 0|acc |↑ |0.2431|± |0.0359|
|  - college_chemistry | 1|none | 0|acc |↑ |0.3500|± |0.0479|
|  - college_computer_science | 1|none | 0|acc |↑ |0.2700|± |0.0446|
|  - college_mathematics | 1|none | 0|acc |↑ |0.2400|± |0.0429|
|  - college_physics | 1|none | 0|acc |↑ |0.3627|± |0.0478|
|  - computer_security | 1|none | 0|acc |↑ |0.1900|± |0.0394|
|  - conceptual_physics | 1|none | 0|acc |↑ |0.2766|± |0.0292|
|  - electrical_engineering | 1|none | 0|acc |↑ |0.2621|± |0.0366|
|  - elementary_mathematics | 1|none | 0|acc |↑ |0.2725|± |0.0229|
|  - high_school_biology | 1|none | 0|acc |↑ |0.3065|± |0.0262|
|  - high_school_chemistry | 1|none | 0|acc |↑ |0.2562|± |0.0307|
|  - high_school_computer_science | 1|none | 0|acc |↑ |0.2200|± |0.0416|
|  - high_school_mathematics | 1|none | 0|acc |↑ |0.2630|± |0.0268|
|  - high_school_physics | 1|none | 0|acc |↑ |0.2649|± |0.0360|
|  - high_school_statistics | 1|none | 0|acc |↑ |0.3380|± |0.0323|
|  - machine_learning | 1|none | 0|acc |↑ |0.1607|± |0.0349|
|truthfulqa_mc2 | 2|none | 0|acc |↑ |0.4992|± |0.0163|
|winogrande | 1|none | 0|acc |↑ |0.5075|± |0.0141|

| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu | 2|none | |acc |↑ |0.2575|± |0.0037|
| - humanities | 2|none | |acc |↑ |0.2410|± |0.0062|
| - other | 2|none | |acc |↑ |0.2379|± |0.0076|
| - social sciences| 2|none | |acc |↑ |0.2899|± |0.0082|
| - stem | 2|none | |acc |↑ |0.2699|± |0.0079|