lm1-misc-pile/3b977b77b/evaluation/lm1-3b9-77b-results_lm-eval_global_step73814_2022-12-02-14-09-22.csv
task,metric,value,err,version
copa,acc,0.74,0.044084400227680794,0
hendrycksTest-abstract_algebra,acc,0.22,0.04163331998932268,0
hendrycksTest-abstract_algebra,acc_norm,0.26,0.04408440022768079,0
hendrycksTest-anatomy,acc,0.28888888888888886,0.03915450630414251,0
hendrycksTest-anatomy,acc_norm,0.22962962962962963,0.036333844140734636,0
hendrycksTest-astronomy,acc,0.21710526315789475,0.03355045304882921,0
hendrycksTest-astronomy,acc_norm,0.3355263157894737,0.03842498559395271,0
hendrycksTest-business_ethics,acc,0.34,0.04760952285695235,0
hendrycksTest-business_ethics,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-clinical_knowledge,acc,0.24528301886792453,0.026480357179895678,0
hendrycksTest-clinical_knowledge,acc_norm,0.30943396226415093,0.02845015479411863,0
hendrycksTest-college_biology,acc,0.2152777777777778,0.034370793441061344,0
hendrycksTest-college_biology,acc_norm,0.2222222222222222,0.034765901043041336,0
hendrycksTest-college_chemistry,acc,0.26,0.04408440022768078,0
hendrycksTest-college_chemistry,acc_norm,0.29,0.045604802157206845,0
hendrycksTest-college_computer_science,acc,0.29,0.04560480215720684,0
hendrycksTest-college_computer_science,acc_norm,0.24,0.04292346959909284,0
hendrycksTest-college_mathematics,acc,0.2,0.04020151261036845,0
hendrycksTest-college_mathematics,acc_norm,0.3,0.046056618647183814,0
hendrycksTest-college_medicine,acc,0.2543352601156069,0.0332055644308557,0
hendrycksTest-college_medicine,acc_norm,0.2543352601156069,0.0332055644308557,0
hendrycksTest-college_physics,acc,0.2549019607843137,0.043364327079931764,0
hendrycksTest-college_physics,acc_norm,0.28431372549019607,0.04488482852329017,0
hendrycksTest-computer_security,acc,0.28,0.04512608598542126,0
hendrycksTest-computer_security,acc_norm,0.36,0.048241815132442176,0
hendrycksTest-conceptual_physics,acc,0.2553191489361702,0.028504856470514203,0
hendrycksTest-conceptual_physics,acc_norm,0.1829787234042553,0.025276041000449966,0
hendrycksTest-econometrics,acc,0.21929824561403508,0.03892431106518753,0
hendrycksTest-econometrics,acc_norm,0.21929824561403508,0.03892431106518754,0
hendrycksTest-electrical_engineering,acc,0.2689655172413793,0.036951833116502325,0
hendrycksTest-electrical_engineering,acc_norm,0.30344827586206896,0.038312260488503336,0
hendrycksTest-elementary_mathematics,acc,0.21957671957671956,0.02132001859977036,0
hendrycksTest-elementary_mathematics,acc_norm,0.25925925925925924,0.022569897074918407,0
hendrycksTest-formal_logic,acc,0.29365079365079366,0.04073524322147127,0
hendrycksTest-formal_logic,acc_norm,0.23809523809523808,0.038095238095238126,0
hendrycksTest-global_facts,acc,0.19,0.039427724440366234,0
hendrycksTest-global_facts,acc_norm,0.2,0.04020151261036846,0
hendrycksTest-high_school_biology,acc,0.23548387096774193,0.02413763242933771,0
hendrycksTest-high_school_biology,acc_norm,0.3032258064516129,0.026148685930671746,0
hendrycksTest-high_school_chemistry,acc,0.2019704433497537,0.028247350122180277,0
hendrycksTest-high_school_chemistry,acc_norm,0.270935960591133,0.031270907132976984,0
hendrycksTest-high_school_computer_science,acc,0.23,0.04229525846816506,0
hendrycksTest-high_school_computer_science,acc_norm,0.28,0.04512608598542128,0
hendrycksTest-high_school_european_history,acc,0.24848484848484848,0.03374402644139406,0
hendrycksTest-high_school_european_history,acc_norm,0.3090909090909091,0.036085410115739666,0
hendrycksTest-high_school_geography,acc,0.18181818181818182,0.027479603010538787,0
hendrycksTest-high_school_geography,acc_norm,0.2878787878787879,0.03225883512300993,0
hendrycksTest-high_school_government_and_politics,acc,0.21243523316062177,0.02951928261681725,0
hendrycksTest-high_school_government_and_politics,acc_norm,0.2538860103626943,0.03141024780565318,0
hendrycksTest-high_school_macroeconomics,acc,0.2358974358974359,0.02152596540740873,0
hendrycksTest-high_school_macroeconomics,acc_norm,0.27692307692307694,0.022688042352424994,0
hendrycksTest-high_school_mathematics,acc,0.1925925925925926,0.024043075181945192,0
hendrycksTest-high_school_mathematics,acc_norm,0.21481481481481482,0.025040443877000686,0
hendrycksTest-high_school_microeconomics,acc,0.24369747899159663,0.027886828078380558,0
hendrycksTest-high_school_microeconomics,acc_norm,0.29831932773109243,0.02971914287634287,0
hendrycksTest-high_school_physics,acc,0.19205298013245034,0.032162984205936156,0
hendrycksTest-high_school_physics,acc_norm,0.25165562913907286,0.03543304234389985,0
hendrycksTest-high_school_psychology,acc,0.22935779816513763,0.018025349724618684,0
hendrycksTest-high_school_psychology,acc_norm,0.24036697247706423,0.01832060732096407,0
hendrycksTest-high_school_statistics,acc,0.23148148148148148,0.028765111718046976,0
hendrycksTest-high_school_statistics,acc_norm,0.28703703703703703,0.030851992993257017,0
hendrycksTest-high_school_us_history,acc,0.22058823529411764,0.02910225438967409,0
hendrycksTest-high_school_us_history,acc_norm,0.2647058823529412,0.0309645179269234,0
hendrycksTest-high_school_world_history,acc,0.270042194092827,0.028900721906293426,0
hendrycksTest-high_school_world_history,acc_norm,0.3037974683544304,0.029936696387138605,0
hendrycksTest-human_aging,acc,0.3094170403587444,0.031024411740572206,0
hendrycksTest-human_aging,acc_norm,0.22869955156950672,0.028188240046929193,0
hendrycksTest-human_sexuality,acc,0.42748091603053434,0.04338920305792401,0
hendrycksTest-human_sexuality,acc_norm,0.31297709923664124,0.04066962905677698,0
hendrycksTest-international_law,acc,0.23140495867768596,0.03849856098794089,0
hendrycksTest-international_law,acc_norm,0.4462809917355372,0.0453793517794788,0
hendrycksTest-jurisprudence,acc,0.3148148148148148,0.04489931073591312,0
hendrycksTest-jurisprudence,acc_norm,0.42592592592592593,0.0478034362693679,0
hendrycksTest-logical_fallacies,acc,0.25766871165644173,0.03436150827846917,0
hendrycksTest-logical_fallacies,acc_norm,0.3067484662576687,0.036230899157241474,0
hendrycksTest-machine_learning,acc,0.3125,0.043994650575715215,0
hendrycksTest-machine_learning,acc_norm,0.25892857142857145,0.04157751539865629,0
hendrycksTest-management,acc,0.27184466019417475,0.044052680241409216,0
hendrycksTest-management,acc_norm,0.33980582524271846,0.046897659372781335,0
hendrycksTest-marketing,acc,0.27350427350427353,0.029202540153431163,0
hendrycksTest-marketing,acc_norm,0.2905982905982906,0.029745048572674054,0
hendrycksTest-medical_genetics,acc,0.28,0.04512608598542127,0
hendrycksTest-medical_genetics,acc_norm,0.37,0.04852365870939099,0
hendrycksTest-miscellaneous,acc,0.26309067688378035,0.015745497169049046,0
hendrycksTest-miscellaneous,acc_norm,0.2656449553001277,0.01579430248788872,0
hendrycksTest-moral_disputes,acc,0.2658959537572254,0.02378620325550828,0
hendrycksTest-moral_disputes,acc_norm,0.3236994219653179,0.025190181327608408,0
hendrycksTest-moral_scenarios,acc,0.23910614525139665,0.014265554192331144,0
hendrycksTest-moral_scenarios,acc_norm,0.27262569832402234,0.014893391735249588,0
hendrycksTest-nutrition,acc,0.2581699346405229,0.025058503316958157,0
hendrycksTest-nutrition,acc_norm,0.3790849673202614,0.027780141207023334,0
hendrycksTest-philosophy,acc,0.24115755627009647,0.024296594034763426,0
hendrycksTest-philosophy,acc_norm,0.3086816720257235,0.026236965881153252,0
hendrycksTest-prehistory,acc,0.26851851851851855,0.024659685185967287,0
hendrycksTest-prehistory,acc_norm,0.21296296296296297,0.022779719088733396,0
hendrycksTest-professional_accounting,acc,0.2198581560283688,0.024706141070705474,0
hendrycksTest-professional_accounting,acc_norm,0.22695035460992907,0.024987106365642962,0
hendrycksTest-professional_law,acc,0.27509778357235987,0.011405443620996939,0
hendrycksTest-professional_law,acc_norm,0.288135593220339,0.011567140661324565,0
hendrycksTest-professional_medicine,acc,0.1801470588235294,0.023345163616544835,0
hendrycksTest-professional_medicine,acc_norm,0.2610294117647059,0.026679252270103124,0
hendrycksTest-professional_psychology,acc,0.24509803921568626,0.01740181671142766,0
hendrycksTest-professional_psychology,acc_norm,0.2581699346405229,0.017704531653250068,0
hendrycksTest-public_relations,acc,0.24545454545454545,0.04122066502878285,0
hendrycksTest-public_relations,acc_norm,0.15454545454545454,0.03462262571262667,0
hendrycksTest-security_studies,acc,0.32653061224489793,0.030021056238440313,0
hendrycksTest-security_studies,acc_norm,0.2693877551020408,0.02840125202902294,0
hendrycksTest-sociology,acc,0.24378109452736318,0.030360490154014645,0
hendrycksTest-sociology,acc_norm,0.2736318407960199,0.03152439186555402,0
hendrycksTest-us_foreign_policy,acc,0.34,0.04760952285695235,0
hendrycksTest-us_foreign_policy,acc_norm,0.36,0.04824181513244218,0
hendrycksTest-virology,acc,0.3132530120481928,0.036108050180310235,0
hendrycksTest-virology,acc_norm,0.2710843373493976,0.034605799075530276,0
hendrycksTest-world_religions,acc,0.3157894736842105,0.03565079670708311,0
hendrycksTest-world_religions,acc_norm,0.38596491228070173,0.03733756969066164,0
piqa,acc,0.705658324265506,0.01063331147034749,0
piqa,acc_norm,0.7121871599564744,0.01056325038305919,0
rte,acc,0.5379061371841155,0.030009848912529117,0
winogrande,acc,0.5564325177584846,0.0139626949076204,0