{
"results": {
"copa": {
"acc": 0.74,
"acc_stderr": 0.044084400227680794
},
"piqa": {
"acc": 0.705658324265506,
"acc_stderr": 0.01063331147034749,
"acc_norm": 0.7121871599564744,
"acc_norm_stderr": 0.01056325038305919
},
"rte": {
"acc": 0.5379061371841155,
"acc_stderr": 0.030009848912529117
},
"winogrande": {
"acc": 0.5564325177584846,
"acc_stderr": 0.0139626949076204
},
"hendrycksTest-abstract_algebra": {
"acc": 0.22,
"acc_stderr": 0.04163331998932268,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768079
},
"hendrycksTest-anatomy": {
"acc": 0.28888888888888886,
"acc_stderr": 0.03915450630414251,
"acc_norm": 0.22962962962962963,
"acc_norm_stderr": 0.036333844140734636
},
"hendrycksTest-astronomy": {
"acc": 0.21710526315789475,
"acc_stderr": 0.03355045304882921,
"acc_norm": 0.3355263157894737,
"acc_norm_stderr": 0.03842498559395271
},
"hendrycksTest-business_ethics": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.24528301886792453,
"acc_stderr": 0.026480357179895678,
"acc_norm": 0.30943396226415093,
"acc_norm_stderr": 0.02845015479411863
},
"hendrycksTest-college_biology": {
"acc": 0.2152777777777778,
"acc_stderr": 0.034370793441061344,
"acc_norm": 0.2222222222222222,
"acc_norm_stderr": 0.034765901043041336
},
"hendrycksTest-college_chemistry": {
"acc": 0.26,
"acc_stderr": 0.04408440022768078,
"acc_norm": 0.29,
"acc_norm_stderr": 0.045604802157206845
},
"hendrycksTest-college_computer_science": {
"acc": 0.29,
"acc_stderr": 0.04560480215720684,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909284
},
"hendrycksTest-college_mathematics": {
"acc": 0.2,
"acc_stderr": 0.04020151261036845,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_medicine": {
"acc": 0.2543352601156069,
"acc_stderr": 0.0332055644308557,
"acc_norm": 0.2543352601156069,
"acc_norm_stderr": 0.0332055644308557
},
"hendrycksTest-college_physics": {
"acc": 0.2549019607843137,
"acc_stderr": 0.043364327079931764,
"acc_norm": 0.28431372549019607,
"acc_norm_stderr": 0.04488482852329017
},
"hendrycksTest-computer_security": {
"acc": 0.28,
"acc_stderr": 0.04512608598542126,
"acc_norm": 0.36,
"acc_norm_stderr": 0.048241815132442176
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2553191489361702,
"acc_stderr": 0.028504856470514203,
"acc_norm": 0.1829787234042553,
"acc_norm_stderr": 0.025276041000449966
},
"hendrycksTest-econometrics": {
"acc": 0.21929824561403508,
"acc_stderr": 0.03892431106518753,
"acc_norm": 0.21929824561403508,
"acc_norm_stderr": 0.03892431106518754
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2689655172413793,
"acc_stderr": 0.036951833116502325,
"acc_norm": 0.30344827586206896,
"acc_norm_stderr": 0.038312260488503336
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.21957671957671956,
"acc_stderr": 0.02132001859977036,
"acc_norm": 0.25925925925925924,
"acc_norm_stderr": 0.022569897074918407
},
"hendrycksTest-formal_logic": {
"acc": 0.29365079365079366,
"acc_stderr": 0.04073524322147127,
"acc_norm": 0.23809523809523808,
"acc_norm_stderr": 0.038095238095238126
},
"hendrycksTest-global_facts": {
"acc": 0.19,
"acc_stderr": 0.039427724440366234,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036846
},
"hendrycksTest-high_school_biology": {
"acc": 0.23548387096774193,
"acc_stderr": 0.02413763242933771,
"acc_norm": 0.3032258064516129,
"acc_norm_stderr": 0.026148685930671746
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2019704433497537,
"acc_stderr": 0.028247350122180277,
"acc_norm": 0.270935960591133,
"acc_norm_stderr": 0.031270907132976984
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542128
},
"hendrycksTest-high_school_european_history": {
"acc": 0.24848484848484848,
"acc_stderr": 0.03374402644139406,
"acc_norm": 0.3090909090909091,
"acc_norm_stderr": 0.036085410115739666
},
"hendrycksTest-high_school_geography": {
"acc": 0.18181818181818182,
"acc_stderr": 0.027479603010538787,
"acc_norm": 0.2878787878787879,
"acc_norm_stderr": 0.03225883512300993
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.21243523316062177,
"acc_stderr": 0.02951928261681725,
"acc_norm": 0.2538860103626943,
"acc_norm_stderr": 0.03141024780565318
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2358974358974359,
"acc_stderr": 0.02152596540740873,
"acc_norm": 0.27692307692307694,
"acc_norm_stderr": 0.022688042352424994
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.1925925925925926,
"acc_stderr": 0.024043075181945192,
"acc_norm": 0.21481481481481482,
"acc_norm_stderr": 0.025040443877000686
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.24369747899159663,
"acc_stderr": 0.027886828078380558,
"acc_norm": 0.29831932773109243,
"acc_norm_stderr": 0.02971914287634287
},
"hendrycksTest-high_school_physics": {
"acc": 0.19205298013245034,
"acc_stderr": 0.032162984205936156,
"acc_norm": 0.25165562913907286,
"acc_norm_stderr": 0.03543304234389985
},
"hendrycksTest-high_school_psychology": {
"acc": 0.22935779816513763,
"acc_stderr": 0.018025349724618684,
"acc_norm": 0.24036697247706423,
"acc_norm_stderr": 0.01832060732096407
},
"hendrycksTest-high_school_statistics": {
"acc": 0.23148148148148148,
"acc_stderr": 0.028765111718046976,
"acc_norm": 0.28703703703703703,
"acc_norm_stderr": 0.030851992993257017
},
"hendrycksTest-high_school_us_history": {
"acc": 0.22058823529411764,
"acc_stderr": 0.02910225438967409,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.0309645179269234
},
"hendrycksTest-high_school_world_history": {
"acc": 0.270042194092827,
"acc_stderr": 0.028900721906293426,
"acc_norm": 0.3037974683544304,
"acc_norm_stderr": 0.029936696387138605
},
"hendrycksTest-human_aging": {
"acc": 0.3094170403587444,
"acc_stderr": 0.031024411740572206,
"acc_norm": 0.22869955156950672,
"acc_norm_stderr": 0.028188240046929193
},
"hendrycksTest-human_sexuality": {
"acc": 0.42748091603053434,
"acc_stderr": 0.04338920305792401,
"acc_norm": 0.31297709923664124,
"acc_norm_stderr": 0.04066962905677698
},
"hendrycksTest-international_law": {
"acc": 0.23140495867768596,
"acc_stderr": 0.03849856098794089,
"acc_norm": 0.4462809917355372,
"acc_norm_stderr": 0.0453793517794788
},
"hendrycksTest-jurisprudence": {
"acc": 0.3148148148148148,
"acc_stderr": 0.04489931073591312,
"acc_norm": 0.42592592592592593,
"acc_norm_stderr": 0.0478034362693679
},
"hendrycksTest-logical_fallacies": {
"acc": 0.25766871165644173,
"acc_stderr": 0.03436150827846917,
"acc_norm": 0.3067484662576687,
"acc_norm_stderr": 0.036230899157241474
},
"hendrycksTest-machine_learning": {
"acc": 0.3125,
"acc_stderr": 0.043994650575715215,
"acc_norm": 0.25892857142857145,
"acc_norm_stderr": 0.04157751539865629
},
"hendrycksTest-management": {
"acc": 0.27184466019417475,
"acc_stderr": 0.044052680241409216,
"acc_norm": 0.33980582524271846,
"acc_norm_stderr": 0.046897659372781335
},
"hendrycksTest-marketing": {
"acc": 0.27350427350427353,
"acc_stderr": 0.029202540153431163,
"acc_norm": 0.2905982905982906,
"acc_norm_stderr": 0.029745048572674054
},
"hendrycksTest-medical_genetics": {
"acc": 0.28,
"acc_stderr": 0.04512608598542127,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"hendrycksTest-miscellaneous": {
"acc": 0.26309067688378035,
"acc_stderr": 0.015745497169049046,
"acc_norm": 0.2656449553001277,
"acc_norm_stderr": 0.01579430248788872
},
"hendrycksTest-moral_disputes": {
"acc": 0.2658959537572254,
"acc_stderr": 0.02378620325550828,
"acc_norm": 0.3236994219653179,
"acc_norm_stderr": 0.025190181327608408
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23910614525139665,
"acc_stderr": 0.014265554192331144,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-nutrition": {
"acc": 0.2581699346405229,
"acc_stderr": 0.025058503316958157,
"acc_norm": 0.3790849673202614,
"acc_norm_stderr": 0.027780141207023334
},
"hendrycksTest-philosophy": {
"acc": 0.24115755627009647,
"acc_stderr": 0.024296594034763426,
"acc_norm": 0.3086816720257235,
"acc_norm_stderr": 0.026236965881153252
},
"hendrycksTest-prehistory": {
"acc": 0.26851851851851855,
"acc_stderr": 0.024659685185967287,
"acc_norm": 0.21296296296296297,
"acc_norm_stderr": 0.022779719088733396
},
"hendrycksTest-professional_accounting": {
"acc": 0.2198581560283688,
"acc_stderr": 0.024706141070705474,
"acc_norm": 0.22695035460992907,
"acc_norm_stderr": 0.024987106365642962
},
"hendrycksTest-professional_law": {
"acc": 0.27509778357235987,
"acc_stderr": 0.011405443620996939,
"acc_norm": 0.288135593220339,
"acc_norm_stderr": 0.011567140661324565
},
"hendrycksTest-professional_medicine": {
"acc": 0.1801470588235294,
"acc_stderr": 0.023345163616544835,
"acc_norm": 0.2610294117647059,
"acc_norm_stderr": 0.026679252270103124
},
"hendrycksTest-professional_psychology": {
"acc": 0.24509803921568626,
"acc_stderr": 0.01740181671142766,
"acc_norm": 0.2581699346405229,
"acc_norm_stderr": 0.017704531653250068
},
"hendrycksTest-public_relations": {
"acc": 0.24545454545454545,
"acc_stderr": 0.04122066502878285,
"acc_norm": 0.15454545454545454,
"acc_norm_stderr": 0.03462262571262667
},
"hendrycksTest-security_studies": {
"acc": 0.32653061224489793,
"acc_stderr": 0.030021056238440313,
"acc_norm": 0.2693877551020408,
"acc_norm_stderr": 0.02840125202902294
},
"hendrycksTest-sociology": {
"acc": 0.24378109452736318,
"acc_stderr": 0.030360490154014645,
"acc_norm": 0.2736318407960199,
"acc_norm_stderr": 0.03152439186555402
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-virology": {
"acc": 0.3132530120481928,
"acc_stderr": 0.036108050180310235,
"acc_norm": 0.2710843373493976,
"acc_norm_stderr": 0.034605799075530276
},
"hendrycksTest-world_religions": {
"acc": 0.3157894736842105,
"acc_stderr": 0.03565079670708311,
"acc_norm": 0.38596491228070173,
"acc_norm_stderr": 0.03733756969066164
}
},
"versions": {
"copa": 0,
"piqa": 0,
"rte": 0,
"winogrande": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-management": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0
}
}