lm1-misc-pile/3b977b77b/evaluation/lm1-3b9-77b-results_lm-eval_global_step73814_2022-12-02-14-09-22.json
{
  "results": {
    "copa": {
      "acc": 0.74,
      "acc_stderr": 0.044084400227680794
    },
    "piqa": {
      "acc": 0.705658324265506,
      "acc_stderr": 0.01063331147034749,
      "acc_norm": 0.7121871599564744,
      "acc_norm_stderr": 0.01056325038305919
    },
    "rte": {
      "acc": 0.5379061371841155,
      "acc_stderr": 0.030009848912529117
    },
    "winogrande": {
      "acc": 0.5564325177584846,
      "acc_stderr": 0.0139626949076204
    },
    "hendrycksTest-abstract_algebra": {
      "acc": 0.22,
      "acc_stderr": 0.04163331998932268,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.04408440022768079
    },
    "hendrycksTest-anatomy": {
      "acc": 0.28888888888888886,
      "acc_stderr": 0.03915450630414251,
      "acc_norm": 0.22962962962962963,
      "acc_norm_stderr": 0.036333844140734636
    },
    "hendrycksTest-astronomy": {
      "acc": 0.21710526315789475,
      "acc_stderr": 0.03355045304882921,
      "acc_norm": 0.3355263157894737,
      "acc_norm_stderr": 0.03842498559395271
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695235,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.24528301886792453,
      "acc_stderr": 0.026480357179895678,
      "acc_norm": 0.30943396226415093,
      "acc_norm_stderr": 0.02845015479411863
    },
    "hendrycksTest-college_biology": {
      "acc": 0.2152777777777778,
      "acc_stderr": 0.034370793441061344,
      "acc_norm": 0.2222222222222222,
      "acc_norm_stderr": 0.034765901043041336
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.26,
      "acc_stderr": 0.04408440022768078,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.045604802157206845
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.29,
      "acc_stderr": 0.04560480215720684,
      "acc_norm": 0.24,
      "acc_norm_stderr": 0.04292346959909284
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.2,
      "acc_stderr": 0.04020151261036845,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.2543352601156069,
      "acc_stderr": 0.0332055644308557,
      "acc_norm": 0.2543352601156069,
      "acc_norm_stderr": 0.0332055644308557
    },
    "hendrycksTest-college_physics": {
      "acc": 0.2549019607843137,
      "acc_stderr": 0.043364327079931764,
      "acc_norm": 0.28431372549019607,
      "acc_norm_stderr": 0.04488482852329017
    },
    "hendrycksTest-computer_security": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542126,
      "acc_norm": 0.36,
      "acc_norm_stderr": 0.048241815132442176
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.2553191489361702,
      "acc_stderr": 0.028504856470514203,
      "acc_norm": 0.1829787234042553,
      "acc_norm_stderr": 0.025276041000449966
    },
    "hendrycksTest-econometrics": {
      "acc": 0.21929824561403508,
      "acc_stderr": 0.03892431106518753,
      "acc_norm": 0.21929824561403508,
      "acc_norm_stderr": 0.03892431106518754
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.2689655172413793,
      "acc_stderr": 0.036951833116502325,
      "acc_norm": 0.30344827586206896,
      "acc_norm_stderr": 0.038312260488503336
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.21957671957671956,
      "acc_stderr": 0.02132001859977036,
      "acc_norm": 0.25925925925925924,
      "acc_norm_stderr": 0.022569897074918407
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.29365079365079366,
      "acc_stderr": 0.04073524322147127,
      "acc_norm": 0.23809523809523808,
      "acc_norm_stderr": 0.038095238095238126
    },
    "hendrycksTest-global_facts": {
      "acc": 0.19,
      "acc_stderr": 0.039427724440366234,
      "acc_norm": 0.2,
      "acc_norm_stderr": 0.04020151261036846
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.23548387096774193,
      "acc_stderr": 0.02413763242933771,
      "acc_norm": 0.3032258064516129,
      "acc_norm_stderr": 0.026148685930671746
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.2019704433497537,
      "acc_stderr": 0.028247350122180277,
      "acc_norm": 0.270935960591133,
      "acc_norm_stderr": 0.031270907132976984
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816506,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542128
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.24848484848484848,
      "acc_stderr": 0.03374402644139406,
      "acc_norm": 0.3090909090909091,
      "acc_norm_stderr": 0.036085410115739666
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.18181818181818182,
      "acc_stderr": 0.027479603010538787,
      "acc_norm": 0.2878787878787879,
      "acc_norm_stderr": 0.03225883512300993
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.21243523316062177,
      "acc_stderr": 0.02951928261681725,
      "acc_norm": 0.2538860103626943,
      "acc_norm_stderr": 0.03141024780565318
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.2358974358974359,
      "acc_stderr": 0.02152596540740873,
      "acc_norm": 0.27692307692307694,
      "acc_norm_stderr": 0.022688042352424994
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.1925925925925926,
      "acc_stderr": 0.024043075181945192,
      "acc_norm": 0.21481481481481482,
      "acc_norm_stderr": 0.025040443877000686
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.24369747899159663,
      "acc_stderr": 0.027886828078380558,
      "acc_norm": 0.29831932773109243,
      "acc_norm_stderr": 0.02971914287634287
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.19205298013245034,
      "acc_stderr": 0.032162984205936156,
      "acc_norm": 0.25165562913907286,
      "acc_norm_stderr": 0.03543304234389985
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.22935779816513763,
      "acc_stderr": 0.018025349724618684,
      "acc_norm": 0.24036697247706423,
      "acc_norm_stderr": 0.01832060732096407
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.23148148148148148,
      "acc_stderr": 0.028765111718046976,
      "acc_norm": 0.28703703703703703,
      "acc_norm_stderr": 0.030851992993257017
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.22058823529411764,
      "acc_stderr": 0.02910225438967409,
      "acc_norm": 0.2647058823529412,
      "acc_norm_stderr": 0.0309645179269234
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.270042194092827,
      "acc_stderr": 0.028900721906293426,
      "acc_norm": 0.3037974683544304,
      "acc_norm_stderr": 0.029936696387138605
    },
    "hendrycksTest-human_aging": {
      "acc": 0.3094170403587444,
      "acc_stderr": 0.031024411740572206,
      "acc_norm": 0.22869955156950672,
      "acc_norm_stderr": 0.028188240046929193
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.42748091603053434,
      "acc_stderr": 0.04338920305792401,
      "acc_norm": 0.31297709923664124,
      "acc_norm_stderr": 0.04066962905677698
    },
    "hendrycksTest-international_law": {
      "acc": 0.23140495867768596,
      "acc_stderr": 0.03849856098794089,
      "acc_norm": 0.4462809917355372,
      "acc_norm_stderr": 0.0453793517794788
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.3148148148148148,
      "acc_stderr": 0.04489931073591312,
      "acc_norm": 0.42592592592592593,
      "acc_norm_stderr": 0.0478034362693679
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.25766871165644173,
      "acc_stderr": 0.03436150827846917,
      "acc_norm": 0.3067484662576687,
      "acc_norm_stderr": 0.036230899157241474
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.3125,
      "acc_stderr": 0.043994650575715215,
      "acc_norm": 0.25892857142857145,
      "acc_norm_stderr": 0.04157751539865629
    },
    "hendrycksTest-management": {
      "acc": 0.27184466019417475,
      "acc_stderr": 0.044052680241409216,
      "acc_norm": 0.33980582524271846,
      "acc_norm_stderr": 0.046897659372781335
    },
    "hendrycksTest-marketing": {
      "acc": 0.27350427350427353,
      "acc_stderr": 0.029202540153431163,
      "acc_norm": 0.2905982905982906,
      "acc_norm_stderr": 0.029745048572674054
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542127,
      "acc_norm": 0.37,
      "acc_norm_stderr": 0.04852365870939099
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.26309067688378035,
      "acc_stderr": 0.015745497169049046,
      "acc_norm": 0.2656449553001277,
      "acc_norm_stderr": 0.01579430248788872
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.2658959537572254,
      "acc_stderr": 0.02378620325550828,
      "acc_norm": 0.3236994219653179,
      "acc_norm_stderr": 0.025190181327608408
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23910614525139665,
      "acc_stderr": 0.014265554192331144,
      "acc_norm": 0.27262569832402234,
      "acc_norm_stderr": 0.014893391735249588
    },
    "hendrycksTest-nutrition": {
      "acc": 0.2581699346405229,
      "acc_stderr": 0.025058503316958157,
      "acc_norm": 0.3790849673202614,
      "acc_norm_stderr": 0.027780141207023334
    },
    "hendrycksTest-philosophy": {
      "acc": 0.24115755627009647,
      "acc_stderr": 0.024296594034763426,
      "acc_norm": 0.3086816720257235,
      "acc_norm_stderr": 0.026236965881153252
    },
    "hendrycksTest-prehistory": {
      "acc": 0.26851851851851855,
      "acc_stderr": 0.024659685185967287,
      "acc_norm": 0.21296296296296297,
      "acc_norm_stderr": 0.022779719088733396
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.2198581560283688,
      "acc_stderr": 0.024706141070705474,
      "acc_norm": 0.22695035460992907,
      "acc_norm_stderr": 0.024987106365642962
    },
    "hendrycksTest-professional_law": {
      "acc": 0.27509778357235987,
      "acc_stderr": 0.011405443620996939,
      "acc_norm": 0.288135593220339,
      "acc_norm_stderr": 0.011567140661324565
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.1801470588235294,
      "acc_stderr": 0.023345163616544835,
      "acc_norm": 0.2610294117647059,
      "acc_norm_stderr": 0.026679252270103124
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.24509803921568626,
      "acc_stderr": 0.01740181671142766,
      "acc_norm": 0.2581699346405229,
      "acc_norm_stderr": 0.017704531653250068
    },
    "hendrycksTest-public_relations": {
      "acc": 0.24545454545454545,
      "acc_stderr": 0.04122066502878285,
      "acc_norm": 0.15454545454545454,
      "acc_norm_stderr": 0.03462262571262667
    },
    "hendrycksTest-security_studies": {
      "acc": 0.32653061224489793,
      "acc_stderr": 0.030021056238440313,
      "acc_norm": 0.2693877551020408,
      "acc_norm_stderr": 0.02840125202902294
    },
    "hendrycksTest-sociology": {
      "acc": 0.24378109452736318,
      "acc_stderr": 0.030360490154014645,
      "acc_norm": 0.2736318407960199,
      "acc_norm_stderr": 0.03152439186555402
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695235,
      "acc_norm": 0.36,
      "acc_norm_stderr": 0.04824181513244218
    },
    "hendrycksTest-virology": {
      "acc": 0.3132530120481928,
      "acc_stderr": 0.036108050180310235,
      "acc_norm": 0.2710843373493976,
      "acc_norm_stderr": 0.034605799075530276
    },
    "hendrycksTest-world_religions": {
      "acc": 0.3157894736842105,
      "acc_stderr": 0.03565079670708311,
      "acc_norm": 0.38596491228070173,
      "acc_norm_stderr": 0.03733756969066164
    }
  },
  "versions": {
    "copa": 0,
    "piqa": 0,
    "rte": 0,
    "winogrande": 0,
    "hendrycksTest-abstract_algebra": 0,
    "hendrycksTest-anatomy": 0,
    "hendrycksTest-astronomy": 0,
    "hendrycksTest-business_ethics": 0,
    "hendrycksTest-clinical_knowledge": 0,
    "hendrycksTest-college_biology": 0,
    "hendrycksTest-college_chemistry": 0,
    "hendrycksTest-college_computer_science": 0,
    "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-college_medicine": 0,
    "hendrycksTest-college_physics": 0,
    "hendrycksTest-computer_security": 0,
    "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-econometrics": 0,
    "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-elementary_mathematics": 0,
    "hendrycksTest-formal_logic": 0,
    "hendrycksTest-global_facts": 0,
    "hendrycksTest-high_school_biology": 0,
    "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-high_school_computer_science": 0,
    "hendrycksTest-high_school_european_history": 0,
    "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-high_school_macroeconomics": 0,
    "hendrycksTest-high_school_mathematics": 0,
    "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-high_school_physics": 0,
    "hendrycksTest-high_school_psychology": 0,
    "hendrycksTest-high_school_statistics": 0,
    "hendrycksTest-high_school_us_history": 0,
    "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-human_aging": 0,
    "hendrycksTest-human_sexuality": 0,
    "hendrycksTest-international_law": 0,
    "hendrycksTest-jurisprudence": 0,
    "hendrycksTest-logical_fallacies": 0,
    "hendrycksTest-machine_learning": 0,
    "hendrycksTest-management": 0,
    "hendrycksTest-marketing": 0,
    "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-miscellaneous": 0,
    "hendrycksTest-moral_disputes": 0,
    "hendrycksTest-moral_scenarios": 0,
    "hendrycksTest-nutrition": 0,
    "hendrycksTest-philosophy": 0,
    "hendrycksTest-prehistory": 0,
    "hendrycksTest-professional_accounting": 0,
    "hendrycksTest-professional_law": 0,
    "hendrycksTest-professional_medicine": 0,
    "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-public_relations": 0,
    "hendrycksTest-security_studies": 0,
    "hendrycksTest-sociology": 0,
    "hendrycksTest-us_foreign_policy": 0,
    "hendrycksTest-virology": 0,
    "hendrycksTest-world_religions": 0
  }
}
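A minimal sketch, not part of the results file itself, showing one way to load this lm-eval JSON and average the per-subtask scores of the hendrycksTest-* (MMLU) tasks; the local file name is assumed to match the path above, and only the standard library is used.

```python
import json
from statistics import mean

# Assumed local copy of the results file shown above.
path = "lm1-3b9-77b-results_lm-eval_global_step73814_2022-12-02-14-09-22.json"

with open(path) as f:
    results = json.load(f)["results"]

# Collect the 57 hendrycksTest (MMLU) subtasks and average their accuracies.
mmlu = {name: scores for name, scores in results.items()
        if name.startswith("hendrycksTest-")}

print(f"{len(mmlu)} MMLU subtasks")
print("mean acc      :", round(mean(s["acc"] for s in mmlu.values()), 4))
print("mean acc_norm :", round(mean(s["acc_norm"] for s in mmlu.values()), 4))
```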