{ "results": { "copa": { "acc": 0.74, "acc_stderr": 0.044084400227680794 }, "piqa": { "acc": 0.705658324265506, "acc_stderr": 0.01063331147034749, "acc_norm": 0.7121871599564744, "acc_norm_stderr": 0.01056325038305919 }, "rte": { "acc": 0.5379061371841155, "acc_stderr": 0.030009848912529117 }, "winogrande": { "acc": 0.5564325177584846, "acc_stderr": 0.0139626949076204 }, "hendrycksTest-abstract_algebra": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768079 }, "hendrycksTest-anatomy": { "acc": 0.28888888888888886, "acc_stderr": 0.03915450630414251, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.036333844140734636 }, "hendrycksTest-astronomy": { "acc": 0.21710526315789475, "acc_stderr": 0.03355045304882921, "acc_norm": 0.3355263157894737, "acc_norm_stderr": 0.03842498559395271 }, "hendrycksTest-business_ethics": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-clinical_knowledge": { "acc": 0.24528301886792453, "acc_stderr": 0.026480357179895678, "acc_norm": 0.30943396226415093, "acc_norm_stderr": 0.02845015479411863 }, "hendrycksTest-college_biology": { "acc": 0.2152777777777778, "acc_stderr": 0.034370793441061344, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.034765901043041336 }, "hendrycksTest-college_chemistry": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "hendrycksTest-college_computer_science": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909284 }, "hendrycksTest-college_mathematics": { "acc": 0.2, "acc_stderr": 0.04020151261036845, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-college_medicine": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557 }, "hendrycksTest-college_physics": { "acc": 0.2549019607843137, "acc_stderr": 0.043364327079931764, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "hendrycksTest-computer_security": { "acc": 0.28, "acc_stderr": 0.04512608598542126, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "hendrycksTest-conceptual_physics": { "acc": 0.2553191489361702, "acc_stderr": 0.028504856470514203, "acc_norm": 0.1829787234042553, "acc_norm_stderr": 0.025276041000449966 }, "hendrycksTest-econometrics": { "acc": 0.21929824561403508, "acc_stderr": 0.03892431106518753, "acc_norm": 0.21929824561403508, "acc_norm_stderr": 0.03892431106518754 }, "hendrycksTest-electrical_engineering": { "acc": 0.2689655172413793, "acc_stderr": 0.036951833116502325, "acc_norm": 0.30344827586206896, "acc_norm_stderr": 0.038312260488503336 }, "hendrycksTest-elementary_mathematics": { "acc": 0.21957671957671956, "acc_stderr": 0.02132001859977036, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.022569897074918407 }, "hendrycksTest-formal_logic": { "acc": 0.29365079365079366, "acc_stderr": 0.04073524322147127, "acc_norm": 0.23809523809523808, "acc_norm_stderr": 0.038095238095238126 }, "hendrycksTest-global_facts": { "acc": 0.19, "acc_stderr": 0.039427724440366234, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036846 }, "hendrycksTest-high_school_biology": { "acc": 0.23548387096774193, "acc_stderr": 0.02413763242933771, "acc_norm": 0.3032258064516129, "acc_norm_stderr": 0.026148685930671746 }, "hendrycksTest-high_school_chemistry": { "acc": 0.2019704433497537, "acc_stderr": 0.028247350122180277, "acc_norm": 0.270935960591133, "acc_norm_stderr": 0.031270907132976984 }, "hendrycksTest-high_school_computer_science": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "hendrycksTest-high_school_european_history": { "acc": 0.24848484848484848, "acc_stderr": 0.03374402644139406, "acc_norm": 0.3090909090909091, "acc_norm_stderr": 0.036085410115739666 }, "hendrycksTest-high_school_geography": { "acc": 0.18181818181818182, "acc_stderr": 0.027479603010538787, "acc_norm": 0.2878787878787879, "acc_norm_stderr": 0.03225883512300993 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.21243523316062177, "acc_stderr": 0.02951928261681725, "acc_norm": 0.2538860103626943, "acc_norm_stderr": 0.03141024780565318 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2358974358974359, "acc_stderr": 0.02152596540740873, "acc_norm": 0.27692307692307694, "acc_norm_stderr": 0.022688042352424994 }, "hendrycksTest-high_school_mathematics": { "acc": 0.1925925925925926, "acc_stderr": 0.024043075181945192, "acc_norm": 0.21481481481481482, "acc_norm_stderr": 0.025040443877000686 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.24369747899159663, "acc_stderr": 0.027886828078380558, "acc_norm": 0.29831932773109243, "acc_norm_stderr": 0.02971914287634287 }, "hendrycksTest-high_school_physics": { "acc": 0.19205298013245034, "acc_stderr": 0.032162984205936156, "acc_norm": 0.25165562913907286, "acc_norm_stderr": 0.03543304234389985 }, "hendrycksTest-high_school_psychology": { "acc": 0.22935779816513763, "acc_stderr": 0.018025349724618684, "acc_norm": 0.24036697247706423, "acc_norm_stderr": 0.01832060732096407 }, "hendrycksTest-high_school_statistics": { "acc": 0.23148148148148148, "acc_stderr": 0.028765111718046976, "acc_norm": 0.28703703703703703, "acc_norm_stderr": 0.030851992993257017 }, "hendrycksTest-high_school_us_history": { "acc": 0.22058823529411764, "acc_stderr": 0.02910225438967409, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.0309645179269234 }, "hendrycksTest-high_school_world_history": { "acc": 0.270042194092827, "acc_stderr": 0.028900721906293426, "acc_norm": 0.3037974683544304, "acc_norm_stderr": 0.029936696387138605 }, "hendrycksTest-human_aging": { "acc": 0.3094170403587444, "acc_stderr": 0.031024411740572206, "acc_norm": 0.22869955156950672, "acc_norm_stderr": 0.028188240046929193 }, "hendrycksTest-human_sexuality": { "acc": 0.42748091603053434, "acc_stderr": 0.04338920305792401, "acc_norm": 0.31297709923664124, "acc_norm_stderr": 0.04066962905677698 }, "hendrycksTest-international_law": { "acc": 0.23140495867768596, "acc_stderr": 0.03849856098794089, "acc_norm": 0.4462809917355372, "acc_norm_stderr": 0.0453793517794788 }, "hendrycksTest-jurisprudence": { "acc": 0.3148148148148148, "acc_stderr": 0.04489931073591312, "acc_norm": 0.42592592592592593, "acc_norm_stderr": 0.0478034362693679 }, "hendrycksTest-logical_fallacies": { "acc": 0.25766871165644173, "acc_stderr": 0.03436150827846917, "acc_norm": 0.3067484662576687, "acc_norm_stderr": 0.036230899157241474 }, "hendrycksTest-machine_learning": { "acc": 0.3125, "acc_stderr": 0.043994650575715215, "acc_norm": 0.25892857142857145, "acc_norm_stderr": 0.04157751539865629 }, "hendrycksTest-management": { "acc": 0.27184466019417475, "acc_stderr": 0.044052680241409216, "acc_norm": 0.33980582524271846, "acc_norm_stderr": 0.046897659372781335 }, "hendrycksTest-marketing": { "acc": 0.27350427350427353, "acc_stderr": 0.029202540153431163, "acc_norm": 0.2905982905982906, "acc_norm_stderr": 0.029745048572674054 }, "hendrycksTest-medical_genetics": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "hendrycksTest-miscellaneous": { "acc": 0.26309067688378035, "acc_stderr": 0.015745497169049046, "acc_norm": 0.2656449553001277, "acc_norm_stderr": 0.01579430248788872 }, "hendrycksTest-moral_disputes": { "acc": 0.2658959537572254, "acc_stderr": 0.02378620325550828, "acc_norm": 0.3236994219653179, "acc_norm_stderr": 0.025190181327608408 }, "hendrycksTest-moral_scenarios": { "acc": 0.23910614525139665, "acc_stderr": 0.014265554192331144, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-nutrition": { "acc": 0.2581699346405229, "acc_stderr": 0.025058503316958157, "acc_norm": 0.3790849673202614, "acc_norm_stderr": 0.027780141207023334 }, "hendrycksTest-philosophy": { "acc": 0.24115755627009647, "acc_stderr": 0.024296594034763426, "acc_norm": 0.3086816720257235, "acc_norm_stderr": 0.026236965881153252 }, "hendrycksTest-prehistory": { "acc": 0.26851851851851855, "acc_stderr": 0.024659685185967287, "acc_norm": 0.21296296296296297, "acc_norm_stderr": 0.022779719088733396 }, "hendrycksTest-professional_accounting": { "acc": 0.2198581560283688, "acc_stderr": 0.024706141070705474, "acc_norm": 0.22695035460992907, "acc_norm_stderr": 0.024987106365642962 }, "hendrycksTest-professional_law": { "acc": 0.27509778357235987, "acc_stderr": 0.011405443620996939, "acc_norm": 0.288135593220339, "acc_norm_stderr": 0.011567140661324565 }, "hendrycksTest-professional_medicine": { "acc": 0.1801470588235294, "acc_stderr": 0.023345163616544835, "acc_norm": 0.2610294117647059, "acc_norm_stderr": 0.026679252270103124 }, "hendrycksTest-professional_psychology": { "acc": 0.24509803921568626, "acc_stderr": 0.01740181671142766, "acc_norm": 0.2581699346405229, "acc_norm_stderr": 0.017704531653250068 }, "hendrycksTest-public_relations": { "acc": 0.24545454545454545, "acc_stderr": 0.04122066502878285, "acc_norm": 0.15454545454545454, "acc_norm_stderr": 0.03462262571262667 }, "hendrycksTest-security_studies": { "acc": 0.32653061224489793, "acc_stderr": 0.030021056238440313, "acc_norm": 0.2693877551020408, "acc_norm_stderr": 0.02840125202902294 }, "hendrycksTest-sociology": { "acc": 0.24378109452736318, "acc_stderr": 0.030360490154014645, "acc_norm": 0.2736318407960199, "acc_norm_stderr": 0.03152439186555402 }, "hendrycksTest-us_foreign_policy": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "hendrycksTest-virology": { "acc": 0.3132530120481928, "acc_stderr": 0.036108050180310235, "acc_norm": 0.2710843373493976, "acc_norm_stderr": 0.034605799075530276 }, "hendrycksTest-world_religions": { "acc": 0.3157894736842105, "acc_stderr": 0.03565079670708311, "acc_norm": 0.38596491228070173, "acc_norm_stderr": 0.03733756969066164 } }, "versions": { "copa": 0, "piqa": 0, "rte": 0, "winogrande": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-international_law": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-management": 0, "hendrycksTest-marketing": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-sociology": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-virology": 0, "hendrycksTest-world_religions": 0 } }