lm1-misc-pile / 1b121b21b /evaluation /lm1-1b1-21b-results_lm-eval_global_step39672_2022-12-01-17-56-56.csv
Muennighoff's picture
Add
ca4f9b6
task,metric,value,err,version
copa,acc,0.68,0.04688261722621505,0
hendrycksTest-abstract_algebra,acc,0.28,0.04512608598542127,0
hendrycksTest-abstract_algebra,acc_norm,0.29,0.045604802157206824,0
hendrycksTest-anatomy,acc,0.2074074074074074,0.03502553170678318,0
hendrycksTest-anatomy,acc_norm,0.1925925925925926,0.03406542058502651,0
hendrycksTest-astronomy,acc,0.21052631578947367,0.03317672787533157,0
hendrycksTest-astronomy,acc_norm,0.27631578947368424,0.03639057569952924,0
hendrycksTest-business_ethics,acc,0.34,0.04760952285695236,0
hendrycksTest-business_ethics,acc_norm,0.29,0.04560480215720683,0
hendrycksTest-clinical_knowledge,acc,0.20754716981132076,0.02495991802891127,0
hendrycksTest-clinical_knowledge,acc_norm,0.32452830188679244,0.028815615713432115,0
hendrycksTest-college_biology,acc,0.25,0.03621034121889507,0
hendrycksTest-college_biology,acc_norm,0.2361111111111111,0.03551446610810826,0
hendrycksTest-college_chemistry,acc,0.23,0.042295258468165085,0
hendrycksTest-college_chemistry,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-college_computer_science,acc,0.27,0.0446196043338474,0
hendrycksTest-college_computer_science,acc_norm,0.28,0.04512608598542128,0
hendrycksTest-college_mathematics,acc,0.26,0.0440844002276808,0
hendrycksTest-college_mathematics,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-college_medicine,acc,0.2138728323699422,0.031265112061730424,0
hendrycksTest-college_medicine,acc_norm,0.26011560693641617,0.03345036916788992,0
hendrycksTest-college_physics,acc,0.2549019607843137,0.04336432707993178,0
hendrycksTest-college_physics,acc_norm,0.30392156862745096,0.045766654032077636,0
hendrycksTest-computer_security,acc,0.29,0.04560480215720684,0
hendrycksTest-computer_security,acc_norm,0.33,0.047258156262526045,0
hendrycksTest-conceptual_physics,acc,0.23829787234042554,0.027851252973889764,0
hendrycksTest-conceptual_physics,acc_norm,0.1574468085106383,0.023809905196619695,0
hendrycksTest-econometrics,acc,0.24561403508771928,0.04049339297748142,0
hendrycksTest-econometrics,acc_norm,0.24561403508771928,0.04049339297748142,0
hendrycksTest-electrical_engineering,acc,0.2896551724137931,0.03780019230438015,0
hendrycksTest-electrical_engineering,acc_norm,0.2896551724137931,0.03780019230438014,0
hendrycksTest-elementary_mathematics,acc,0.21428571428571427,0.021132859182754433,0
hendrycksTest-elementary_mathematics,acc_norm,0.23809523809523808,0.021935878081184756,0
hendrycksTest-formal_logic,acc,0.2857142857142857,0.04040610178208841,0
hendrycksTest-formal_logic,acc_norm,0.2857142857142857,0.0404061017820884,0
hendrycksTest-global_facts,acc,0.18,0.03861229196653694,0
hendrycksTest-global_facts,acc_norm,0.19,0.039427724440366234,0
hendrycksTest-high_school_biology,acc,0.20967741935483872,0.023157879349083522,0
hendrycksTest-high_school_biology,acc_norm,0.2645161290322581,0.02509189237885928,0
hendrycksTest-high_school_chemistry,acc,0.16748768472906403,0.026273086047535414,0
hendrycksTest-high_school_chemistry,acc_norm,0.2413793103448276,0.03010833071801162,0
hendrycksTest-high_school_computer_science,acc,0.23,0.042295258468165044,0
hendrycksTest-high_school_computer_science,acc_norm,0.23,0.04229525846816505,0
hendrycksTest-high_school_european_history,acc,0.24242424242424243,0.033464098810559534,0
hendrycksTest-high_school_european_history,acc_norm,0.26666666666666666,0.03453131801885414,0
hendrycksTest-high_school_geography,acc,0.20202020202020202,0.028606204289229872,0
hendrycksTest-high_school_geography,acc_norm,0.2727272727272727,0.03173071239071724,0
hendrycksTest-high_school_government_and_politics,acc,0.18652849740932642,0.02811209121011748,0
hendrycksTest-high_school_government_and_politics,acc_norm,0.23834196891191708,0.030748905363909895,0
hendrycksTest-high_school_macroeconomics,acc,0.2282051282051282,0.021278393863586282,0
hendrycksTest-high_school_macroeconomics,acc_norm,0.258974358974359,0.02221110681006166,0
hendrycksTest-high_school_mathematics,acc,0.2222222222222222,0.025348097468097838,0
hendrycksTest-high_school_mathematics,acc_norm,0.3148148148148148,0.028317533496066468,0
hendrycksTest-high_school_microeconomics,acc,0.20588235294117646,0.026265024608275882,0
hendrycksTest-high_school_microeconomics,acc_norm,0.29411764705882354,0.029597329730978093,0
hendrycksTest-high_school_physics,acc,0.24503311258278146,0.03511807571804724,0
hendrycksTest-high_school_physics,acc_norm,0.2119205298013245,0.033367670865679766,0
hendrycksTest-high_school_psychology,acc,0.22201834862385322,0.017818849564796634,0
hendrycksTest-high_school_psychology,acc_norm,0.23486238532110093,0.018175110510343578,0
hendrycksTest-high_school_statistics,acc,0.2222222222222222,0.02835321286686346,0
hendrycksTest-high_school_statistics,acc_norm,0.24537037037037038,0.029346665094372937,0
hendrycksTest-high_school_us_history,acc,0.28431372549019607,0.03166009679399812,0
hendrycksTest-high_school_us_history,acc_norm,0.27941176470588236,0.031493281045079556,0
hendrycksTest-high_school_world_history,acc,0.25738396624472576,0.028458820991460285,0
hendrycksTest-high_school_world_history,acc_norm,0.2742616033755274,0.029041333510598035,0
hendrycksTest-human_aging,acc,0.3273542600896861,0.031493846709941306,0
hendrycksTest-human_aging,acc_norm,0.2645739910313901,0.029605103217038308,0
hendrycksTest-human_sexuality,acc,0.31297709923664124,0.04066962905677697,0
hendrycksTest-human_sexuality,acc_norm,0.3282442748091603,0.04118438565806298,0
hendrycksTest-international_law,acc,0.1652892561983471,0.03390780612972776,0
hendrycksTest-international_law,acc_norm,0.4462809917355372,0.0453793517794788,0
hendrycksTest-jurisprudence,acc,0.23148148148148148,0.04077494709252626,0
hendrycksTest-jurisprudence,acc_norm,0.4074074074074074,0.047500773411999854,0
hendrycksTest-logical_fallacies,acc,0.1901840490797546,0.030833491146281235,0
hendrycksTest-logical_fallacies,acc_norm,0.26380368098159507,0.03462419931615623,0
hendrycksTest-machine_learning,acc,0.33035714285714285,0.044642857142857144,0
hendrycksTest-machine_learning,acc_norm,0.21428571428571427,0.03894641120044792,0
hendrycksTest-management,acc,0.17475728155339806,0.037601780060266224,0
hendrycksTest-management,acc_norm,0.23300970873786409,0.04185832598928315,0
hendrycksTest-marketing,acc,0.2863247863247863,0.02961432369045665,0
hendrycksTest-marketing,acc_norm,0.3162393162393162,0.030463656747340244,0
hendrycksTest-medical_genetics,acc,0.31,0.04648231987117316,0
hendrycksTest-medical_genetics,acc_norm,0.38,0.04878317312145633,0
hendrycksTest-miscellaneous,acc,0.25925925925925924,0.015671006009339572,0
hendrycksTest-miscellaneous,acc_norm,0.2515964240102171,0.01551732236552963,0
hendrycksTest-moral_disputes,acc,0.2630057803468208,0.02370309952525817,0
hendrycksTest-moral_disputes,acc_norm,0.3092485549132948,0.02488314057007175,0
hendrycksTest-moral_scenarios,acc,0.24022346368715083,0.014288343803925293,0
hendrycksTest-moral_scenarios,acc_norm,0.24692737430167597,0.014422292204808835,0
hendrycksTest-nutrition,acc,0.24183006535947713,0.024518195641879334,0
hendrycksTest-nutrition,acc_norm,0.38235294117647056,0.027826109307283686,0
hendrycksTest-philosophy,acc,0.20257234726688103,0.022827317491059682,0
hendrycksTest-philosophy,acc_norm,0.28938906752411575,0.025755865922632935,0
hendrycksTest-prehistory,acc,0.21604938271604937,0.022899162918445803,0
hendrycksTest-prehistory,acc_norm,0.1882716049382716,0.021751866060815875,0
hendrycksTest-professional_accounting,acc,0.2375886524822695,0.025389512552729903,0
hendrycksTest-professional_accounting,acc_norm,0.24468085106382978,0.025645553622266726,0
hendrycksTest-professional_law,acc,0.2561929595827901,0.011149173153110582,0
hendrycksTest-professional_law,acc_norm,0.28292046936114734,0.011503891323188976,0
hendrycksTest-professional_medicine,acc,0.22058823529411764,0.025187786660227248,0
hendrycksTest-professional_medicine,acc_norm,0.21323529411764705,0.024880971512294275,0
hendrycksTest-professional_psychology,acc,0.24509803921568626,0.017401816711427667,0
hendrycksTest-professional_psychology,acc_norm,0.2679738562091503,0.017917974069594726,0
hendrycksTest-public_relations,acc,0.24545454545454545,0.04122066502878285,0
hendrycksTest-public_relations,acc_norm,0.2,0.03831305140884603,0
hendrycksTest-security_studies,acc,0.2979591836734694,0.02927956741106567,0
hendrycksTest-security_studies,acc_norm,0.2571428571428571,0.02797982353874455,0
hendrycksTest-sociology,acc,0.31840796019900497,0.03294118479054095,0
hendrycksTest-sociology,acc_norm,0.3333333333333333,0.03333333333333334,0
hendrycksTest-us_foreign_policy,acc,0.33,0.04725815626252605,0
hendrycksTest-us_foreign_policy,acc_norm,0.33,0.047258156262526045,0
hendrycksTest-virology,acc,0.3072289156626506,0.03591566797824662,0
hendrycksTest-virology,acc_norm,0.25903614457831325,0.03410646614071855,0
hendrycksTest-world_religions,acc,0.2982456140350877,0.03508771929824565,0
hendrycksTest-world_religions,acc_norm,0.3684210526315789,0.036996580176568775,0
piqa,acc,0.6545157780195865,0.011094802893617745,0
piqa,acc_norm,0.6605005440696409,0.011048455047173913,0
rte,acc,0.5234657039711191,0.03006330041190266,0
winogrande,acc,0.4996053670086819,0.014052481306049516,0