lm1-misc-pile / 280m5b95b9 /280m5b95b9pile /evaluation /rankeval /lm1-280m-5b9-oscar-results_lm-eval_global_step11269_2022-11-29-22-45-06.csv
Muennighoff's picture
Add
c305798
task,metric,value,err,version
copa,acc,0.62,0.04878317312145632,0
hendrycksTest-abstract_algebra,acc,0.2,0.04020151261036846,0
hendrycksTest-abstract_algebra,acc_norm,0.2,0.040201512610368445,0
hendrycksTest-anatomy,acc,0.2814814814814815,0.03885004245800253,0
hendrycksTest-anatomy,acc_norm,0.21481481481481482,0.035478541985608236,0
hendrycksTest-astronomy,acc,0.25,0.03523807393012047,0
hendrycksTest-astronomy,acc_norm,0.3223684210526316,0.03803510248351585,0
hendrycksTest-business_ethics,acc,0.38,0.048783173121456316,0
hendrycksTest-business_ethics,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-clinical_knowledge,acc,0.22641509433962265,0.025757559893106748,0
hendrycksTest-clinical_knowledge,acc_norm,0.3018867924528302,0.02825420034443866,0
hendrycksTest-college_biology,acc,0.22916666666666666,0.03514697467862388,0
hendrycksTest-college_biology,acc_norm,0.2013888888888889,0.033536474697138406,0
hendrycksTest-college_chemistry,acc,0.28,0.045126085985421276,0
hendrycksTest-college_chemistry,acc_norm,0.34,0.04760952285695236,0
hendrycksTest-college_computer_science,acc,0.24,0.042923469599092816,0
hendrycksTest-college_computer_science,acc_norm,0.28,0.04512608598542128,0
hendrycksTest-college_mathematics,acc,0.18,0.03861229196653697,0
hendrycksTest-college_mathematics,acc_norm,0.24,0.04292346959909283,0
hendrycksTest-college_medicine,acc,0.18497109826589594,0.029605623981771224,0
hendrycksTest-college_medicine,acc_norm,0.2658959537572254,0.033687629322594316,0
hendrycksTest-college_physics,acc,0.21568627450980393,0.04092563958237655,0
hendrycksTest-college_physics,acc_norm,0.24509803921568626,0.04280105837364395,0
hendrycksTest-computer_security,acc,0.23,0.04229525846816507,0
hendrycksTest-computer_security,acc_norm,0.27,0.0446196043338474,0
hendrycksTest-conceptual_physics,acc,0.23404255319148937,0.027678452578212408,0
hendrycksTest-conceptual_physics,acc_norm,0.20851063829787234,0.026556982117838718,0
hendrycksTest-econometrics,acc,0.24561403508771928,0.04049339297748141,0
hendrycksTest-econometrics,acc_norm,0.18421052631578946,0.03646758875075566,0
hendrycksTest-electrical_engineering,acc,0.2689655172413793,0.036951833116502325,0
hendrycksTest-electrical_engineering,acc_norm,0.3103448275862069,0.038552896163789485,0
hendrycksTest-elementary_mathematics,acc,0.21164021164021163,0.021037331505262886,0
hendrycksTest-elementary_mathematics,acc_norm,0.2328042328042328,0.02176596167215453,0
hendrycksTest-formal_logic,acc,0.30952380952380953,0.04134913018303316,0
hendrycksTest-formal_logic,acc_norm,0.2777777777777778,0.04006168083848878,0
hendrycksTest-global_facts,acc,0.25,0.04351941398892446,0
hendrycksTest-global_facts,acc_norm,0.24,0.04292346959909282,0
hendrycksTest-high_school_biology,acc,0.25483870967741934,0.024790118459332204,0
hendrycksTest-high_school_biology,acc_norm,0.29354838709677417,0.02590608702131929,0
hendrycksTest-high_school_chemistry,acc,0.19704433497536947,0.02798672466673621,0
hendrycksTest-high_school_chemistry,acc_norm,0.2413793103448276,0.030108330718011625,0
hendrycksTest-high_school_computer_science,acc,0.2,0.04020151261036843,0
hendrycksTest-high_school_computer_science,acc_norm,0.24,0.04292346959909283,0
hendrycksTest-high_school_european_history,acc,0.24848484848484848,0.03374402644139404,0
hendrycksTest-high_school_european_history,acc_norm,0.2909090909090909,0.03546563019624336,0
hendrycksTest-high_school_geography,acc,0.18686868686868688,0.027772533334218974,0
hendrycksTest-high_school_geography,acc_norm,0.2727272727272727,0.03173071239071724,0
hendrycksTest-high_school_government_and_politics,acc,0.20725388601036268,0.02925282329180365,0
hendrycksTest-high_school_government_and_politics,acc_norm,0.27461139896373055,0.03221024508041154,0
hendrycksTest-high_school_macroeconomics,acc,0.23846153846153847,0.02160629449464773,0
hendrycksTest-high_school_macroeconomics,acc_norm,0.258974358974359,0.02221110681006167,0
hendrycksTest-high_school_mathematics,acc,0.2,0.024388430433987657,0
hendrycksTest-high_school_mathematics,acc_norm,0.22962962962962963,0.025644108639267634,0
hendrycksTest-high_school_microeconomics,acc,0.25210084033613445,0.028205545033277733,0
hendrycksTest-high_school_microeconomics,acc_norm,0.3025210084033613,0.02983796238829194,0
hendrycksTest-high_school_physics,acc,0.17218543046357615,0.03082613696196239,0
hendrycksTest-high_school_physics,acc_norm,0.23841059602649006,0.03479185572599659,0
hendrycksTest-high_school_psychology,acc,0.22935779816513763,0.018025349724618684,0
hendrycksTest-high_school_psychology,acc_norm,0.22935779816513763,0.018025349724618684,0
hendrycksTest-high_school_statistics,acc,0.24537037037037038,0.029346665094372944,0
hendrycksTest-high_school_statistics,acc_norm,0.27314814814814814,0.03038805130167812,0
hendrycksTest-high_school_us_history,acc,0.29411764705882354,0.03198001660115069,0
hendrycksTest-high_school_us_history,acc_norm,0.27941176470588236,0.031493281045079556,0
hendrycksTest-high_school_world_history,acc,0.29535864978902954,0.029696338713422893,0
hendrycksTest-high_school_world_history,acc_norm,0.2911392405063291,0.02957160106575337,0
hendrycksTest-human_aging,acc,0.34080717488789236,0.0318114974705536,0
hendrycksTest-human_aging,acc_norm,0.24663677130044842,0.028930413120910884,0
hendrycksTest-human_sexuality,acc,0.24427480916030533,0.037683359597287434,0
hendrycksTest-human_sexuality,acc_norm,0.29770992366412213,0.04010358942462203,0
hendrycksTest-international_law,acc,0.19834710743801653,0.03640118271990946,0
hendrycksTest-international_law,acc_norm,0.4462809917355372,0.0453793517794788,0
hendrycksTest-jurisprudence,acc,0.24074074074074073,0.0413311944024384,0
hendrycksTest-jurisprudence,acc_norm,0.4074074074074074,0.047500773411999854,0
hendrycksTest-logical_fallacies,acc,0.17791411042944785,0.030047357655806652,0
hendrycksTest-logical_fallacies,acc_norm,0.2392638036809816,0.03351953879521269,0
hendrycksTest-machine_learning,acc,0.3392857142857143,0.04493949068613539,0
hendrycksTest-machine_learning,acc_norm,0.25,0.04109974682633932,0
hendrycksTest-management,acc,0.17475728155339806,0.037601780060266224,0
hendrycksTest-management,acc_norm,0.24271844660194175,0.04245022486384495,0
hendrycksTest-marketing,acc,0.2948717948717949,0.02987257770889117,0
hendrycksTest-marketing,acc_norm,0.34615384615384615,0.031166957367235903,0
hendrycksTest-medical_genetics,acc,0.32,0.04688261722621505,0
hendrycksTest-medical_genetics,acc_norm,0.32,0.046882617226215034,0
hendrycksTest-miscellaneous,acc,0.2848020434227331,0.016139174096522574,0
hendrycksTest-miscellaneous,acc_norm,0.2784163473818646,0.016028295188992465,0
hendrycksTest-moral_disputes,acc,0.27167630057803466,0.02394851290546836,0
hendrycksTest-moral_disputes,acc_norm,0.3063583815028902,0.024818350129436596,0
hendrycksTest-moral_scenarios,acc,0.2446927374301676,0.014378169884098431,0
hendrycksTest-moral_scenarios,acc_norm,0.23798882681564246,0.014242630070574903,0
hendrycksTest-nutrition,acc,0.29411764705882354,0.02609016250427904,0
hendrycksTest-nutrition,acc_norm,0.3627450980392157,0.027530078447110303,0
hendrycksTest-philosophy,acc,0.2090032154340836,0.023093140398374224,0
hendrycksTest-philosophy,acc_norm,0.31189710610932475,0.026311858071854155,0
hendrycksTest-prehistory,acc,0.2345679012345679,0.023576881744005712,0
hendrycksTest-prehistory,acc_norm,0.19444444444444445,0.022021366100220194,0
hendrycksTest-professional_accounting,acc,0.2730496453900709,0.026577860943307857,0
hendrycksTest-professional_accounting,acc_norm,0.2730496453900709,0.026577860943307857,0
hendrycksTest-professional_law,acc,0.2653194263363755,0.011276198843958878,0
hendrycksTest-professional_law,acc_norm,0.2926988265971317,0.01162094919584953,0
hendrycksTest-professional_medicine,acc,0.2610294117647059,0.026679252270103114,0
hendrycksTest-professional_medicine,acc_norm,0.27205882352941174,0.02703304115168146,0
hendrycksTest-professional_psychology,acc,0.272875816993464,0.01802047414839358,0
hendrycksTest-professional_psychology,acc_norm,0.25980392156862747,0.017740899509177788,0
hendrycksTest-public_relations,acc,0.2636363636363636,0.04220224692971987,0
hendrycksTest-public_relations,acc_norm,0.17272727272727273,0.03620691833929218,0
hendrycksTest-security_studies,acc,0.32653061224489793,0.030021056238440313,0
hendrycksTest-security_studies,acc_norm,0.24489795918367346,0.02752963744017492,0
hendrycksTest-sociology,acc,0.26865671641791045,0.031343283582089536,0
hendrycksTest-sociology,acc_norm,0.2835820895522388,0.03187187537919797,0
hendrycksTest-us_foreign_policy,acc,0.27,0.04461960433384741,0
hendrycksTest-us_foreign_policy,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-virology,acc,0.27710843373493976,0.03484331592680586,0
hendrycksTest-virology,acc_norm,0.28313253012048195,0.03507295431370519,0
hendrycksTest-world_religions,acc,0.2573099415204678,0.03352799844161865,0
hendrycksTest-world_religions,acc_norm,0.24561403508771928,0.03301405946987251,0
piqa,acc,0.602829162132753,0.011416453840790266,0
piqa,acc_norm,0.5990206746463548,0.011434766962108312,0
rte,acc,0.5090252707581228,0.030091559826331334,0
winogrande,acc,0.5114443567482242,0.014048804199859329,0