Muennighoff committed on
Commit 2e29a11 · 1 Parent(s): 9e5d26a
Files changed (28)
  1. 8b7178b44b/evaluation/generation/merged.csv +39 -0
  2. 8b7178b44b/evaluation/generation/merged.json +1 -0
  3. 8b7178b44b/evaluation/rankeval/8b7178b44b_0.csv +21 -0
  4. 8b7178b44b/evaluation/rankeval/8b7178b44b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json +0 -87
  5. 8b7178b44b/evaluation/rankeval/8b7178b44b_1.csv +21 -0
  6. 8b7178b44b/evaluation/rankeval/8b7178b44b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json +0 -87
  7. 8b7178b44b/evaluation/rankeval/8b7178b44b_2.csv +21 -0
  8. 8b7178b44b/evaluation/rankeval/8b7178b44b_2_lm-eval_global_step84877_2023-01-31-11-38-06_2shots_backup.json +0 -87
  9. 8b7178b44b/evaluation/rankeval/8b7178b44b_3.csv +21 -0
  10. 8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json +0 -87
  11. 8b7178b44b/evaluation/rankeval/8b7178b44b_4.csv +21 -0
  12. 8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json +0 -87
  13. 8b7178b44b/evaluation/rankeval/8b7178b44b_5.csv +21 -0
  14. 8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json +0 -87
  15. 8b7178b88b/evaluation/generation/merged.csv +39 -0
  16. 8b7178b88b/evaluation/generation/merged.json +1 -0
  17. 8b7178b88b/evaluation/rankeval/8b7178b88b_0.csv +21 -0
  18. 8b7178b88b/evaluation/rankeval/8b7178b88b_0_lm-eval_global_step84877_2023-01-30-20-00-12_0shots_backup.json +0 -87
  19. 8b7178b88b/evaluation/rankeval/8b7178b88b_1.csv +21 -0
  20. 8b7178b88b/evaluation/rankeval/8b7178b88b_1_lm-eval_global_step84877_2023-01-30-20-00-12_1shots_backup.json +0 -87
  21. 8b7178b88b/evaluation/rankeval/8b7178b88b_2.csv +21 -0
  22. 8b7178b88b/evaluation/rankeval/8b7178b88b_2_lm-eval_global_step84877_2023-01-30-20-00-12_2shots_backup.json +0 -87
  23. 8b7178b88b/evaluation/rankeval/8b7178b88b_3.csv +21 -0
  24. 8b7178b88b/evaluation/rankeval/8b7178b88b_3_lm-eval_global_step84877_2023-01-30-20-00-12_3shots_backup.json +0 -87
  25. 8b7178b88b/evaluation/rankeval/8b7178b88b_4.csv +21 -0
  26. 8b7178b88b/evaluation/rankeval/8b7178b88b_4_lm-eval_global_step84877_2023-01-30-20-00-12_4shots_backup.json +0 -87
  27. 8b7178b88b/evaluation/rankeval/8b7178b88b_5.csv +21 -0
  28. 8b7178b88b/evaluation/rankeval/8b7178b88b_5_lm-eval_global_step84877_2023-01-30-20-00-12_5shots_backup.json +0 -87
8b7178b44b/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,39 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.08752141617950068
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.08752141617950068
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.23034576083422903
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.23034576083422903
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2578530121604481
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2578530121604481
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.26545692823299993
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.26545692823299993
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.26757332273287654
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.26757332273287654
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.26817812765923393
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.26817812765923393
+ e2e_nlg_cleaned,5,average,multiple,0.2294880946332147
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05062634534913608
+ gem_xsum,0,median,rouge2_fmeasure,0.05062634534913608
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.046694043648379924
+ gem_xsum,1,median,rouge2_fmeasure,0.046694043648379924
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05750919277461529
+ gem_xsum,2,median,rouge2_fmeasure,0.05750919277461529
+ gem_xsum,2,average,multiple,0.05160986059071043
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.051776101353314175
+ web_nlg_en,0,median,rouge2_fmeasure,0.051776101353314175
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08797235452625987
+ web_nlg_en,1,median,rouge2_fmeasure,0.08797235452625987
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.11382579286837109
+ web_nlg_en,2,median,rouge2_fmeasure,0.11382579286837109
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.1207306381621409
+ web_nlg_en,3,median,rouge2_fmeasure,0.1207306381621409
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12559736589051002
+ web_nlg_en,4,median,rouge2_fmeasure,0.12559736589051002
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.12988590207293185
+ web_nlg_en,5,median,rouge2_fmeasure,0.12988590207293185
+ web_nlg_en,5,average,multiple,0.10496469247892132
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02778920003491775
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.02778920003491775
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.044909001327117046
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.044909001327117046
+ wiki_lingua_en,1,average,multiple,0.0363491006810174
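The added merged.csv flattens every generation score into dataset,fewshots,prompt,metric,value rows, with extra median and average rows per dataset. Below is a minimal sketch of how such a file can be inspected, assuming pandas is available; the read path and the pivot are illustrative, not part of the commit.

```python
# Illustrative only: summarize the per-shot rouge2_fmeasure medians in merged.csv.
import pandas as pd

df = pd.read_csv("8b7178b44b/evaluation/generation/merged.csv")

# 'median' rows carry the per-fewshot rouge2_fmeasure; 'average' rows aggregate across shots.
medians = df[df["prompt"] == "median"]
table = medians.pivot(index="dataset", columns="fewshots", values="value")
print(table.round(3))
```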
8b7178b44b/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4712440224858371, "bleu_stderr": 0.041795896662364315, "rouge1_fmeasure": 0.11420475482093911, "rouge1_fmeasure_stderr": 0.0022183305831393195, "rouge1_precision": 0.08494278433659254, "rouge1_precision_stderr": 0.002682014199464667, "rouge1_recall": 0.3010315100774798, "rouge1_recall_stderr": 0.005434382926718196, "rouge2_fmeasure": 0.051776101353314175, "rouge2_fmeasure_stderr": 0.0012810666933093713, "rouge2_precision": 0.03708415721222578, "rouge2_precision_stderr": 0.0015055606328312114, "rouge2_recall": 0.14224175280721357, "rouge2_recall_stderr": 0.0032367826210838984, "rougeL_fmeasure": 0.10723944522759159, "rougeL_fmeasure_stderr": 0.002017538602140417, "rougeL_precision": 0.07986488274116094, "rougeL_precision_stderr": 0.0025364066921602967, "rougeL_recall": 0.2851084703862567, "rougeL_recall_stderr": 0.005103727035204417, "rougeLsum_fmeasure": 0.10754705129811067, "rougeLsum_fmeasure_stderr": 0.002066923006506901, "rougeLsum_precision": 0.08031203138023377, "rougeLsum_precision_stderr": 0.0025700547990470703, "rougeLsum_recall": 0.2832049885841092, "rougeLsum_recall_stderr": 0.00501983173550152}}, "1": {"PALM_prompt": {"bleu": 0.6352422858353469, "bleu_stderr": 0.04386319262846487, "rouge1_fmeasure": 0.17595828606649466, "rouge1_fmeasure_stderr": 0.0037103128590180736, "rouge1_precision": 0.14729082149250516, "rouge1_precision_stderr": 0.004315116801617204, "rouge1_recall": 0.34501701041772753, "rouge1_recall_stderr": 0.004883560454809294, "rouge2_fmeasure": 0.08797235452625987, "rouge2_fmeasure_stderr": 0.002563800160615811, "rouge2_precision": 0.0749912603946425, "rouge2_precision_stderr": 0.0029697645981284103, "rouge2_recall": 0.17507177387031794, "rouge2_recall_stderr": 0.0035680879509635034, "rougeL_fmeasure": 0.1587241385605771, "rougeL_fmeasure_stderr": 0.0031813191221144485, "rougeL_precision": 0.13087012462145003, "rougeL_precision_stderr": 0.0037359089786958557, "rougeL_recall": 0.3214469196295103, "rougeL_recall_stderr": 0.004518231473535444, "rougeLsum_fmeasure": 0.16209364614512511, "rougeLsum_fmeasure_stderr": 0.003268839028418146, "rougeLsum_precision": 0.13443162008077075, "rougeLsum_precision_stderr": 0.0038480943977239573, "rougeLsum_recall": 0.32447309188867546, "rougeLsum_recall_stderr": 0.0044995663315275155}}, "2": {"PALM_prompt": {"bleu": 0.9195810919624675, "bleu_stderr": 0.04315216107683015, "rouge1_fmeasure": 0.21485950562026987, "rouge1_fmeasure_stderr": 0.004330200790716057, "rouge1_precision": 0.19281853446958125, "rouge1_precision_stderr": 0.005348179657635172, "rouge1_recall": 0.3839239693259562, "rouge1_recall_stderr": 0.004896794421772872, "rouge2_fmeasure": 0.11382579286837109, "rouge2_fmeasure_stderr": 0.0029731216918058084, "rouge2_precision": 0.1059225294754287, "rouge2_precision_stderr": 0.0036916446521448806, "rouge2_recall": 0.20457409251843786, "rouge2_recall_stderr": 0.0037775743582373947, "rougeL_fmeasure": 0.1905777152487244, "rougeL_fmeasure_stderr": 0.0036266046372882984, "rougeL_precision": 0.16888883147055286, "rougeL_precision_stderr": 0.00457575327708451, "rougeL_recall": 0.35302151670878984, "rougeL_recall_stderr": 0.004443535122018173, "rougeLsum_fmeasure": 0.1966781681343204, "rougeLsum_fmeasure_stderr": 0.0037869984327305512, "rougeLsum_precision": 0.17526191475368208, "rougeLsum_precision_stderr": 0.004769580382692894, "rougeLsum_recall": 0.35984651407202956, "rougeLsum_recall_stderr": 0.004507478564774203}}, "3": {"PALM_prompt": {"bleu": 1.0349566025343735, 
"bleu_stderr": 0.03887674612290353, "rouge1_fmeasure": 0.2237082219543607, "rouge1_fmeasure_stderr": 0.004572034500678942, "rouge1_precision": 0.20438835211537978, "rouge1_precision_stderr": 0.005665805208271208, "rouge1_recall": 0.39198992418866246, "rouge1_recall_stderr": 0.004947498335222262, "rouge2_fmeasure": 0.1207306381621409, "rouge2_fmeasure_stderr": 0.003262380279344123, "rouge2_precision": 0.11365499071190853, "rouge2_precision_stderr": 0.00397252439813096, "rouge2_recall": 0.2111302605559281, "rouge2_recall_stderr": 0.003926393767113415, "rougeL_fmeasure": 0.196789710556852, "rougeL_fmeasure_stderr": 0.0038281461566836554, "rougeL_precision": 0.17756713475764335, "rougeL_precision_stderr": 0.004833439612735641, "rougeL_recall": 0.3583014136638203, "rougeL_recall_stderr": 0.004476979554028604, "rougeLsum_fmeasure": 0.20416935580581227, "rougeLsum_fmeasure_stderr": 0.004021347054930177, "rougeLsum_precision": 0.18556820599751273, "rougeLsum_precision_stderr": 0.00508629808674153, "rougeLsum_recall": 0.3664900622762122, "rougeLsum_recall_stderr": 0.004561176586485258}}, "4": {"PALM_prompt": {"bleu": 1.1673998473534046, "bleu_stderr": 0.09133948510400369, "rouge1_fmeasure": 0.23196338111553216, "rouge1_fmeasure_stderr": 0.004558457306896048, "rouge1_precision": 0.2125195111528658, "rouge1_precision_stderr": 0.005730453983133926, "rouge1_recall": 0.40637639113635166, "rouge1_recall_stderr": 0.004933922377880338, "rouge2_fmeasure": 0.12559736589051002, "rouge2_fmeasure_stderr": 0.003188817274548042, "rouge2_precision": 0.11875092253811664, "rouge2_precision_stderr": 0.0039714331213516595, "rouge2_recall": 0.22248575515884728, "rouge2_recall_stderr": 0.004014145150661029, "rougeL_fmeasure": 0.2038615706419507, "rougeL_fmeasure_stderr": 0.003812860468542949, "rougeL_precision": 0.1843607825179914, "rougeL_precision_stderr": 0.0048800492055134925, "rougeL_recall": 0.370665372766866, "rougeL_recall_stderr": 0.004464677380992271, "rougeLsum_fmeasure": 0.2125920854603434, "rougeLsum_fmeasure_stderr": 0.004058013873763605, "rougeLsum_precision": 0.19370515184964268, "rougeLsum_precision_stderr": 0.005184788234116766, "rougeLsum_recall": 0.3806829975264072, "rougeLsum_recall_stderr": 0.004583640895651089}}, "5": {"PALM_prompt": {"bleu": 1.194760741144373, "bleu_stderr": 0.09038400286323779, "rouge1_fmeasure": 0.24067238350377898, "rouge1_fmeasure_stderr": 0.004627635578763758, "rouge1_precision": 0.22388107728965712, "rouge1_precision_stderr": 0.0058375878448932725, "rouge1_recall": 0.41258053843820386, "rouge1_recall_stderr": 0.004808891702805189, "rouge2_fmeasure": 0.12988590207293185, "rouge2_fmeasure_stderr": 0.003239297680717399, "rouge2_precision": 0.12470779957706617, "rouge2_precision_stderr": 0.004011608962377894, "rouge2_recall": 0.22396081182592556, "rouge2_recall_stderr": 0.003932938864761309, "rougeL_fmeasure": 0.20930811896578502, "rougeL_fmeasure_stderr": 0.0038259333936250185, "rougeL_precision": 0.19195297464964095, "rougeL_precision_stderr": 0.004914089701123312, "rougeL_recall": 0.37374160359881364, "rougeL_recall_stderr": 0.00433847098671097, "rougeLsum_fmeasure": 0.21923266732565286, "rougeLsum_fmeasure_stderr": 0.004087600756703223, "rougeLsum_precision": 0.20275170487892208, "rougeLsum_precision_stderr": 0.005253109590732199, "rougeLsum_recall": 0.3844755575959121, "rougeLsum_recall_stderr": 0.004448663215506157}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.170472303270534, "bleu_stderr": 0.09245843828343826, "rouge1_fmeasure": 0.10762579083182928, 
"rouge1_fmeasure_stderr": 0.0025835076322910672, "rouge1_precision": 0.11259234653795358, "rouge1_precision_stderr": 0.0032411862971363117, "rouge1_recall": 0.14340893843595093, "rouge1_recall_stderr": 0.0034694907773717048, "rouge2_fmeasure": 0.02778920003491775, "rouge2_fmeasure_stderr": 0.000982242456017776, "rouge2_precision": 0.02582999428983234, "rouge2_precision_stderr": 0.0010000072112516562, "rouge2_recall": 0.03784241933814092, "rouge2_recall_stderr": 0.001450453403192633, "rougeL_fmeasure": 0.08192620888873649, "rougeL_fmeasure_stderr": 0.0019339948879164794, "rougeL_precision": 0.08841484558766141, "rougeL_precision_stderr": 0.0027853934411114305, "rougeL_recall": 0.1114317644839379, "rougeL_recall_stderr": 0.0027453844256091257, "rougeLsum_fmeasure": 0.10120163014874665, "rougeLsum_fmeasure_stderr": 0.002435237757238233, "rougeLsum_precision": 0.1068082984161991, "rougeLsum_precision_stderr": 0.0031424461382987463, "rougeLsum_recall": 0.13490305217744453, "rougeLsum_recall_stderr": 0.0032803950524800847}}, "1": {"tldr_en": {"bleu": 3.020335174960559, "bleu_stderr": 0.06787715772935045, "rouge1_fmeasure": 0.18153788588353004, "rouge1_fmeasure_stderr": 0.002424482153914544, "rouge1_precision": 0.19504169763237164, "rouge1_precision_stderr": 0.003161655570483573, "rouge1_recall": 0.2268980166797919, "rouge1_recall_stderr": 0.00336143207911689, "rouge2_fmeasure": 0.044909001327117046, "rouge2_fmeasure_stderr": 0.0011943824375694858, "rouge2_precision": 0.049740220187040864, "rouge2_precision_stderr": 0.001561722025312558, "rouge2_recall": 0.05782059703115125, "rouge2_recall_stderr": 0.0016860297998920056, "rougeL_fmeasure": 0.1341843957584949, "rougeL_fmeasure_stderr": 0.0017857657209747223, "rougeL_precision": 0.14594590659077963, "rougeL_precision_stderr": 0.002476347943750222, "rougeL_recall": 0.1691189101402509, "rougeL_recall_stderr": 0.002588936970621776, "rougeLsum_fmeasure": 0.16951014920125493, "rougeLsum_fmeasure_stderr": 0.0022640636265336893, "rougeLsum_precision": 0.18281924229155466, "rougeLsum_precision_stderr": 0.0029977938731759806, "rougeLsum_recall": 0.21164415433278383, "rougeLsum_recall_stderr": 0.0031361900114708872}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.235702101845383, "bleu_stderr": 0.14717774293969751, "rouge1_fmeasure": 0.18121867050334917, "rouge1_fmeasure_stderr": 0.0018474922816842606, "rouge1_precision": 0.7622052797070503, "rouge1_precision_stderr": 0.006647669795578922, "rouge1_recall": 0.1532954300944369, "rouge1_recall_stderr": 0.0031267049368978316, "rouge2_fmeasure": 0.08752141617950068, "rouge2_fmeasure_stderr": 0.0010542837911429734, "rouge2_precision": 0.7122733552799622, "rouge2_precision_stderr": 0.007806658717987775, "rouge2_recall": 0.07189313722539607, "rouge2_recall_stderr": 0.0017250469433453682, "rougeL_fmeasure": 0.1698326390033901, "rougeL_fmeasure_stderr": 0.0014973431260616968, "rougeL_precision": 0.7500255707124592, "rougeL_precision_stderr": 0.006889071529543362, "rougeL_recall": 0.13882842661597, "rougeL_recall_stderr": 0.0025055451424711027, "rougeLsum_fmeasure": 0.17261995880897366, "rougeLsum_fmeasure_stderr": 0.0016987141071618745, "rougeLsum_precision": 0.752191135927755, "rougeLsum_precision_stderr": 0.00685154752320934, "rougeLsum_recall": 0.14346581040848494, "rougeLsum_recall_stderr": 0.002891637286051436}}, "1": {"generate_text_restaurant": {"bleu": 12.507959373818647, "bleu_stderr": 0.12774308491764494, "rouge1_fmeasure": 0.48125855004986695, "rouge1_fmeasure_stderr": 
0.0023747652067436864, "rouge1_precision": 0.5901037503482837, "rouge1_precision_stderr": 0.0032159724501758416, "rouge1_recall": 0.4453954075758376, "rouge1_recall_stderr": 0.0030647232677422412, "rouge2_fmeasure": 0.23034576083422903, "rouge2_fmeasure_stderr": 0.0020975083120057516, "rouge2_precision": 0.2860806077756621, "rouge2_precision_stderr": 0.0027716337523015977, "rouge2_recall": 0.21297260057374331, "rouge2_recall_stderr": 0.002224122053216537, "rougeL_fmeasure": 0.34846752107389445, "rougeL_fmeasure_stderr": 0.0021243881165946155, "rougeL_precision": 0.4308357021587011, "rougeL_precision_stderr": 0.003009420690844223, "rougeL_recall": 0.3212543906087175, "rougeL_recall_stderr": 0.002480078078958591, "rougeLsum_fmeasure": 0.3931631455827191, "rougeLsum_fmeasure_stderr": 0.0023981333184838593, "rougeLsum_precision": 0.4830396285828985, "rougeLsum_precision_stderr": 0.003213107569523447, "rougeLsum_recall": 0.3634977467417529, "rougeLsum_recall_stderr": 0.002819214726631758}}, "2": {"generate_text_restaurant": {"bleu": 15.158653678134238, "bleu_stderr": 0.11217626725691181, "rouge1_fmeasure": 0.5131292320126455, "rouge1_fmeasure_stderr": 0.002253031761361406, "rouge1_precision": 0.6066145287034505, "rouge1_precision_stderr": 0.003147539042422012, "rouge1_recall": 0.48239587619310037, "rouge1_recall_stderr": 0.0029532891322940407, "rouge2_fmeasure": 0.2578530121604481, "rouge2_fmeasure_stderr": 0.0021628607938910917, "rouge2_precision": 0.3089787427206784, "rouge2_precision_stderr": 0.0028203707876968153, "rouge2_recall": 0.24188957715036025, "rouge2_recall_stderr": 0.002292974031208558, "rougeL_fmeasure": 0.37525966202827027, "rougeL_fmeasure_stderr": 0.0021725307399361672, "rougeL_precision": 0.44560511468454206, "rougeL_precision_stderr": 0.002995038040318342, "rougeL_recall": 0.35193582043955735, "rougeL_recall_stderr": 0.0025196283655795136, "rougeLsum_fmeasure": 0.4285131129511556, "rougeLsum_fmeasure_stderr": 0.0024045780604980463, "rougeLsum_precision": 0.5064590234844139, "rougeLsum_precision_stderr": 0.003190583549175626, "rougeLsum_recall": 0.4026845079975637, "rougeLsum_recall_stderr": 0.002836627735824717}}, "3": {"generate_text_restaurant": {"bleu": 15.759812537730546, "bleu_stderr": 0.16130432495289687, "rouge1_fmeasure": 0.5204784544081206, "rouge1_fmeasure_stderr": 0.002251275712244091, "rouge1_precision": 0.6077270325813979, "rouge1_precision_stderr": 0.0031150320109689183, "rouge1_recall": 0.49124527491366843, "rouge1_recall_stderr": 0.0029113142559981263, "rouge2_fmeasure": 0.26545692823299993, "rouge2_fmeasure_stderr": 0.0021835553197599713, "rouge2_precision": 0.31332017254120853, "rouge2_precision_stderr": 0.0027772905192189512, "rouge2_recall": 0.2502643652420692, "rouge2_recall_stderr": 0.0023315173133087275, "rougeL_fmeasure": 0.3810216920994399, "rougeL_fmeasure_stderr": 0.002194042986121944, "rougeL_precision": 0.4459205694292829, "rougeL_precision_stderr": 0.0029361567575171836, "rougeL_recall": 0.3592970906286205, "rougeL_recall_stderr": 0.0025375120589350396, "rougeLsum_fmeasure": 0.4357908055166842, "rougeLsum_fmeasure_stderr": 0.0024150704709467426, "rougeLsum_precision": 0.5082193147554575, "rougeLsum_precision_stderr": 0.003142597202698407, "rougeLsum_recall": 0.41160458845561226, "rougeLsum_recall_stderr": 0.002840955053130549}}, "4": {"generate_text_restaurant": {"bleu": 16.134799256710256, "bleu_stderr": 0.1070119918698532, "rouge1_fmeasure": 0.523909375920506, "rouge1_fmeasure_stderr": 0.0022880595790812155, "rouge1_precision": 
0.6054119860771526, "rouge1_precision_stderr": 0.0031401965578103136, "rouge1_recall": 0.4946565816842518, "rouge1_recall_stderr": 0.0028364911596105765, "rouge2_fmeasure": 0.26757332273287654, "rouge2_fmeasure_stderr": 0.0022434333099258102, "rouge2_precision": 0.3124961122227526, "rouge2_precision_stderr": 0.002808954903697186, "rouge2_recall": 0.25220130152340275, "rouge2_recall_stderr": 0.0023493523906758557, "rougeL_fmeasure": 0.382175935620986, "rougeL_fmeasure_stderr": 0.002236237383536334, "rougeL_precision": 0.44233843950127705, "rougeL_precision_stderr": 0.0029306698549013154, "rougeL_recall": 0.3607369273273119, "rougeL_recall_stderr": 0.002523691933961996, "rougeLsum_fmeasure": 0.4381722454639816, "rougeLsum_fmeasure_stderr": 0.0024443106668905357, "rougeLsum_precision": 0.5057660051353207, "rougeLsum_precision_stderr": 0.003153645376968123, "rougeLsum_recall": 0.4139501510871044, "rougeLsum_recall_stderr": 0.002788136076678763}}, "5": {"generate_text_restaurant": {"bleu": 16.107165763126492, "bleu_stderr": 0.1725403495293288, "rouge1_fmeasure": 0.5240001141378704, "rouge1_fmeasure_stderr": 0.0022053636458810423, "rouge1_precision": 0.6037882438302521, "rouge1_precision_stderr": 0.00307437348704947, "rouge1_recall": 0.49496382545882733, "rouge1_recall_stderr": 0.0027753751046680057, "rouge2_fmeasure": 0.26817812765923393, "rouge2_fmeasure_stderr": 0.002197626024453061, "rouge2_precision": 0.31296117294303094, "rouge2_precision_stderr": 0.0027976539566726744, "rouge2_recall": 0.25262525126987617, "rouge2_recall_stderr": 0.0023072545439166217, "rougeL_fmeasure": 0.3843695820781079, "rougeL_fmeasure_stderr": 0.0022098458849370652, "rougeL_precision": 0.44434419215812615, "rougeL_precision_stderr": 0.002949180064465491, "rougeL_recall": 0.3625276939571968, "rougeL_recall_stderr": 0.0024933568400759844, "rougeLsum_fmeasure": 0.4393845856350276, "rougeLsum_fmeasure_stderr": 0.002395852175826684, "rougeLsum_precision": 0.5068534628215157, "rougeLsum_precision_stderr": 0.0031588710330076797, "rougeLsum_recall": 0.41448240798432473, "rougeLsum_recall_stderr": 0.0027203178472831524}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1587870129118882, "bleu_stderr": 0.09029922216218629, "rouge1_fmeasure": 0.2131475561803473, "rouge1_fmeasure_stderr": 0.002809398976147748, "rouge1_precision": 0.1554932484923166, "rouge1_precision_stderr": 0.002221691062513405, "rouge1_recall": 0.35841398237522276, "rouge1_recall_stderr": 0.004880460975300627, "rouge2_fmeasure": 0.05062634534913608, "rouge2_fmeasure_stderr": 0.001690768330530915, "rouge2_precision": 0.036146559262545196, "rouge2_precision_stderr": 0.0012086761533876766, "rouge2_recall": 0.08868762877243597, "rouge2_recall_stderr": 0.0030835154047321074, "rougeL_fmeasure": 0.15595658881556435, "rougeL_fmeasure_stderr": 0.002127497880378741, "rougeL_precision": 0.11383026012975615, "rougeL_precision_stderr": 0.001748646718497343, "rougeL_recall": 0.2638855275835188, "rougeL_recall_stderr": 0.0038587410761858755, "rougeLsum_fmeasure": 0.16967238254142056, "rougeLsum_fmeasure_stderr": 0.00237624607133809, "rougeLsum_precision": 0.12373439304630478, "rougeLsum_precision_stderr": 0.0019078089158618033, "rougeLsum_recall": 0.2868552037024372, "rougeLsum_recall_stderr": 0.004272131408381669}}, "1": {"article_DOC_summary": {"bleu": 1.9813214649410344, "bleu_stderr": 0.06605541875330423, "rouge1_fmeasure": 0.20475174329785, "rouge1_fmeasure_stderr": 0.0030292857270426262, "rouge1_precision": 0.17602273459783957, "rouge1_precision_stderr": 
0.0034912567740095035, "rouge1_recall": 0.2978672198691626, "rouge1_recall_stderr": 0.004325712043489534, "rouge2_fmeasure": 0.046694043648379924, "rouge2_fmeasure_stderr": 0.001891957305961457, "rouge2_precision": 0.04044850363752258, "rouge2_precision_stderr": 0.0019137570100629107, "rouge2_recall": 0.06884979773351274, "rouge2_recall_stderr": 0.002724070824860895, "rougeL_fmeasure": 0.1593397594005996, "rougeL_fmeasure_stderr": 0.0024118116760541847, "rougeL_precision": 0.13694272638916472, "rougeL_precision_stderr": 0.0028412360369673493, "rougeL_recall": 0.2334897227634632, "rougeL_recall_stderr": 0.003556590427451311, "rougeLsum_fmeasure": 0.15946764140405875, "rougeLsum_fmeasure_stderr": 0.0025141424269190023, "rougeLsum_precision": 0.137142234387202, "rougeLsum_precision_stderr": 0.0029073893715640745, "rougeLsum_recall": 0.23357550407307784, "rougeLsum_recall_stderr": 0.003728086289055363}}, "2": {"article_DOC_summary": {"bleu": 2.568315279137756, "bleu_stderr": 0.13888090668923836, "rouge1_fmeasure": 0.2303304665083427, "rouge1_fmeasure_stderr": 0.003468001334593542, "rouge1_precision": 0.2229374260378376, "rouge1_precision_stderr": 0.004280928411283304, "rouge1_recall": 0.28304069580526636, "rouge1_recall_stderr": 0.003972947611177992, "rouge2_fmeasure": 0.05750919277461529, "rouge2_fmeasure_stderr": 0.002337201303458201, "rouge2_precision": 0.057639208977718646, "rouge2_precision_stderr": 0.002614759341604805, "rouge2_recall": 0.06850665995022794, "rouge2_recall_stderr": 0.0026514610571335023, "rougeL_fmeasure": 0.17795352717964225, "rougeL_fmeasure_stderr": 0.0028890194692426686, "rougeL_precision": 0.17219483167439611, "rougeL_precision_stderr": 0.003538412807763811, "rougeL_recall": 0.21973513766849492, "rougeL_recall_stderr": 0.0033149507650464807, "rougeLsum_fmeasure": 0.17909978174756772, "rougeLsum_fmeasure_stderr": 0.002921283919599673, "rougeLsum_precision": 0.17313600638730706, "rougeLsum_precision_stderr": 0.0035630889990072444, "rougeLsum_recall": 0.2218290481310322, "rougeLsum_recall_stderr": 0.0034301100563963126}}}}
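merged.json holds the same generation results in nested form: task name → number of shots → prompt name → metric dictionary (BLEU/ROUGE point estimates plus *_stderr entries). Below is a minimal sketch of flattening it into the merged.csv row layout above, assuming that is roughly how the CSV was derived; the median and average rows are omitted here.

```python
# Illustrative only: flatten merged.json into dataset,fewshots,prompt,metric,value rows.
import json

with open("8b7178b44b/evaluation/generation/merged.json") as f:
    results = json.load(f)

rows = []
for task, by_shots in results.items():
    dataset = task.split("/")[-1]  # e.g. "GEM/web_nlg_en" -> "web_nlg_en"
    for fewshots, prompts in by_shots.items():
        for prompt, metrics in prompts.items():
            rows.append((dataset, int(fewshots), prompt,
                         "rouge2_fmeasure", metrics["rouge2_fmeasure"]))

for row in sorted(rows):
    print(",".join(str(x) for x in row))
```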
8b7178b44b/evaluation/rankeval/8b7178b44b_0.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.35,0.015090650341444235,0
+ anli_r2,acc,0.339,0.014976758771620339,0
+ anli_r3,acc,0.3566666666666667,0.013833742805050713,0
+ arc_challenge,acc,0.2440273037542662,0.012551447627856259,0
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428175,0
+ arc_easy,acc,0.5917508417508418,0.01008556619579125,0
+ arc_easy,acc_norm,0.5340909090909091,0.010235908103438688,0
+ boolq,acc,0.5651376146788991,0.008670528471841557,1
+ cb,acc,0.375,0.06527912098338669,1
+ cb,f1,0.2631578947368421,,1
+ copa,acc,0.76,0.04292346959909283,0
+ hellaswag,acc,0.4500099581756622,0.004964779805180658,0
+ hellaswag,acc_norm,0.5825532762397929,0.00492130033128556,0
+ piqa,acc,0.7366702937976061,0.010276185322196764,0
+ piqa,acc_norm,0.7464635473340587,0.010150090834551794,0
+ rte,acc,0.5126353790613718,0.030086851767188564,0
+ sciq,acc,0.857,0.01107581480856704,0
+ sciq,acc_norm,0.769,0.013334797216936442,0
+ storycloze_2016,acc,0.7006948156066275,0.010590117252248801,0
+ winogrande,acc,0.5509076558800315,0.013979459389140844,0
8b7178b44b/evaluation/rankeval/8b7178b44b_0_lm-eval_global_step84877_2023-01-31-11-38-06_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.35,
- "acc_stderr": 0.015090650341444235
- },
- "anli_r2": {
- "acc": 0.339,
- "acc_stderr": 0.014976758771620339
- },
- "anli_r3": {
- "acc": 0.3566666666666667,
- "acc_stderr": 0.013833742805050713
- },
- "cb": {
- "acc": 0.375,
- "acc_stderr": 0.06527912098338669,
- "f1": 0.2631578947368421
- },
- "copa": {
- "acc": 0.76,
- "acc_stderr": 0.04292346959909283
- },
- "hellaswag": {
- "acc": 0.4500099581756622,
- "acc_stderr": 0.004964779805180658,
- "acc_norm": 0.5825532762397929,
- "acc_norm_stderr": 0.00492130033128556
- },
- "rte": {
- "acc": 0.5126353790613718,
- "acc_stderr": 0.030086851767188564
- },
- "winogrande": {
- "acc": 0.5509076558800315,
- "acc_stderr": 0.013979459389140844
- },
- "storycloze_2016": {
- "acc": 0.7006948156066275,
- "acc_stderr": 0.010590117252248801
- },
- "boolq": {
- "acc": 0.5651376146788991,
- "acc_stderr": 0.008670528471841557
- },
- "arc_easy": {
- "acc": 0.5917508417508418,
- "acc_stderr": 0.01008556619579125,
- "acc_norm": 0.5340909090909091,
- "acc_norm_stderr": 0.010235908103438688
- },
- "arc_challenge": {
- "acc": 0.2440273037542662,
- "acc_stderr": 0.012551447627856259,
- "acc_norm": 0.2883959044368601,
- "acc_norm_stderr": 0.013238394422428175
- },
- "sciq": {
- "acc": 0.857,
- "acc_stderr": 0.01107581480856704,
- "acc_norm": 0.769,
- "acc_norm_stderr": 0.013334797216936442
- },
- "piqa": {
- "acc": 0.7366702937976061,
- "acc_stderr": 0.010276185322196764,
- "acc_norm": 0.7464635473340587,
- "acc_norm_stderr": 0.010150090834551794
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
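Each deleted *_backup.json stores lm-eval output as a results map (task → metric → value, with matching *_stderr entries) plus a versions map, while the per-shot CSV added above carries the same numbers as flat task,metric,value,err,version rows. Below is a minimal sketch of that flattening, assuming this is how the CSVs relate to the backups; the file names are shortened stand-ins, not the actual paths in this commit.

```python
# Illustrative only: flatten an lm-eval backup JSON into the rankeval CSV layout.
import csv
import json

with open("8b7178b44b_0_backup.json") as f:  # hypothetical stand-in path
    data = json.load(f)

with open("8b7178b44b_0.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["task", "metric", "value", "err", "version"])
    for task in sorted(data["results"]):
        scores = data["results"][task]
        version = data["versions"][task]
        for metric, value in scores.items():
            if metric.endswith("_stderr"):
                continue  # stderr values go into the 'err' column of the main metric
            err = scores.get(f"{metric}_stderr", "")
            writer.writerow([task, metric, value, err, version])
```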
8b7178b44b/evaluation/rankeval/8b7178b44b_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.313,0.014671272822977885,0
+ anli_r2,acc,0.325,0.014818724459095527,0
+ anli_r3,acc,0.3491666666666667,0.01376707539507725,0
+ arc_challenge,acc,0.2764505119453925,0.013069662474252425,0
+ arc_challenge,acc_norm,0.2986348122866894,0.013374078615068752,0
+ arc_easy,acc,0.6346801346801347,0.009880576614806928,0
+ arc_easy,acc_norm,0.6254208754208754,0.009931758820410629,0
+ boolq,acc,0.618348623853211,0.008496550741178254,1
+ cb,acc,0.4107142857142857,0.0663363415035954,1
+ cb,f1,0.2576489533011272,,1
+ copa,acc,0.79,0.040936018074033256,0
+ hellaswag,acc,0.45200159330810596,0.004966736811010493,0
+ hellaswag,acc_norm,0.5935072694682334,0.004901747426331751,0
+ piqa,acc,0.7486398258977149,0.010121156016819255,0
+ piqa,acc_norm,0.750816104461371,0.01009188277012021,0
+ rte,acc,0.516245487364621,0.030080573208738064,0
+ sciq,acc,0.898,0.009575368801653892,0
+ sciq,acc_norm,0.902,0.009406619184621228,0
+ storycloze_2016,acc,0.6953500801710315,0.0106434269886468,0
+ winogrande,acc,0.5666929755327546,0.01392691505275734,0
8b7178b44b/evaluation/rankeval/8b7178b44b_1_lm-eval_global_step84877_2023-01-31-11-38-06_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.313,
- "acc_stderr": 0.014671272822977885
- },
- "anli_r2": {
- "acc": 0.325,
- "acc_stderr": 0.014818724459095527
- },
- "anli_r3": {
- "acc": 0.3491666666666667,
- "acc_stderr": 0.01376707539507725
- },
- "cb": {
- "acc": 0.4107142857142857,
- "acc_stderr": 0.0663363415035954,
- "f1": 0.2576489533011272
- },
- "copa": {
- "acc": 0.79,
- "acc_stderr": 0.040936018074033256
- },
- "hellaswag": {
- "acc": 0.45200159330810596,
- "acc_stderr": 0.004966736811010493,
- "acc_norm": 0.5935072694682334,
- "acc_norm_stderr": 0.004901747426331751
- },
- "rte": {
- "acc": 0.516245487364621,
- "acc_stderr": 0.030080573208738064
- },
- "winogrande": {
- "acc": 0.5666929755327546,
- "acc_stderr": 0.01392691505275734
- },
- "storycloze_2016": {
- "acc": 0.6953500801710315,
- "acc_stderr": 0.0106434269886468
- },
- "boolq": {
- "acc": 0.618348623853211,
- "acc_stderr": 0.008496550741178254
- },
- "arc_easy": {
- "acc": 0.6346801346801347,
- "acc_stderr": 0.009880576614806928,
- "acc_norm": 0.6254208754208754,
- "acc_norm_stderr": 0.009931758820410629
- },
- "arc_challenge": {
- "acc": 0.2764505119453925,
- "acc_stderr": 0.013069662474252425,
- "acc_norm": 0.2986348122866894,
- "acc_norm_stderr": 0.013374078615068752
- },
- "sciq": {
- "acc": 0.898,
- "acc_stderr": 0.009575368801653892,
- "acc_norm": 0.902,
- "acc_norm_stderr": 0.009406619184621228
- },
- "piqa": {
- "acc": 0.7486398258977149,
- "acc_stderr": 0.010121156016819255,
- "acc_norm": 0.750816104461371,
- "acc_norm_stderr": 0.01009188277012021
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b44b/evaluation/rankeval/8b7178b44b_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.317,0.014721675438880241,0
+ anli_r2,acc,0.358,0.015167928865407559,0
+ anli_r3,acc,0.32166666666666666,0.013490095282989521,0
+ arc_challenge,acc,0.28498293515358364,0.013191348179838793,0
+ arc_challenge,acc_norm,0.3293515358361775,0.013734057652635474,0
+ arc_easy,acc,0.6418350168350169,0.009838331651451841,0
+ arc_easy,acc_norm,0.6296296296296297,0.009908978578665753,0
+ boolq,acc,0.6204892966360857,0.008487341975756834,1
+ cb,acc,0.5178571428571429,0.06737697508644647,1
+ cb,f1,0.3558162267839687,,1
+ copa,acc,0.77,0.04229525846816506,0
+ hellaswag,acc,0.44911372236606256,0.0049638729368579396,0
+ hellaswag,acc_norm,0.5975901214897431,0.004893814890208308,0
+ piqa,acc,0.7475516866158868,0.010135665547362362,0
+ piqa,acc_norm,0.7453754080522307,0.010164432237060494,0
+ rte,acc,0.49458483754512633,0.030094698123239966,0
+ sciq,acc,0.918,0.008680515615523727,0
+ sciq,acc_norm,0.919,0.00863212103213998,0
+ storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
+ winogrande,acc,0.5651144435674822,0.013932814110418025,0
8b7178b44b/evaluation/rankeval/8b7178b44b_2_lm-eval_global_step84877_2023-01-31-11-38-06_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.317,
- "acc_stderr": 0.014721675438880241
- },
- "anli_r2": {
- "acc": 0.358,
- "acc_stderr": 0.015167928865407559
- },
- "anli_r3": {
- "acc": 0.32166666666666666,
- "acc_stderr": 0.013490095282989521
- },
- "cb": {
- "acc": 0.5178571428571429,
- "acc_stderr": 0.06737697508644647,
- "f1": 0.3558162267839687
- },
- "copa": {
- "acc": 0.77,
- "acc_stderr": 0.04229525846816506
- },
- "hellaswag": {
- "acc": 0.44911372236606256,
- "acc_stderr": 0.0049638729368579396,
- "acc_norm": 0.5975901214897431,
- "acc_norm_stderr": 0.004893814890208308
- },
- "rte": {
- "acc": 0.49458483754512633,
- "acc_stderr": 0.030094698123239966
- },
- "winogrande": {
- "acc": 0.5651144435674822,
- "acc_stderr": 0.013932814110418025
- },
- "storycloze_2016": {
- "acc": 0.6996258685195083,
- "acc_stderr": 0.010600915927985021
- },
- "boolq": {
- "acc": 0.6204892966360857,
- "acc_stderr": 0.008487341975756834
- },
- "arc_easy": {
- "acc": 0.6418350168350169,
- "acc_stderr": 0.009838331651451841,
- "acc_norm": 0.6296296296296297,
- "acc_norm_stderr": 0.009908978578665753
- },
- "arc_challenge": {
- "acc": 0.28498293515358364,
- "acc_stderr": 0.013191348179838793,
- "acc_norm": 0.3293515358361775,
- "acc_norm_stderr": 0.013734057652635474
- },
- "sciq": {
- "acc": 0.918,
- "acc_stderr": 0.008680515615523727,
- "acc_norm": 0.919,
- "acc_norm_stderr": 0.00863212103213998
- },
- "piqa": {
- "acc": 0.7475516866158868,
- "acc_stderr": 0.010135665547362362,
- "acc_norm": 0.7453754080522307,
- "acc_norm_stderr": 0.010164432237060494
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b44b/evaluation/rankeval/8b7178b44b_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.313,0.014671272822977883,0
+ anli_r2,acc,0.325,0.014818724459095529,0
+ anli_r3,acc,0.33666666666666667,0.013647602942406394,0
+ arc_challenge,acc,0.2858361774744027,0.013203196088537369,0
+ arc_challenge,acc_norm,0.32081911262798635,0.013640943091946524,0
+ arc_easy,acc,0.6405723905723906,0.009845958893373764,0
+ arc_easy,acc_norm,0.6376262626262627,0.009863468202583773,0
+ boolq,acc,0.6204892966360857,0.008487341975756834,1
+ cb,acc,0.48214285714285715,0.0673769750864465,1
+ cb,f1,0.3995062282572102,,1
+ copa,acc,0.74,0.0440844002276808,0
+ hellaswag,acc,0.4510057757418841,0.004965768348628053,0
+ hellaswag,acc_norm,0.5970922127066322,0.004894801119898596,0
+ piqa,acc,0.7442872687704026,0.010178690109459862,0
+ piqa,acc_norm,0.7519042437431991,0.010077118315574703,0
+ rte,acc,0.5090252707581228,0.030091559826331334,0
+ sciq,acc,0.923,0.008434580140240651,0
+ sciq,acc_norm,0.925,0.00833333333333335,0
+ storycloze_2016,acc,0.7124532335649385,0.010466744473098368,0
+ winogrande,acc,0.569060773480663,0.013917796623335966,0
8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.313,
- "acc_stderr": 0.014671272822977883
- },
- "anli_r2": {
- "acc": 0.325,
- "acc_stderr": 0.014818724459095529
- },
- "anli_r3": {
- "acc": 0.33666666666666667,
- "acc_stderr": 0.013647602942406394
- },
- "cb": {
- "acc": 0.48214285714285715,
- "acc_stderr": 0.0673769750864465,
- "f1": 0.3995062282572102
- },
- "copa": {
- "acc": 0.74,
- "acc_stderr": 0.0440844002276808
- },
- "hellaswag": {
- "acc": 0.4510057757418841,
- "acc_stderr": 0.004965768348628053,
- "acc_norm": 0.5970922127066322,
- "acc_norm_stderr": 0.004894801119898596
- },
- "rte": {
- "acc": 0.5090252707581228,
- "acc_stderr": 0.030091559826331334
- },
- "winogrande": {
- "acc": 0.569060773480663,
- "acc_stderr": 0.013917796623335966
- },
- "storycloze_2016": {
- "acc": 0.7124532335649385,
- "acc_stderr": 0.010466744473098368
- },
- "boolq": {
- "acc": 0.6204892966360857,
- "acc_stderr": 0.008487341975756834
- },
- "arc_easy": {
- "acc": 0.6405723905723906,
- "acc_stderr": 0.009845958893373764,
- "acc_norm": 0.6376262626262627,
- "acc_norm_stderr": 0.009863468202583773
- },
- "arc_challenge": {
- "acc": 0.2858361774744027,
- "acc_stderr": 0.013203196088537369,
- "acc_norm": 0.32081911262798635,
- "acc_norm_stderr": 0.013640943091946524
- },
- "sciq": {
- "acc": 0.923,
- "acc_stderr": 0.008434580140240651,
- "acc_norm": 0.925,
- "acc_norm_stderr": 0.00833333333333335
- },
- "piqa": {
- "acc": 0.7442872687704026,
- "acc_stderr": 0.010178690109459862,
- "acc_norm": 0.7519042437431991,
- "acc_norm_stderr": 0.010077118315574703
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b44b/evaluation/rankeval/8b7178b44b_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.311,0.014645596385722694,0
+ anli_r2,acc,0.299,0.014484778521220477,0
+ anli_r3,acc,0.335,0.013630871843821474,0
+ arc_challenge,acc,0.28924914675767915,0.013250012579393443,0
+ arc_challenge,acc_norm,0.318259385665529,0.013611993916971453,0
+ arc_easy,acc,0.6401515151515151,0.009848484848484843,0
+ arc_easy,acc_norm,0.6346801346801347,0.009880576614806924,0
+ boolq,acc,0.6241590214067279,0.008471147248160114,1
+ cb,acc,0.5178571428571429,0.06737697508644647,1
+ cb,f1,0.43401043401043404,,1
+ copa,acc,0.82,0.038612291966536955,0
+ hellaswag,acc,0.45140410276837284,0.004966158142645416,0
+ hellaswag,acc_norm,0.601274646484764,0.0048863535635718415,0
+ piqa,acc,0.7453754080522307,0.010164432237060487,0
+ piqa,acc_norm,0.7448313384113167,0.010171571592521834,0
+ rte,acc,0.49097472924187724,0.030091559826331334,0
+ sciq,acc,0.927,0.008230354715244055,0
+ sciq,acc_norm,0.928,0.008178195576218681,0
+ storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0
+ winogrande,acc,0.590370955011839,0.013821049109655491,0
8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.311,
- "acc_stderr": 0.014645596385722694
- },
- "anli_r2": {
- "acc": 0.299,
- "acc_stderr": 0.014484778521220477
- },
- "anli_r3": {
- "acc": 0.335,
- "acc_stderr": 0.013630871843821474
- },
- "cb": {
- "acc": 0.5178571428571429,
- "acc_stderr": 0.06737697508644647,
- "f1": 0.43401043401043404
- },
- "copa": {
- "acc": 0.82,
- "acc_stderr": 0.038612291966536955
- },
- "hellaswag": {
- "acc": 0.45140410276837284,
- "acc_stderr": 0.004966158142645416,
- "acc_norm": 0.601274646484764,
- "acc_norm_stderr": 0.0048863535635718415
- },
- "rte": {
- "acc": 0.49097472924187724,
- "acc_stderr": 0.030091559826331334
- },
- "winogrande": {
- "acc": 0.590370955011839,
- "acc_stderr": 0.013821049109655491
- },
- "storycloze_2016": {
- "acc": 0.7097808658471406,
- "acc_stderr": 0.010495529690730063
- },
- "boolq": {
- "acc": 0.6241590214067279,
- "acc_stderr": 0.008471147248160114
- },
- "arc_easy": {
- "acc": 0.6401515151515151,
- "acc_stderr": 0.009848484848484843,
- "acc_norm": 0.6346801346801347,
- "acc_norm_stderr": 0.009880576614806924
- },
- "arc_challenge": {
- "acc": 0.28924914675767915,
- "acc_stderr": 0.013250012579393443,
- "acc_norm": 0.318259385665529,
- "acc_norm_stderr": 0.013611993916971453
- },
- "sciq": {
- "acc": 0.927,
- "acc_stderr": 0.008230354715244055,
- "acc_norm": 0.928,
- "acc_norm_stderr": 0.008178195576218681
- },
- "piqa": {
- "acc": 0.7453754080522307,
- "acc_stderr": 0.010164432237060487,
- "acc_norm": 0.7448313384113167,
- "acc_norm_stderr": 0.010171571592521834
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b44b/evaluation/rankeval/8b7178b44b_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.323,0.01479492784334864,0
+ anli_r2,acc,0.302,0.014526080235459548,0
+ anli_r3,acc,0.3375,0.013655897185463648,0
+ arc_challenge,acc,0.29180887372013653,0.01328452529240351,0
+ arc_challenge,acc_norm,0.33532423208191126,0.013796182947785562,0
+ arc_easy,acc,0.6456228956228957,0.00981500403025175,0
+ arc_easy,acc_norm,0.6506734006734006,0.0097828534493993,0
+ boolq,acc,0.6223241590214067,0.008479309208281643,1
+ cb,acc,0.48214285714285715,0.0673769750864465,1
+ cb,f1,0.3081617086193746,,1
+ copa,acc,0.73,0.044619604333847394,0
+ hellaswag,acc,0.45269866560446126,0.004967402792744857,0
+ hellaswag,acc_norm,0.601274646484764,0.004886353563571844,0
+ piqa,acc,0.7388465723612623,0.010248738649935581,0
+ piqa,acc_norm,0.7459194776931447,0.010157271999135055,0
+ rte,acc,0.5126353790613718,0.030086851767188564,0
+ sciq,acc,0.931,0.00801893405031515,0
+ sciq,acc_norm,0.936,0.007743640226919298,0
+ storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0
+ winogrande,acc,0.569060773480663,0.01391779662333596,0
8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.323,
- "acc_stderr": 0.01479492784334864
- },
- "anli_r2": {
- "acc": 0.302,
- "acc_stderr": 0.014526080235459548
- },
- "anli_r3": {
- "acc": 0.3375,
- "acc_stderr": 0.013655897185463648
- },
- "cb": {
- "acc": 0.48214285714285715,
- "acc_stderr": 0.0673769750864465,
- "f1": 0.3081617086193746
- },
- "copa": {
- "acc": 0.73,
- "acc_stderr": 0.044619604333847394
- },
- "hellaswag": {
- "acc": 0.45269866560446126,
- "acc_stderr": 0.004967402792744857,
- "acc_norm": 0.601274646484764,
- "acc_norm_stderr": 0.004886353563571844
- },
- "rte": {
- "acc": 0.5126353790613718,
- "acc_stderr": 0.030086851767188564
- },
- "winogrande": {
- "acc": 0.569060773480663,
- "acc_stderr": 0.01391779662333596
- },
- "storycloze_2016": {
- "acc": 0.7097808658471406,
- "acc_stderr": 0.010495529690730063
- },
- "boolq": {
- "acc": 0.6223241590214067,
- "acc_stderr": 0.008479309208281643
- },
- "arc_easy": {
- "acc": 0.6456228956228957,
- "acc_stderr": 0.00981500403025175,
- "acc_norm": 0.6506734006734006,
- "acc_norm_stderr": 0.0097828534493993
- },
- "arc_challenge": {
- "acc": 0.29180887372013653,
- "acc_stderr": 0.01328452529240351,
- "acc_norm": 0.33532423208191126,
- "acc_norm_stderr": 0.013796182947785562
- },
- "sciq": {
- "acc": 0.931,
- "acc_stderr": 0.00801893405031515,
- "acc_norm": 0.936,
- "acc_norm_stderr": 0.007743640226919298
- },
- "piqa": {
- "acc": 0.7388465723612623,
- "acc_stderr": 0.010248738649935581,
- "acc_norm": 0.7459194776931447,
- "acc_norm_stderr": 0.010157271999135055
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b88b/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,39 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.14611347950675788
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.14611347950675788
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.23028289344422687
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.23028289344422687
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2598408586362927
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2598408586362927
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.26903286770558915
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.26903286770558915
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2724112279454815
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2724112279454815
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.27475970807249195
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.27475970807249195
+ e2e_nlg_cleaned,5,average,multiple,0.24207350588514
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03841432091372367
+ gem_xsum,0,median,rouge2_fmeasure,0.03841432091372367
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05054236360888691
+ gem_xsum,1,median,rouge2_fmeasure,0.05054236360888691
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06264124531231369
+ gem_xsum,2,median,rouge2_fmeasure,0.06264124531231369
+ gem_xsum,2,average,multiple,0.05053264327830809
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.049040742381146675
+ web_nlg_en,0,median,rouge2_fmeasure,0.049040742381146675
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08445836822356238
+ web_nlg_en,1,median,rouge2_fmeasure,0.08445836822356238
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.10928693373297448
+ web_nlg_en,2,median,rouge2_fmeasure,0.10928693373297448
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.11292635699291721
+ web_nlg_en,3,median,rouge2_fmeasure,0.11292635699291721
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.12100972003049645
+ web_nlg_en,4,median,rouge2_fmeasure,0.12100972003049645
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.13171103508163604
+ web_nlg_en,5,median,rouge2_fmeasure,0.13171103508163604
+ web_nlg_en,5,average,multiple,0.10140552607378887
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03307271090107231
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03307271090107231
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04664136277906473
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04664136277906473
+ wiki_lingua_en,1,average,multiple,0.03985703684006852
8b7178b88b/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3018678413532531, "bleu_stderr": 0.02844023781425127, "rouge1_fmeasure": 0.10892691674779821, "rouge1_fmeasure_stderr": 0.0019071235551653418, "rouge1_precision": 0.07529613947856881, "rouge1_precision_stderr": 0.001982838396838402, "rouge1_recall": 0.3128650370000908, "rouge1_recall_stderr": 0.0050801382033981305, "rouge2_fmeasure": 0.049040742381146675, "rouge2_fmeasure_stderr": 0.0011519113898695909, "rouge2_precision": 0.03417694669369951, "rouge2_precision_stderr": 0.001313368572681446, "rouge2_recall": 0.14493621807358514, "rouge2_recall_stderr": 0.0032191726931486194, "rougeL_fmeasure": 0.10396046493580034, "rougeL_fmeasure_stderr": 0.001778756221046735, "rougeL_precision": 0.07180450626627975, "rougeL_precision_stderr": 0.001883124040127771, "rougeL_recall": 0.30107392723259574, "rougeL_recall_stderr": 0.004911398619361963, "rougeLsum_fmeasure": 0.10404327940071008, "rougeLsum_fmeasure_stderr": 0.001820581250965178, "rougeLsum_precision": 0.0720836832710309, "rougeLsum_precision_stderr": 0.001909971490636727, "rougeLsum_recall": 0.29709085493162957, "rougeLsum_recall_stderr": 0.004719450806012623}}, "1": {"PALM_prompt": {"bleu": 0.5232740840707778, "bleu_stderr": 0.020854898797383486, "rouge1_fmeasure": 0.1654988415912321, "rouge1_fmeasure_stderr": 0.003746858529251122, "rouge1_precision": 0.14535656258632043, "rouge1_precision_stderr": 0.00467037788995567, "rouge1_recall": 0.32467105717536826, "rouge1_recall_stderr": 0.004844586825107793, "rouge2_fmeasure": 0.08445836822356238, "rouge2_fmeasure_stderr": 0.0026512482133539865, "rouge2_precision": 0.07725516531902364, "rouge2_precision_stderr": 0.0033727094011323654, "rouge2_recall": 0.16618004223187002, "rouge2_recall_stderr": 0.003573100663317745, "rougeL_fmeasure": 0.1515457985267535, "rougeL_fmeasure_stderr": 0.003242282817029173, "rougeL_precision": 0.1318447100821655, "rougeL_precision_stderr": 0.004146806578291652, "rougeL_recall": 0.30561460626596343, "rougeL_recall_stderr": 0.004514632290906585, "rougeLsum_fmeasure": 0.15355200233921673, "rougeLsum_fmeasure_stderr": 0.0032933307640919023, "rougeLsum_precision": 0.1339783021316465, "rougeLsum_precision_stderr": 0.004209899048276762, "rougeLsum_recall": 0.3075713065076715, "rougeLsum_recall_stderr": 0.004515232151240997}}, "2": {"PALM_prompt": {"bleu": 0.8216652782232776, "bleu_stderr": 0.04062910442416713, "rouge1_fmeasure": 0.2027123169825771, "rouge1_fmeasure_stderr": 0.004395624756947157, "rouge1_precision": 0.18008691481284958, "rouge1_precision_stderr": 0.0052757038925836395, "rouge1_recall": 0.3711673998047281, "rouge1_recall_stderr": 0.00487879034297811, "rouge2_fmeasure": 0.10928693373297448, "rouge2_fmeasure_stderr": 0.003154986876758784, "rouge2_precision": 0.10008962637316307, "rouge2_precision_stderr": 0.0037529405438827225, "rouge2_recall": 0.20016469360773192, "rouge2_recall_stderr": 0.003813720451519567, "rougeL_fmeasure": 0.18244327655886838, "rougeL_fmeasure_stderr": 0.003749619328233065, "rougeL_precision": 0.15996735275766777, "rougeL_precision_stderr": 0.004577626944141576, "rougeL_recall": 0.3460684822761284, "rougeL_recall_stderr": 0.004492766954524856, "rougeLsum_fmeasure": 0.18698187602734404, "rougeLsum_fmeasure_stderr": 0.0038921774054845996, "rougeLsum_precision": 0.1650914247834489, "rougeLsum_precision_stderr": 0.00477544492053463, "rougeLsum_recall": 0.3505702667663343, "rougeLsum_recall_stderr": 0.004548048493270462}}, "3": {"PALM_prompt": {"bleu": 0.8830672069742221, "bleu_stderr": 
0.028000457075678158, "rouge1_fmeasure": 0.21071007543003908, "rouge1_fmeasure_stderr": 0.004463218149424481, "rouge1_precision": 0.188815921549058, "rouge1_precision_stderr": 0.005405782039170827, "rouge1_recall": 0.38372152833073453, "rouge1_recall_stderr": 0.004859603610175178, "rouge2_fmeasure": 0.11292635699291721, "rouge2_fmeasure_stderr": 0.0031281095189860052, "rouge2_precision": 0.10432859230180822, "rouge2_precision_stderr": 0.003707124967324291, "rouge2_recall": 0.2043813472297843, "rouge2_recall_stderr": 0.003801503662629156, "rougeL_fmeasure": 0.18796910977796713, "rougeL_fmeasure_stderr": 0.003750496425693158, "rougeL_precision": 0.16609714125737446, "rougeL_precision_stderr": 0.004603337178258819, "rougeL_recall": 0.3553325064502732, "rougeL_recall_stderr": 0.0044309664783451575, "rougeLsum_fmeasure": 0.1924829285681012, "rougeLsum_fmeasure_stderr": 0.0038880601940687624, "rougeLsum_precision": 0.17111254219249275, "rougeLsum_precision_stderr": 0.004780700019520514, "rougeLsum_recall": 0.36026081429848233, "rougeLsum_recall_stderr": 0.004488184055350965}}, "4": {"PALM_prompt": {"bleu": 1.0358671081792514, "bleu_stderr": 0.04263566752630496, "rouge1_fmeasure": 0.22245843694786877, "rouge1_fmeasure_stderr": 0.0045583402573195686, "rouge1_precision": 0.19989615669545296, "rouge1_precision_stderr": 0.005599957130646531, "rouge1_recall": 0.40336166710123866, "rouge1_recall_stderr": 0.004818721124125534, "rouge2_fmeasure": 0.12100972003049645, "rouge2_fmeasure_stderr": 0.0032743404066677153, "rouge2_precision": 0.1124336236373582, "rouge2_precision_stderr": 0.003935741095779227, "rouge2_recall": 0.21819990615637125, "rouge2_recall_stderr": 0.00385618079142143, "rougeL_fmeasure": 0.19760963957512798, "rougeL_fmeasure_stderr": 0.00383542084139647, "rougeL_precision": 0.17492321519899293, "rougeL_precision_stderr": 0.004773378480561351, "rougeL_recall": 0.37224158952457587, "rougeL_recall_stderr": 0.004377948019839408, "rougeLsum_fmeasure": 0.20440757227679296, "rougeLsum_fmeasure_stderr": 0.004022054785395138, "rougeLsum_precision": 0.18235385466161755, "rougeLsum_precision_stderr": 0.005021694344891093, "rougeLsum_recall": 0.3793687467005697, "rougeLsum_recall_stderr": 0.004423507289060084}}, "5": {"PALM_prompt": {"bleu": 1.204171361933669, "bleu_stderr": 0.07613961296168276, "rouge1_fmeasure": 0.2366466068769033, "rouge1_fmeasure_stderr": 0.0048055385839361016, "rouge1_precision": 0.21975255516712816, "rouge1_precision_stderr": 0.0060000476488098085, "rouge1_recall": 0.4066635344523642, "rouge1_recall_stderr": 0.0048262902071662655, "rouge2_fmeasure": 0.13171103508163604, "rouge2_fmeasure_stderr": 0.0034923443640983974, "rouge2_precision": 0.1275911476917841, "rouge2_precision_stderr": 0.004309205748397978, "rouge2_recall": 0.2235792080539432, "rouge2_recall_stderr": 0.003950954664759423, "rougeL_fmeasure": 0.2088535130444153, "rougeL_fmeasure_stderr": 0.004052842030593682, "rougeL_precision": 0.19131328031391062, "rougeL_precision_stderr": 0.005134255771123987, "rougeL_recall": 0.37336352525066424, "rougeL_recall_stderr": 0.0044216260882892515, "rougeLsum_fmeasure": 0.21625717477809833, "rougeLsum_fmeasure_stderr": 0.004260506456488727, "rougeLsum_precision": 0.1997593788485136, "rougeLsum_precision_stderr": 0.005406525039653976, "rougeLsum_recall": 0.3803285154915348, "rougeLsum_recall_stderr": 0.004488995109880062}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.7397274944771954, "bleu_stderr": 0.10039261451919343, "rouge1_fmeasure": 0.1238366627407439, 
"rouge1_fmeasure_stderr": 0.0026637547390690967, "rouge1_precision": 0.12462462729719587, "rouge1_precision_stderr": 0.0031826046662497306, "rouge1_recall": 0.16449607884346923, "rouge1_recall_stderr": 0.003588981148112544, "rouge2_fmeasure": 0.03307271090107231, "rouge2_fmeasure_stderr": 0.0010639050828558138, "rouge2_precision": 0.030579818116461738, "rouge2_precision_stderr": 0.001114447996676589, "rouge2_recall": 0.045348988871228374, "rouge2_recall_stderr": 0.0015776302264520795, "rougeL_fmeasure": 0.0940193069900903, "rougeL_fmeasure_stderr": 0.001985212421750187, "rougeL_precision": 0.09689369540491309, "rougeL_precision_stderr": 0.0026833442642966994, "rougeL_recall": 0.1273558135465041, "rougeL_recall_stderr": 0.002838117953205465, "rougeLsum_fmeasure": 0.11606269945283079, "rougeLsum_fmeasure_stderr": 0.0025094061265404873, "rougeLsum_precision": 0.11762669226995941, "rougeLsum_precision_stderr": 0.0030721629727453585, "rougeLsum_recall": 0.15435308236518808, "rougeLsum_recall_stderr": 0.003394722386908597}}, "1": {"tldr_en": {"bleu": 2.980212171285624, "bleu_stderr": 0.08986977717018042, "rouge1_fmeasure": 0.18706368194466189, "rouge1_fmeasure_stderr": 0.0023760001514770367, "rouge1_precision": 0.2303358349811005, "rouge1_precision_stderr": 0.003690727542733754, "rouge1_recall": 0.2154464843014795, "rouge1_recall_stderr": 0.00312287409246033, "rouge2_fmeasure": 0.04664136277906473, "rouge2_fmeasure_stderr": 0.0012607234019787598, "rouge2_precision": 0.0623131824874716, "rouge2_precision_stderr": 0.0021997319954224622, "rouge2_recall": 0.05366477872164223, "rouge2_recall_stderr": 0.0015294014686067932, "rougeL_fmeasure": 0.14101873450061833, "rougeL_fmeasure_stderr": 0.0017902413086473704, "rougeL_precision": 0.1774352175816246, "rougeL_precision_stderr": 0.003056714448069494, "rougeL_recall": 0.16262682837459477, "rougeL_recall_stderr": 0.00239676054267987, "rougeLsum_fmeasure": 0.175474051697367, "rougeLsum_fmeasure_stderr": 0.0022207055448280263, "rougeLsum_precision": 0.21694264697187762, "rougeLsum_precision_stderr": 0.0035242626334641026, "rougeLsum_recall": 0.2020549195476134, "rougeLsum_recall_stderr": 0.0029196969560855153}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.805428848697077, "bleu_stderr": 0.12760294204588482, "rouge1_fmeasure": 0.3241138419996619, "rouge1_fmeasure_stderr": 0.0023488121086233404, "rouge1_precision": 0.27630659363671284, "rouge1_precision_stderr": 0.0026635424916828915, "rouge1_recall": 0.4517766787964781, "rouge1_recall_stderr": 0.003086695070399088, "rouge2_fmeasure": 0.14611347950675788, "rouge2_fmeasure_stderr": 0.001555781159556189, "rouge2_precision": 0.1264233869651961, "rouge2_precision_stderr": 0.0021472423489400424, "rouge2_recall": 0.21006531093284228, "rouge2_recall_stderr": 0.002273053133823739, "rougeL_fmeasure": 0.2782112735172708, "rougeL_fmeasure_stderr": 0.0017379173406301361, "rougeL_precision": 0.23647123285513574, "rougeL_precision_stderr": 0.0021694427334222896, "rougeL_recall": 0.39302733892651276, "rougeL_recall_stderr": 0.0026453773863021526, "rougeLsum_fmeasure": 0.2838669853732326, "rougeLsum_fmeasure_stderr": 0.0022710804975857187, "rougeLsum_precision": 0.24330576145803912, "rougeLsum_precision_stderr": 0.0025941508735257053, "rougeLsum_recall": 0.3957393841813685, "rougeLsum_recall_stderr": 0.0030500827793352625}}, "1": {"generate_text_restaurant": {"bleu": 12.182758345079787, "bleu_stderr": 0.16518841123830086, "rouge1_fmeasure": 0.4813086962286221, "rouge1_fmeasure_stderr": 
0.0023626326089443398, "rouge1_precision": 0.6058724528221369, "rouge1_precision_stderr": 0.0033311930729353, "rouge1_recall": 0.4375685096869626, "rouge1_recall_stderr": 0.003004503828354488, "rouge2_fmeasure": 0.23028289344422687, "rouge2_fmeasure_stderr": 0.0021131218145040835, "rouge2_precision": 0.29572879977984745, "rouge2_precision_stderr": 0.0029383694383037506, "rouge2_recall": 0.20878267638207582, "rouge2_recall_stderr": 0.002207007421280634, "rougeL_fmeasure": 0.3505103154607651, "rougeL_fmeasure_stderr": 0.002138758436636428, "rougeL_precision": 0.4452525524920764, "rougeL_precision_stderr": 0.003155514665754343, "rougeL_recall": 0.31741659640932457, "rougeL_recall_stderr": 0.0024579048629975297, "rougeLsum_fmeasure": 0.39279988924924625, "rougeLsum_fmeasure_stderr": 0.002377886979824964, "rougeLsum_precision": 0.4958740637334028, "rougeLsum_precision_stderr": 0.0033266551400865404, "rougeLsum_recall": 0.356592637216114, "rougeLsum_recall_stderr": 0.0027567285724463706}}, "2": {"generate_text_restaurant": {"bleu": 14.77827774961412, "bleu_stderr": 0.17074445414549996, "rouge1_fmeasure": 0.5169724859470212, "rouge1_fmeasure_stderr": 0.0022847828063681107, "rouge1_precision": 0.6222132815532128, "rouge1_precision_stderr": 0.0032116680629870925, "rouge1_recall": 0.4773787870946972, "rouge1_recall_stderr": 0.002880300695039262, "rouge2_fmeasure": 0.2598408586362927, "rouge2_fmeasure_stderr": 0.002199049529931736, "rouge2_precision": 0.31734278580959246, "rouge2_precision_stderr": 0.002901203617815359, "rouge2_recall": 0.23934523412335545, "rouge2_recall_stderr": 0.0022881777132200292, "rougeL_fmeasure": 0.37830224304770604, "rougeL_fmeasure_stderr": 0.0021704191814246308, "rougeL_precision": 0.45801859712166865, "rougeL_precision_stderr": 0.0030651884383412443, "rougeL_recall": 0.3483543845938943, "rougeL_recall_stderr": 0.002450753947224449, "rougeLsum_fmeasure": 0.42874770680726115, "rougeLsum_fmeasure_stderr": 0.00239536120318971, "rougeLsum_precision": 0.5166167827562745, "rougeLsum_precision_stderr": 0.003243801734266107, "rougeLsum_recall": 0.3956919145577125, "rougeLsum_recall_stderr": 0.002759335166433648}}, "3": {"generate_text_restaurant": {"bleu": 15.660507663936528, "bleu_stderr": 0.14749957372060007, "rouge1_fmeasure": 0.5280713486662109, "rouge1_fmeasure_stderr": 0.0022869985346214445, "rouge1_precision": 0.6220372006914571, "rouge1_precision_stderr": 0.0031139706058727593, "rouge1_recall": 0.4921683601168692, "rouge1_recall_stderr": 0.0029083346931655855, "rouge2_fmeasure": 0.26903286770558915, "rouge2_fmeasure_stderr": 0.0022307748577894758, "rouge2_precision": 0.31964099722948836, "rouge2_precision_stderr": 0.002777708595069122, "rouge2_recall": 0.2509916858334456, "rouge2_recall_stderr": 0.0023901154240661446, "rougeL_fmeasure": 0.38565827502841116, "rougeL_fmeasure_stderr": 0.0022144421481799675, "rougeL_precision": 0.455321793928234, "rougeL_precision_stderr": 0.0029321326061168074, "rougeL_recall": 0.35903739623304465, "rougeL_recall_stderr": 0.0025252866229682975, "rougeLsum_fmeasure": 0.4394144928671385, "rougeLsum_fmeasure_stderr": 0.002446885490860059, "rougeLsum_precision": 0.5172893895354551, "rougeLsum_precision_stderr": 0.003153574847912617, "rougeLsum_recall": 0.40954269048002123, "rougeLsum_recall_stderr": 0.002825679167152443}}, "4": {"generate_text_restaurant": {"bleu": 16.046714521986758, "bleu_stderr": 0.20143548468737246, "rouge1_fmeasure": 0.5343300631658281, "rouge1_fmeasure_stderr": 0.002276059193634907, "rouge1_precision": 0.6240126615936337, 
"rouge1_precision_stderr": 0.00312720979038774, "rouge1_recall": 0.4989840309368499, "rouge1_recall_stderr": 0.0028177351641601373, "rouge2_fmeasure": 0.2724112279454815, "rouge2_fmeasure_stderr": 0.0022911561617215254, "rouge2_precision": 0.3206493529913017, "rouge2_precision_stderr": 0.0028268414398358497, "rouge2_recall": 0.2544472343665272, "rouge2_recall_stderr": 0.0024038250271984246, "rougeL_fmeasure": 0.38733097602553734, "rougeL_fmeasure_stderr": 0.0022465126119210273, "rougeL_precision": 0.4530934873979934, "rougeL_precision_stderr": 0.0029431217749683455, "rougeL_recall": 0.3614836535348617, "rougeL_recall_stderr": 0.0025050776663639207, "rougeLsum_fmeasure": 0.4465935641244627, "rougeLsum_fmeasure_stderr": 0.0024867305930155463, "rougeLsum_precision": 0.5210977780903223, "rougeLsum_precision_stderr": 0.003193746615616614, "rougeLsum_recall": 0.41707944621670306, "rougeLsum_recall_stderr": 0.002802635738849298}}, "5": {"generate_text_restaurant": {"bleu": 16.068878582309953, "bleu_stderr": 0.16081121483163321, "rouge1_fmeasure": 0.5355837269029773, "rouge1_fmeasure_stderr": 0.002256980681599985, "rouge1_precision": 0.6224102862193265, "rouge1_precision_stderr": 0.003110234568961701, "rouge1_recall": 0.500933748593845, "rouge1_recall_stderr": 0.0027924958992030795, "rouge2_fmeasure": 0.27475970807249195, "rouge2_fmeasure_stderr": 0.0022615182075459324, "rouge2_precision": 0.32190257045006593, "rouge2_precision_stderr": 0.0027959933340407817, "rouge2_recall": 0.2569886621162686, "rouge2_recall_stderr": 0.002381426382114005, "rougeL_fmeasure": 0.3914922502865105, "rougeL_fmeasure_stderr": 0.0022467536238069325, "rougeL_precision": 0.45606984360921154, "rougeL_precision_stderr": 0.002956375747033492, "rougeL_recall": 0.3657446339488385, "rougeL_recall_stderr": 0.002509353106433301, "rougeLsum_fmeasure": 0.4482824729485371, "rougeLsum_fmeasure_stderr": 0.002444818682804833, "rougeLsum_precision": 0.5210361752122326, "rougeLsum_precision_stderr": 0.003175466781222956, "rougeLsum_recall": 0.4190489396414274, "rougeLsum_recall_stderr": 0.0027531052100554392}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9818055644823127, "bleu_stderr": 0.13978823814879407, "rouge1_fmeasure": 0.16636180556058044, "rouge1_fmeasure_stderr": 0.003516551179676636, "rouge1_precision": 0.12872236323811218, "rouge1_precision_stderr": 0.0032462823876021106, "rouge1_recall": 0.2715393984942335, "rouge1_recall_stderr": 0.005900736678094209, "rouge2_fmeasure": 0.03841432091372367, "rouge2_fmeasure_stderr": 0.001586981058162795, "rouge2_precision": 0.028024322491153688, "rouge2_precision_stderr": 0.0011655819703985627, "rouge2_recall": 0.06517448572813263, "rouge2_recall_stderr": 0.002766560068276156, "rougeL_fmeasure": 0.1211817583352299, "rougeL_fmeasure_stderr": 0.0025835665514872593, "rougeL_precision": 0.09534495061706572, "rougeL_precision_stderr": 0.0027658531338571454, "rougeL_recall": 0.19816999370668548, "rougeL_recall_stderr": 0.004414878706261276, "rougeLsum_fmeasure": 0.13466615178498392, "rougeLsum_fmeasure_stderr": 0.002869955857600535, "rougeLsum_precision": 0.10514740754117864, "rougeLsum_precision_stderr": 0.0028961374555318475, "rougeLsum_recall": 0.22067225131355028, "rougeLsum_recall_stderr": 0.004901108311721729}}, "1": {"article_DOC_summary": {"bleu": 2.718621562424974, "bleu_stderr": 0.25634902183378955, "rouge1_fmeasure": 0.2252201544517192, "rouge1_fmeasure_stderr": 0.003572521220076497, "rouge1_precision": 0.23797749292717868, "rouge1_precision_stderr": 0.0044248829684158586, 
"rouge1_recall": 0.23607563998039394, "rouge1_recall_stderr": 0.0037207865629804763, "rouge2_fmeasure": 0.05054236360888691, "rouge2_fmeasure_stderr": 0.0023023749748183903, "rouge2_precision": 0.05608458865614393, "rouge2_precision_stderr": 0.0026939326656790527, "rouge2_recall": 0.0508692034245319, "rouge2_recall_stderr": 0.0022931658802725287, "rougeL_fmeasure": 0.1713099421580951, "rougeL_fmeasure_stderr": 0.0029320038478532935, "rougeL_precision": 0.18150902719333478, "rougeL_precision_stderr": 0.0036305277559064066, "rougeL_recall": 0.179561582857957, "rougeL_recall_stderr": 0.003045132003822578, "rougeLsum_fmeasure": 0.17382401249824367, "rougeLsum_fmeasure_stderr": 0.0029535439613312367, "rougeLsum_precision": 0.18366739499452403, "rougeLsum_precision_stderr": 0.003629882854081677, "rougeLsum_recall": 0.18297857106589824, "rougeLsum_recall_stderr": 0.003137997544172217}}, "2": {"article_DOC_summary": {"bleu": 3.6613673832300515, "bleu_stderr": 0.29305939325429936, "rouge1_fmeasure": 0.2516525371976862, "rouge1_fmeasure_stderr": 0.0037511199505683374, "rouge1_precision": 0.26927175770553385, "rouge1_precision_stderr": 0.004455761662742256, "rouge1_recall": 0.25342716562216744, "rouge1_recall_stderr": 0.0038549474849294127, "rouge2_fmeasure": 0.06264124531231369, "rouge2_fmeasure_stderr": 0.0025022071058794777, "rouge2_precision": 0.06831364960977458, "rouge2_precision_stderr": 0.0028257306475568847, "rouge2_recall": 0.06217985063286193, "rouge2_recall_stderr": 0.0025187198385362495, "rougeL_fmeasure": 0.19077644710608424, "rougeL_fmeasure_stderr": 0.003126557023491442, "rougeL_precision": 0.20401963131847098, "rougeL_precision_stderr": 0.0036937944194900436, "rougeL_recall": 0.19255452445504442, "rougeL_recall_stderr": 0.0032146416769602863, "rougeLsum_fmeasure": 0.19254279514804024, "rougeLsum_fmeasure_stderr": 0.00313403184014492, "rougeLsum_precision": 0.20564280351844771, "rougeLsum_precision_stderr": 0.003691582561213757, "rougeLsum_recall": 0.1946786446186705, "rougeLsum_recall_stderr": 0.0032424612854966943}}}}
8b7178b88b/evaluation/rankeval/8b7178b88b_0.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.328,0.014853842487270334,0
3
+ anli_r2,acc,0.344,0.015029633724408943,0
4
+ anli_r3,acc,0.38333333333333336,0.014041190711780102,0
5
+ arc_challenge,acc,0.2773037542662116,0.013082095839059374,0
6
+ arc_challenge,acc_norm,0.2935153583617747,0.013307250444941118,0
7
+ arc_easy,acc,0.6203703703703703,0.009958037725468567,0
8
+ arc_easy,acc_norm,0.5622895622895623,0.010179856486006906,0
9
+ boolq,acc,0.5345565749235474,0.008724144040604807,1
10
+ cb,acc,0.3392857142857143,0.06384226561930825,1
11
+ cb,f1,0.22990271377368152,,1
12
+ copa,acc,0.74,0.04408440022768077,0
13
+ hellaswag,acc,0.4500099581756622,0.004964779805180658,0
14
+ hellaswag,acc_norm,0.5852419836685919,0.004916733258140278,0
15
+ piqa,acc,0.7486398258977149,0.01012115601681926,0
16
+ piqa,acc_norm,0.7529923830250272,0.01006226814077264,0
17
+ rte,acc,0.4693140794223827,0.03003973059219781,0
18
+ sciq,acc,0.86,0.010978183844357805,0
19
+ sciq,acc_norm,0.793,0.012818553557843984,0
20
+ storycloze_2016,acc,0.7022982362373063,0.010573790208173063,0
21
+ winogrande,acc,0.5643251775848461,0.013935709739615708,0
8b7178b88b/evaluation/rankeval/8b7178b88b_0_lm-eval_global_step84877_2023-01-30-20-00-12_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.014853842487270334
6
- },
7
- "anli_r2": {
8
- "acc": 0.344,
9
- "acc_stderr": 0.015029633724408943
10
- },
11
- "anli_r3": {
12
- "acc": 0.38333333333333336,
13
- "acc_stderr": 0.014041190711780102
14
- },
15
- "cb": {
16
- "acc": 0.3392857142857143,
17
- "acc_stderr": 0.06384226561930825,
18
- "f1": 0.22990271377368152
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768077
23
- },
24
- "hellaswag": {
25
- "acc": 0.4500099581756622,
26
- "acc_stderr": 0.004964779805180658,
27
- "acc_norm": 0.5852419836685919,
28
- "acc_norm_stderr": 0.004916733258140278
29
- },
30
- "rte": {
31
- "acc": 0.4693140794223827,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5643251775848461,
36
- "acc_stderr": 0.013935709739615708
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7022982362373063,
40
- "acc_stderr": 0.010573790208173063
41
- },
42
- "boolq": {
43
- "acc": 0.5345565749235474,
44
- "acc_stderr": 0.008724144040604807
45
- },
46
- "arc_easy": {
47
- "acc": 0.6203703703703703,
48
- "acc_stderr": 0.009958037725468567,
49
- "acc_norm": 0.5622895622895623,
50
- "acc_norm_stderr": 0.010179856486006906
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2773037542662116,
54
- "acc_stderr": 0.013082095839059374,
55
- "acc_norm": 0.2935153583617747,
56
- "acc_norm_stderr": 0.013307250444941118
57
- },
58
- "sciq": {
59
- "acc": 0.86,
60
- "acc_stderr": 0.010978183844357805,
61
- "acc_norm": 0.793,
62
- "acc_norm_stderr": 0.012818553557843984
63
- },
64
- "piqa": {
65
- "acc": 0.7486398258977149,
66
- "acc_stderr": 0.01012115601681926,
67
- "acc_norm": 0.7529923830250272,
68
- "acc_norm_stderr": 0.01006226814077264
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
8b7178b88b/evaluation/rankeval/8b7178b88b_1.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.314,0.014683991951087966,0
3
+ anli_r2,acc,0.347,0.015060472031706615,0
4
+ anli_r3,acc,0.3333333333333333,0.013613950010225606,0
5
+ arc_challenge,acc,0.30204778156996587,0.013417519144716422,0
6
+ arc_challenge,acc_norm,0.32081911262798635,0.013640943091946526,0
7
+ arc_easy,acc,0.6367845117845118,0.009868397136118798,0
8
+ arc_easy,acc_norm,0.6376262626262627,0.00986346820258377,0
9
+ boolq,acc,0.5529051987767584,0.008695963064172729,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3421052631578947,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.45180242979486157,0.004966544724452226,0
14
+ hellaswag,acc_norm,0.5972913762198765,0.004894407257215795,0
15
+ piqa,acc,0.7453754080522307,0.010164432237060489,0
16
+ piqa,acc_norm,0.749727965179543,0.010106561880089767,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.913,0.008916866630745916,0
19
+ sciq,acc_norm,0.917,0.00872852720607479,0
20
+ storycloze_2016,acc,0.7001603420630679,0.010595525174558607,0
21
+ winogrande,acc,0.5832675611681136,0.013856250072796316,0
8b7178b88b/evaluation/rankeval/8b7178b88b_1_lm-eval_global_step84877_2023-01-30-20-00-12_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.314,
5
- "acc_stderr": 0.014683991951087966
6
- },
7
- "anli_r2": {
8
- "acc": 0.347,
9
- "acc_stderr": 0.015060472031706615
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225606
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3421052631578947
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.45180242979486157,
26
- "acc_stderr": 0.004966544724452226,
27
- "acc_norm": 0.5972913762198765,
28
- "acc_norm_stderr": 0.004894407257215795
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.5832675611681136,
36
- "acc_stderr": 0.013856250072796316
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7001603420630679,
40
- "acc_stderr": 0.010595525174558607
41
- },
42
- "boolq": {
43
- "acc": 0.5529051987767584,
44
- "acc_stderr": 0.008695963064172729
45
- },
46
- "arc_easy": {
47
- "acc": 0.6367845117845118,
48
- "acc_stderr": 0.009868397136118798,
49
- "acc_norm": 0.6376262626262627,
50
- "acc_norm_stderr": 0.00986346820258377
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30204778156996587,
54
- "acc_stderr": 0.013417519144716422,
55
- "acc_norm": 0.32081911262798635,
56
- "acc_norm_stderr": 0.013640943091946526
57
- },
58
- "sciq": {
59
- "acc": 0.913,
60
- "acc_stderr": 0.008916866630745916,
61
- "acc_norm": 0.917,
62
- "acc_norm_stderr": 0.00872852720607479
63
- },
64
- "piqa": {
65
- "acc": 0.7453754080522307,
66
- "acc_stderr": 0.010164432237060489,
67
- "acc_norm": 0.749727965179543,
68
- "acc_norm_stderr": 0.010106561880089767
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
8b7178b88b/evaluation/rankeval/8b7178b88b_2.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.296,0.014442734941575018,0
3
+ anli_r2,acc,0.355,0.01513949154378053,0
4
+ anli_r3,acc,0.3233333333333333,0.013508372867300222,0
5
+ arc_challenge,acc,0.3037542662116041,0.013438909184778755,0
6
+ arc_challenge,acc_norm,0.34215017064846415,0.013864152159177278,0
7
+ arc_easy,acc,0.6384680134680135,0.009858506543162063,0
8
+ arc_easy,acc_norm,0.6473063973063973,0.009804420599378657,0
9
+ boolq,acc,0.5724770642201835,0.008652692997177339,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.28226120857699805,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.45737900816570404,0.004971619995879763,0
14
+ hellaswag,acc_norm,0.6022704640509858,0.004884287515461492,0
15
+ piqa,acc,0.7442872687704026,0.010178690109459857,0
16
+ piqa,acc_norm,0.7546245919477693,0.010039831320422386,0
17
+ rte,acc,0.47653429602888087,0.030063300411902652,0
18
+ sciq,acc,0.923,0.008434580140240637,0
19
+ sciq,acc_norm,0.926,0.008282064512704159,0
20
+ storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0
21
+ winogrande,acc,0.5698500394632992,0.013914685094716692,0
8b7178b88b/evaluation/rankeval/8b7178b88b_2_lm-eval_global_step84877_2023-01-30-20-00-12_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.296,
5
- "acc_stderr": 0.014442734941575018
6
- },
7
- "anli_r2": {
8
- "acc": 0.355,
9
- "acc_stderr": 0.01513949154378053
10
- },
11
- "anli_r3": {
12
- "acc": 0.3233333333333333,
13
- "acc_stderr": 0.013508372867300222
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.28226120857699805
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.45737900816570404,
26
- "acc_stderr": 0.004971619995879763,
27
- "acc_norm": 0.6022704640509858,
28
- "acc_norm_stderr": 0.004884287515461492
29
- },
30
- "rte": {
31
- "acc": 0.47653429602888087,
32
- "acc_stderr": 0.030063300411902652
33
- },
34
- "winogrande": {
35
- "acc": 0.5698500394632992,
36
- "acc_stderr": 0.013914685094716692
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6996258685195083,
40
- "acc_stderr": 0.010600915927985021
41
- },
42
- "boolq": {
43
- "acc": 0.5724770642201835,
44
- "acc_stderr": 0.008652692997177339
45
- },
46
- "arc_easy": {
47
- "acc": 0.6384680134680135,
48
- "acc_stderr": 0.009858506543162063,
49
- "acc_norm": 0.6473063973063973,
50
- "acc_norm_stderr": 0.009804420599378657
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3037542662116041,
54
- "acc_stderr": 0.013438909184778755,
55
- "acc_norm": 0.34215017064846415,
56
- "acc_norm_stderr": 0.013864152159177278
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240637,
61
- "acc_norm": 0.926,
62
- "acc_norm_stderr": 0.008282064512704159
63
- },
64
- "piqa": {
65
- "acc": 0.7442872687704026,
66
- "acc_stderr": 0.010178690109459857,
67
- "acc_norm": 0.7546245919477693,
68
- "acc_norm_stderr": 0.010039831320422386
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
8b7178b88b/evaluation/rankeval/8b7178b88b_3.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574883,0
3
+ anli_r2,acc,0.362,0.0152048409129195,0
4
+ anli_r3,acc,0.32916666666666666,0.013570806258433625,0
5
+ arc_challenge,acc,0.3097269624573379,0.01351205841523836,0
6
+ arc_challenge,acc_norm,0.3378839590443686,0.013822047922283512,0
7
+ arc_easy,acc,0.6418350168350169,0.009838331651451848,0
8
+ arc_easy,acc_norm,0.6452020202020202,0.009817629113069694,0
9
+ boolq,acc,0.5685015290519878,0.0086625945690273,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.44163083293518074,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.45628360884286,0.004970672651595845,0
14
+ hellaswag,acc_norm,0.602867954590719,0.004883037758919965,0
15
+ piqa,acc,0.7464635473340587,0.010150090834551791,0
16
+ piqa,acc_norm,0.7480957562568009,0.010128421335088688,0
17
+ rte,acc,0.5523465703971119,0.02993107036293953,0
18
+ sciq,acc,0.924,0.008384169266796403,0
19
+ sciq,acc_norm,0.933,0.007910345983177547,0
20
+ storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0
21
+ winogrande,acc,0.6006314127861089,0.013764933546717612,0
8b7178b88b/evaluation/rankeval/8b7178b88b_3_lm-eval_global_step84877_2023-01-30-20-00-12_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.014758652303574883
6
- },
7
- "anli_r2": {
8
- "acc": 0.362,
9
- "acc_stderr": 0.0152048409129195
10
- },
11
- "anli_r3": {
12
- "acc": 0.32916666666666666,
13
- "acc_stderr": 0.013570806258433625
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.44163083293518074
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.45628360884286,
26
- "acc_stderr": 0.004970672651595845,
27
- "acc_norm": 0.602867954590719,
28
- "acc_norm_stderr": 0.004883037758919965
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.02993107036293953
33
- },
34
- "winogrande": {
35
- "acc": 0.6006314127861089,
36
- "acc_stderr": 0.013764933546717612
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7097808658471406,
40
- "acc_stderr": 0.010495529690730063
41
- },
42
- "boolq": {
43
- "acc": 0.5685015290519878,
44
- "acc_stderr": 0.0086625945690273
45
- },
46
- "arc_easy": {
47
- "acc": 0.6418350168350169,
48
- "acc_stderr": 0.009838331651451848,
49
- "acc_norm": 0.6452020202020202,
50
- "acc_norm_stderr": 0.009817629113069694
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3097269624573379,
54
- "acc_stderr": 0.01351205841523836,
55
- "acc_norm": 0.3378839590443686,
56
- "acc_norm_stderr": 0.013822047922283512
57
- },
58
- "sciq": {
59
- "acc": 0.924,
60
- "acc_stderr": 0.008384169266796403,
61
- "acc_norm": 0.933,
62
- "acc_norm_stderr": 0.007910345983177547
63
- },
64
- "piqa": {
65
- "acc": 0.7464635473340587,
66
- "acc_stderr": 0.010150090834551791,
67
- "acc_norm": 0.7480957562568009,
68
- "acc_norm_stderr": 0.010128421335088688
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
8b7178b88b/evaluation/rankeval/8b7178b88b_4.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.328,0.014853842487270336,0
3
+ anli_r2,acc,0.38,0.015356947477797577,0
4
+ anli_r3,acc,0.3575,0.013840921245257796,0
5
+ arc_challenge,acc,0.2986348122866894,0.013374078615068756,0
6
+ arc_challenge,acc_norm,0.3216723549488055,0.013650488084494162,0
7
+ arc_easy,acc,0.6456228956228957,0.009815004030251743,0
8
+ arc_easy,acc_norm,0.6435185185185185,0.009828046544504438,0
9
+ boolq,acc,0.5596330275229358,0.008682635667686902,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.3799029799029799,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.4539932284405497,0.00496861353930925,0
14
+ hellaswag,acc_norm,0.6053574985062736,0.004877748536428436,0
15
+ piqa,acc,0.7464635473340587,0.010150090834551782,0
16
+ piqa,acc_norm,0.7529923830250272,0.010062268140772644,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.921,0.008534156773333431,0
19
+ sciq,acc_norm,0.938,0.007629823996280307,0
20
+ storycloze_2016,acc,0.7204703367183325,0.010377702099704856,0
21
+ winogrande,acc,0.5832675611681136,0.013856250072796318,0
8b7178b88b/evaluation/rankeval/8b7178b88b_4_lm-eval_global_step84877_2023-01-30-20-00-12_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.014853842487270336
6
- },
7
- "anli_r2": {
8
- "acc": 0.38,
9
- "acc_stderr": 0.015356947477797577
10
- },
11
- "anli_r3": {
12
- "acc": 0.3575,
13
- "acc_stderr": 0.013840921245257796
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.3799029799029799
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.4539932284405497,
26
- "acc_stderr": 0.00496861353930925,
27
- "acc_norm": 0.6053574985062736,
28
- "acc_norm_stderr": 0.004877748536428436
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.5832675611681136,
36
- "acc_stderr": 0.013856250072796318
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7204703367183325,
40
- "acc_stderr": 0.010377702099704856
41
- },
42
- "boolq": {
43
- "acc": 0.5596330275229358,
44
- "acc_stderr": 0.008682635667686902
45
- },
46
- "arc_easy": {
47
- "acc": 0.6456228956228957,
48
- "acc_stderr": 0.009815004030251743,
49
- "acc_norm": 0.6435185185185185,
50
- "acc_norm_stderr": 0.009828046544504438
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2986348122866894,
54
- "acc_stderr": 0.013374078615068756,
55
- "acc_norm": 0.3216723549488055,
56
- "acc_norm_stderr": 0.013650488084494162
57
- },
58
- "sciq": {
59
- "acc": 0.921,
60
- "acc_stderr": 0.008534156773333431,
61
- "acc_norm": 0.938,
62
- "acc_norm_stderr": 0.007629823996280307
63
- },
64
- "piqa": {
65
- "acc": 0.7464635473340587,
66
- "acc_stderr": 0.010150090834551782,
67
- "acc_norm": 0.7529923830250272,
68
- "acc_norm_stderr": 0.010062268140772644
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
8b7178b88b/evaluation/rankeval/8b7178b88b_5.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574869,0
3
+ anli_r2,acc,0.341,0.014998131348402706,0
4
+ anli_r3,acc,0.32166666666666666,0.013490095282989521,0
5
+ arc_challenge,acc,0.30887372013651876,0.013501770929344003,0
6
+ arc_challenge,acc_norm,0.3395904436860068,0.013839039762820164,0
7
+ arc_easy,acc,0.6388888888888888,0.00985601342581124,0
8
+ arc_easy,acc_norm,0.6506734006734006,0.009782853449399295,0
9
+ boolq,acc,0.5626911314984709,0.008676043429497423,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.36734693877551017,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4540928101971719,0.004968705270086761,0
14
+ hellaswag,acc_norm,0.6073491336387173,0.004873421833291568,0
15
+ piqa,acc,0.7464635473340587,0.010150090834551786,0
16
+ piqa,acc_norm,0.7524483133841132,0.010069703966857116,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.921,0.008534156773333438,0
19
+ sciq,acc_norm,0.933,0.007910345983177547,0
20
+ storycloze_2016,acc,0.7172634954569749,0.01041380648612127,0
21
+ winogrande,acc,0.5864246250986582,0.013840971763195308,0
8b7178b88b/evaluation/rankeval/8b7178b88b_5_lm-eval_global_step84877_2023-01-30-20-00-12_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.014758652303574869
6
- },
7
- "anli_r2": {
8
- "acc": 0.341,
9
- "acc_stderr": 0.014998131348402706
10
- },
11
- "anli_r3": {
12
- "acc": 0.32166666666666666,
13
- "acc_stderr": 0.013490095282989521
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.36734693877551017
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.4540928101971719,
26
- "acc_stderr": 0.004968705270086761,
27
- "acc_norm": 0.6073491336387173,
28
- "acc_norm_stderr": 0.004873421833291568
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5864246250986582,
36
- "acc_stderr": 0.013840971763195308
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7172634954569749,
40
- "acc_stderr": 0.01041380648612127
41
- },
42
- "boolq": {
43
- "acc": 0.5626911314984709,
44
- "acc_stderr": 0.008676043429497423
45
- },
46
- "arc_easy": {
47
- "acc": 0.6388888888888888,
48
- "acc_stderr": 0.00985601342581124,
49
- "acc_norm": 0.6506734006734006,
50
- "acc_norm_stderr": 0.009782853449399295
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30887372013651876,
54
- "acc_stderr": 0.013501770929344003,
55
- "acc_norm": 0.3395904436860068,
56
- "acc_norm_stderr": 0.013839039762820164
57
- },
58
- "sciq": {
59
- "acc": 0.921,
60
- "acc_stderr": 0.008534156773333438,
61
- "acc_norm": 0.933,
62
- "acc_norm_stderr": 0.007910345983177547
63
- },
64
- "piqa": {
65
- "acc": 0.7464635473340587,
66
- "acc_stderr": 0.010150090834551786,
67
- "acc_norm": 0.7524483133841132,
68
- "acc_norm_stderr": 0.010069703966857116
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }