diff --git a/8b7178b13b/evaluation/8b7178b13b_1_babi.json b/8b7178b13b/evaluation/8b7178b13b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..489fe20a587e0305caa25f3fac7f6a3826959ba8 --- /dev/null +++ b/8b7178b13b/evaluation/8b7178b13b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.10966666666666666, + "em_stderr": 0.005705916414010263 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/8b7178b13b_2_babi.json b/8b7178b13b/evaluation/8b7178b13b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..cbeb231736fd0458935392317b75d3adb7a02e81 --- /dev/null +++ b/8b7178b13b/evaluation/8b7178b13b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.20533333333333334, + "em_stderr": 0.007376222253753254 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/8b7178b13b_3_babi.json b/8b7178b13b/evaluation/8b7178b13b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..e6893a9721feb7f5d76fbad904dbea81097f6a67 --- /dev/null +++ b/8b7178b13b/evaluation/8b7178b13b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.258, + "em_stderr": 0.007989573064892506 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/8b7178b13b_4_babi.json b/8b7178b13b/evaluation/8b7178b13b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..600a2eede5edee8286a0bf1de3a453d8c4bb2db3 --- /dev/null +++ b/8b7178b13b/evaluation/8b7178b13b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.2843333333333333, + "em_stderr": 0.008237227300544015 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/8b7178b13b_5_babi.json b/8b7178b13b/evaluation/8b7178b13b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..80b71d1db525052a958f4c161a7f897463cbca0d --- /dev/null +++ b/8b7178b13b/evaluation/8b7178b13b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.30933333333333335, + "em_stderr": 0.008440329009701236 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..44fc1be21e677f7e1221e5390135a9c0eac0bbbc --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.29175676576014364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034471150363075184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2607907068748222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002898135289732422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23794097030152866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002188642411213601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08536913371644045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020895235460299017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0713925019948092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001635476891768791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06574644928335649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013625725128389179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22160256648226004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027745204374263316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19691795695851566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002270661716223799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1789951285071888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001673666200127008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.27421900185303183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003293382529360642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24426464874342324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735802329870702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22300315725329103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020678687486032563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.071985313483549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07738970826466988}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..5c9821a69ae233f3cd2dd167d3f070f57829c852 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2571754504445173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037874180373320154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22046706474081984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032483649489955437}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20409132873325006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026059526618525918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07716310931244513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021423793043653955}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06255207307386669, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001665836586847627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05803958336813696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013905246714186119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19917050763282695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031058368371087975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.16863998232504207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025563859107073333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1557150280292743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020175919818759087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.24286851428671166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003627579238772214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2070808578907387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003069864948055298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19198507442281348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002473123474513908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.256877193820952, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07385158775345343}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca5930259595613514bf624be1195da25dc3f3f1 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08608412704276916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031073010197710128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07306085845324418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026739004311174368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06754366655458334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00231821642571142}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02477632495929693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013952242546163309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020963360092888926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011677531292808372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01891360756998624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009547131510927643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06705402166283986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024891810743964266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05633331544121819, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021123430958068772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05180637434259422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017937325092342497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08098856968063572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002946288914451725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06822240197436494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002506023867308553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06316340092042942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021760431723604587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.11803096947131637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018616113127493047}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..94f75fcf87ec41793a7fd28cc05ec3d9dc9b93e3 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.014929032882940911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001437252981426829}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.012605454696432882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012515184538477262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01167038909001005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010944129890059014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0041010832248453595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005815077800226312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0034469110258534192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004725161504235943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003203820995720566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00042103771194584703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.011742235186401492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011661092630625284}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009919184883424567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010141555758155579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009059396220908112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008608509358819988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.014255925532109718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001380098431266019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012021925711305772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001202720217240228}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.011096298465596519, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010440702662319144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.860100637257344e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0205550894986597e-10}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a3a1c014b2155a404ff5b5695941d7afec0130ed --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21592444335846092, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004246983520225218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27255737663479357, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00447437166346409}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22044943741927525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003500198780987625}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.05330214126583765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025012370800294183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06515904722605939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025535061286592504}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05351615652078515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022197484992897607}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16172792138960806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003474612361597086}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20248110828398325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034801799909718704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16421475353571632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028136325952485735}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16634670992387487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034675307885691244}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21304752670955557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003802772338811888}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17052032174118803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002883605135295133}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.4283658434676942, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1084231496409544}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..542425febaa57fbed3b0bc94cc83bc7f9feef4b8 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06147088600466444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003807688121301648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06919835565602203, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004080878740872531}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05798370899016291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003309673469340258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.014228533929974868, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014047860954449383}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0166314025722418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014602317545186565}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013641962174467587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012106481387976049}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04649724897774529, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029768758298026627}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.05168291535676844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003094715210052394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.043392440237132235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002526920635642375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.048035895047127686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003028118969042768}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05464177271239397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033104672738008553}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04537985187674532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026361408489140024}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.40228156399700404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0795066489605694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4f0a77366bfa0e2d31d96601a91e3f9cc50274a7 --- /dev/null +++ b/8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002305114359249611, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006855365150103617}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002688750909485673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008419301936781596}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0022410464211047414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006379815102748624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00023662681974673803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010805221445763836}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003191847290509596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013624996945101362}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0002428157678792438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010121603253679161}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0014967981731443785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00044282917196946015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0018089244853048499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005955143349649119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001480383954453164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004335775279500401}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0014611359517881007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004268628859757995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017987055420995693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005977174922462606}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0014592568247182058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00042833093506174205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.2194816597556972e-29, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.9673150803997765e-21}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a3a1128cf5200263b63d7242847cd071504df5eb 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d50b98ce786c24110901e7da0bfb8f2f0b1a8997df2194229acd5bc122dc22d +size 18605434 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d2d23a8b412034c43bace731f97cedd0fb090a72 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fcc32b2a7a9a4ec85ad6b084bb458f744025984db325f5f2f95bc51178c1ef9 +size 24070805 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..006fa112a1d7fe4b10ed21198d9b53925a3be2aa 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff0281a58876590084570bea0b8feb099efce7b6bd5f1dded593af2c7f2d0146 +size 29380490 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52a506be0237b00c1fbf9e1e955346ec0fb9dcd3 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9209c03a21b26044ca2675e69b17b1e5cd47f660be0226fba1abdd49d99233 +size 34786842 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bcae0c533e25b7367729c97b908937dcb1366f5d 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38077208e1f7fd684e78d720c69585bf6441502b4381400558f457020508dfa +size 9525984 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4427a7514affed305fd09da461d0e4d3dbac2ac0 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf648f5afd69404494ab6936ba7266cd146373c249af58da4c8975a49b6c660 +size 11646229 diff --git a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f1fe872b727def3efaf191ae45a20baa5f5ec091 100644 --- a/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd422c7e3808f695d266dfee392525cea2dbb5f620848c74d4478f52bf23f26f +size 13898127 diff --git a/8b7178b13b/evaluation/generation/merged.csv b/8b7178b13b/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..164b923122c4ea0a25b1fb09bee008159594f7e9 --- /dev/null +++ b/8b7178b13b/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.13957033665597848 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.13957033665597848 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21971297989413593 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21971297989413593 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2436824998963185 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2436824998963185 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2526618416523279 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2526618416523279 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2559926229244319 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2559926229244319 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.259556048619835 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.259556048619835 +e2e_nlg_cleaned,5,average,multiple,0.22852938827383795 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.021176795907655737 +gem_xsum,0,median,rouge2_fmeasure,0.021176795907655737 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.050641256552544464 +gem_xsum,1,median,rouge2_fmeasure,0.050641256552544464 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05359901469303583 +gem_xsum,2,median,rouge2_fmeasure,0.05359901469303583 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05351615652078515 +gem_xsum,3,median,rouge2_fmeasure,0.05351615652078515 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013641962174467587 +gem_xsum,4,median,rouge2_fmeasure,0.013641962174467587 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002428157678792438 +gem_xsum,5,median,rouge2_fmeasure,0.0002428157678792438 +gem_xsum,5,average,multiple,0.032136333602728 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.047608492095606206 +web_nlg_en,0,median,rouge2_fmeasure,0.047608492095606206 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08120001213349398 +web_nlg_en,1,median,rouge2_fmeasure,0.08120001213349398 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.11056055329882854 +web_nlg_en,2,median,rouge2_fmeasure,0.11056055329882854 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.12608234413935157 +web_nlg_en,3,median,rouge2_fmeasure,0.12608234413935157 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.1354322190391961 +web_nlg_en,4,median,rouge2_fmeasure,0.1354322190391961 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1419903076188946 +web_nlg_en,5,median,rouge2_fmeasure,0.1419903076188946 +web_nlg_en,5,average,multiple,0.10714565472089517 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04284409748743667 +wiki_lingua_en,0,median,rouge2_fmeasure,0.04284409748743667 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05107550928141819 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05107550928141819 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06574644928335649 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06574644928335649 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05803958336813696 +wiki_lingua_en,3,median,rouge2_fmeasure,0.05803958336813696 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01891360756998624 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01891360756998624 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003203820995720566 +wiki_lingua_en,5,median,rouge2_fmeasure,0.003203820995720566 +wiki_lingua_en,5,average,multiple,0.039970511331009186 diff --git a/8b7178b13b/evaluation/generation/merged.json b/8b7178b13b/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..13a10c11f5bd7ad90153baa46c3870fe0499f609 --- /dev/null +++ b/8b7178b13b/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4522375218061915, "bleu_stderr": 0.04842537846111568, "rouge1_fmeasure": 0.10639641446726286, "rouge1_fmeasure_stderr": 0.002372783014156511, "rouge1_precision": 0.09405156373772382, "rouge1_precision_stderr": 0.0036589357455526954, "rouge1_recall": 0.2623187491640316, "rouge1_recall_stderr": 0.0054853417994958405, "rouge2_fmeasure": 0.047608492095606206, "rouge2_fmeasure_stderr": 0.001384256918874843, "rouge2_precision": 0.04001471903798784, "rouge2_precision_stderr": 0.002167193193860632, "rouge2_recall": 0.12482367655474384, "rouge2_recall_stderr": 0.003329638916496446, "rougeL_fmeasure": 0.09811489729413486, "rougeL_fmeasure_stderr": 0.002141247906215953, "rougeL_precision": 0.08735958257324784, "rougeL_precision_stderr": 0.0035013157983101876, "rougeL_recall": 0.2450465857818092, "rougeL_recall_stderr": 0.005156857986010396, "rougeLsum_fmeasure": 0.0992851030641096, "rougeLsum_fmeasure_stderr": 0.0022026560740801633, "rougeLsum_precision": 0.08882938411931994, "rougeLsum_precision_stderr": 0.0035668764760974714, "rougeLsum_recall": 0.24452754279908495, "rougeLsum_recall_stderr": 0.005065510424462396}}, "1": {"PALM_prompt": {"bleu": 0.650629434960658, "bleu_stderr": 0.04811417898690189, "rouge1_fmeasure": 0.16281032775122675, "rouge1_fmeasure_stderr": 0.003905335709899557, "rouge1_precision": 0.15916782201345867, "rouge1_precision_stderr": 0.005131870879954661, "rouge1_recall": 0.29665725707782414, "rouge1_recall_stderr": 0.005228319001095806, "rouge2_fmeasure": 0.08120001213349398, "rouge2_fmeasure_stderr": 0.0026664765201647343, "rouge2_precision": 0.07819648278455865, "rouge2_precision_stderr": 0.0034063589997107026, "rouge2_recall": 0.15070155918572603, "rouge2_recall_stderr": 0.003623729048520002, "rougeL_fmeasure": 0.14574992383080743, "rougeL_fmeasure_stderr": 0.003397730356308293, "rougeL_precision": 0.14191196636841233, "rougeL_precision_stderr": 0.004635280920281775, "rougeL_recall": 0.2736224046058516, "rougeL_recall_stderr": 0.004816339822450663, "rougeLsum_fmeasure": 0.14966591828719222, "rougeLsum_fmeasure_stderr": 0.003504137587849123, "rougeLsum_precision": 0.14648404402072937, "rougeLsum_precision_stderr": 0.004776844963861229, "rougeLsum_recall": 0.2774966849512831, "rougeLsum_recall_stderr": 0.004834496186187873}}, "2": {"PALM_prompt": {"bleu": 0.9046528025555703, "bleu_stderr": 0.04317473166983863, "rouge1_fmeasure": 0.2126025964717363, "rouge1_fmeasure_stderr": 0.004487229116514054, "rouge1_precision": 0.21014915270499354, "rouge1_precision_stderr": 0.005813053226743566, "rouge1_recall": 0.35653672883727167, "rouge1_recall_stderr": 0.0051494656409290275, "rouge2_fmeasure": 0.11056055329882854, "rouge2_fmeasure_stderr": 0.0031590233571128605, "rouge2_precision": 0.11091162929767316, "rouge2_precision_stderr": 0.003989043309470051, "rouge2_recall": 0.18668399640135247, "rouge2_recall_stderr": 0.0039177512048306565, "rougeL_fmeasure": 0.18634690018126043, "rougeL_fmeasure_stderr": 0.0038448807973693925, "rougeL_precision": 0.1825110333764981, "rougeL_precision_stderr": 0.005070346564582974, "rougeL_recall": 0.32304322748014636, "rougeL_recall_stderr": 0.004678142532303707, "rougeLsum_fmeasure": 0.19202573601368517, "rougeLsum_fmeasure_stderr": 0.003952591052036519, "rougeLsum_precision": 0.18939634311269352, "rougeLsum_precision_stderr": 0.005258445068941222, "rougeLsum_recall": 0.32966626591290826, "rougeLsum_recall_stderr": 0.004724502434373412}}, "3": {"PALM_prompt": {"bleu": 1.1442800703272336, "bleu_stderr": 0.05161996592570338, "rouge1_fmeasure": 0.23508991254207284, "rouge1_fmeasure_stderr": 0.004629614568550998, "rouge1_precision": 0.23213625362665477, "rouge1_precision_stderr": 0.0059610834558063595, "rouge1_recall": 0.38062292481677, "rouge1_recall_stderr": 0.005078282219671234, "rouge2_fmeasure": 0.12608234413935157, "rouge2_fmeasure_stderr": 0.003256601130342381, "rouge2_precision": 0.12811335412630678, "rouge2_precision_stderr": 0.004200144761611546, "rouge2_recall": 0.20379204157806646, "rouge2_recall_stderr": 0.00386170912683119, "rougeL_fmeasure": 0.20320672155464048, "rougeL_fmeasure_stderr": 0.003882337416765662, "rougeL_precision": 0.1997394107463207, "rougeL_precision_stderr": 0.005153079376868987, "rougeL_recall": 0.3397942315323993, "rougeL_recall_stderr": 0.004538527029075319, "rougeLsum_fmeasure": 0.21069756332821474, "rougeLsum_fmeasure_stderr": 0.004028196190821567, "rougeLsum_precision": 0.2083043346123531, "rougeLsum_precision_stderr": 0.005363455263356501, "rougeLsum_recall": 0.347929987856493, "rougeLsum_recall_stderr": 0.0045764471921679985}}, "4": {"PALM_prompt": {"bleu": 1.3793783786539846, "bleu_stderr": 0.08292725693632953, "rouge1_fmeasure": 0.25107613979636906, "rouge1_fmeasure_stderr": 0.00476323540590602, "rouge1_precision": 0.25146617993410997, "rouge1_precision_stderr": 0.006199020027337637, "rouge1_recall": 0.3925026283739189, "rouge1_recall_stderr": 0.005116605241451321, "rouge2_fmeasure": 0.1354322190391961, "rouge2_fmeasure_stderr": 0.003385535533668589, "rouge2_precision": 0.1385226844845104, "rouge2_precision_stderr": 0.004282956786141916, "rouge2_recall": 0.21131194271802842, "rouge2_recall_stderr": 0.0040385627196912076, "rougeL_fmeasure": 0.2160535699510597, "rougeL_fmeasure_stderr": 0.003987483818543123, "rougeL_precision": 0.21425855000222013, "rougeL_precision_stderr": 0.005261027090142587, "rougeL_recall": 0.34988090714764264, "rougeL_recall_stderr": 0.004590444333954342, "rougeLsum_fmeasure": 0.22537108240785111, "rougeLsum_fmeasure_stderr": 0.004170782290413772, "rougeLsum_precision": 0.22545655976991655, "rougeLsum_precision_stderr": 0.0055589913005229, "rougeLsum_recall": 0.36010176303537716, "rougeLsum_recall_stderr": 0.0046603281582664625}}, "5": {"PALM_prompt": {"bleu": 1.4145829791141218, "bleu_stderr": 0.06756850302725374, "rouge1_fmeasure": 0.25726246836968625, "rouge1_fmeasure_stderr": 0.004880864665323261, "rouge1_precision": 0.25559388874117145, "rouge1_precision_stderr": 0.006265188037282814, "rouge1_recall": 0.4014517195417289, "rouge1_recall_stderr": 0.0052097160639095145, "rouge2_fmeasure": 0.1419903076188946, "rouge2_fmeasure_stderr": 0.003491421839159055, "rouge2_precision": 0.1462902529413211, "rouge2_precision_stderr": 0.004502888503688928, "rouge2_recall": 0.22067604236758182, "rouge2_recall_stderr": 0.0041554431177298085, "rougeL_fmeasure": 0.22145748379950564, "rougeL_fmeasure_stderr": 0.004104691248611666, "rougeL_precision": 0.21838534521953784, "rougeL_precision_stderr": 0.005386484846358847, "rougeL_recall": 0.35807492888309944, "rougeL_recall_stderr": 0.004713215605602484, "rougeLsum_fmeasure": 0.2308946484376556, "rougeLsum_fmeasure_stderr": 0.004281312884008834, "rougeLsum_precision": 0.22909884605448813, "rougeLsum_precision_stderr": 0.00564116547513353, "rougeLsum_recall": 0.3686456415754311, "rougeLsum_recall_stderr": 0.004784333349134369}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.5768645983512664, "bleu_stderr": 0.10390732312462625, "rouge1_fmeasure": 0.1810278543145764, "rouge1_fmeasure_stderr": 0.002359129908964426, "rouge1_precision": 0.1631004447259747, "rouge1_precision_stderr": 0.0025140049334374266, "rouge1_recall": 0.24869580468026348, "rouge1_recall_stderr": 0.0032277820635021014, "rouge2_fmeasure": 0.04284409748743667, "rouge2_fmeasure_stderr": 0.001036773633130634, "rouge2_precision": 0.0385111191751266, "rouge2_precision_stderr": 0.0009986244610694906, "rouge2_recall": 0.05950986614858929, "rouge2_recall_stderr": 0.0015515657908224004, "rougeL_fmeasure": 0.1323060407446597, "rougeL_fmeasure_stderr": 0.001666744596513707, "rougeL_precision": 0.11812150163422086, "rougeL_precision_stderr": 0.0017909292945280444, "rougeL_recall": 0.18668011105548613, "rougeL_recall_stderr": 0.002526858332113489, "rougeLsum_fmeasure": 0.16822692441451978, "rougeLsum_fmeasure_stderr": 0.0021965424695315023, "rougeLsum_precision": 0.15149118025181801, "rougeLsum_precision_stderr": 0.0023436532932745767, "rougeLsum_recall": 0.23193053314919995, "rougeLsum_recall_stderr": 0.003047509010443981}}, "1": {"tldr_en": {"bleu": 3.2805418289148096, "bleu_stderr": 0.06930736886640082, "rouge1_fmeasure": 0.20446918503496525, "rouge1_fmeasure_stderr": 0.0022069372036890365, "rouge1_precision": 0.24381809168774637, "rouge1_precision_stderr": 0.0033022582237546634, "rouge1_recall": 0.23017687741596388, "rouge1_recall_stderr": 0.0028624354435880044, "rouge2_fmeasure": 0.05107550928141819, "rouge2_fmeasure_stderr": 0.0012372719079850197, "rouge2_precision": 0.064059693409422, "rouge2_precision_stderr": 0.0018318331544901892, "rouge2_recall": 0.05717654707897076, "rouge2_recall_stderr": 0.0015139726468610508, "rougeL_fmeasure": 0.15417240714255895, "rougeL_fmeasure_stderr": 0.001662532523275427, "rougeL_precision": 0.18545125118813593, "rougeL_precision_stderr": 0.0026173569724847225, "rougeL_recall": 0.1747030421481033, "rougeL_recall_stderr": 0.0022396644779822898, "rougeLsum_fmeasure": 0.19083537889844354, "rougeLsum_fmeasure_stderr": 0.002054035558945731, "rougeLsum_precision": 0.2280337645476899, "rougeLsum_precision_stderr": 0.003112015382305472, "rougeLsum_recall": 0.21469429192649783, "rougeLsum_recall_stderr": 0.0026546326180329014}}, "2": {"tldr_en": {"bleu": 4.071985313483549, "bleu_stderr": 0.07738970826466988, "rouge1_fmeasure": 0.23794097030152866, "rouge1_fmeasure_stderr": 0.002188642411213601, "rouge1_precision": 0.29175676576014364, "rouge1_precision_stderr": 0.0034471150363075184, "rouge1_recall": 0.2607907068748222, "rouge1_recall_stderr": 0.002898135289732422, "rouge2_fmeasure": 0.06574644928335649, "rouge2_fmeasure_stderr": 0.0013625725128389179, "rouge2_precision": 0.08536913371644045, "rouge2_precision_stderr": 0.0020895235460299017, "rouge2_recall": 0.0713925019948092, "rouge2_recall_stderr": 0.001635476891768791, "rougeL_fmeasure": 0.1789951285071888, "rougeL_fmeasure_stderr": 0.001673666200127008, "rougeL_precision": 0.22160256648226004, "rougeL_precision_stderr": 0.0027745204374263316, "rougeL_recall": 0.19691795695851566, "rougeL_recall_stderr": 0.002270661716223799, "rougeLsum_fmeasure": 0.22300315725329103, "rougeLsum_fmeasure_stderr": 0.0020678687486032563, "rougeLsum_precision": 0.27421900185303183, "rougeLsum_precision_stderr": 0.003293382529360642, "rougeLsum_recall": 0.24426464874342324, "rougeLsum_recall_stderr": 0.002735802329870702}}, "3": {"tldr_en": {"bleu": 3.256877193820952, "bleu_stderr": 0.07385158775345343, "rouge1_fmeasure": 0.20409132873325006, "rouge1_fmeasure_stderr": 0.0026059526618525918, "rouge1_precision": 0.2571754504445173, "rouge1_precision_stderr": 0.0037874180373320154, "rouge1_recall": 0.22046706474081984, "rouge1_recall_stderr": 0.0032483649489955437, "rouge2_fmeasure": 0.05803958336813696, "rouge2_fmeasure_stderr": 0.0013905246714186119, "rouge2_precision": 0.07716310931244513, "rouge2_precision_stderr": 0.0021423793043653955, "rouge2_recall": 0.06255207307386669, "rouge2_recall_stderr": 0.001665836586847627, "rougeL_fmeasure": 0.1557150280292743, "rougeL_fmeasure_stderr": 0.0020175919818759087, "rougeL_precision": 0.19917050763282695, "rougeL_precision_stderr": 0.0031058368371087975, "rougeL_recall": 0.16863998232504207, "rougeL_recall_stderr": 0.0025563859107073333, "rougeLsum_fmeasure": 0.19198507442281348, "rougeLsum_fmeasure_stderr": 0.002473123474513908, "rougeLsum_precision": 0.24286851428671166, "rougeLsum_precision_stderr": 0.003627579238772214, "rougeLsum_recall": 0.2070808578907387, "rougeLsum_recall_stderr": 0.003069864948055298}}, "4": {"tldr_en": {"bleu": 0.11803096947131637, "bleu_stderr": 0.018616113127493047, "rouge1_fmeasure": 0.06754366655458334, "rouge1_fmeasure_stderr": 0.00231821642571142, "rouge1_precision": 0.08608412704276916, "rouge1_precision_stderr": 0.0031073010197710128, "rouge1_recall": 0.07306085845324418, "rouge1_recall_stderr": 0.0026739004311174368, "rouge2_fmeasure": 0.01891360756998624, "rouge2_fmeasure_stderr": 0.0009547131510927643, "rouge2_precision": 0.02477632495929693, "rouge2_precision_stderr": 0.0013952242546163309, "rouge2_recall": 0.020963360092888926, "rouge2_recall_stderr": 0.0011677531292808372, "rougeL_fmeasure": 0.05180637434259422, "rougeL_fmeasure_stderr": 0.0017937325092342497, "rougeL_precision": 0.06705402166283986, "rougeL_precision_stderr": 0.0024891810743964266, "rougeL_recall": 0.05633331544121819, "rougeL_recall_stderr": 0.0021123430958068772, "rougeLsum_fmeasure": 0.06316340092042942, "rougeLsum_fmeasure_stderr": 0.0021760431723604587, "rougeLsum_precision": 0.08098856968063572, "rougeLsum_precision_stderr": 0.002946288914451725, "rougeLsum_recall": 0.06822240197436494, "rougeLsum_recall_stderr": 0.002506023867308553}}, "5": {"tldr_en": {"bleu": 5.860100637257344e-12, "bleu_stderr": 1.0205550894986597e-10, "rouge1_fmeasure": 0.01167038909001005, "rouge1_fmeasure_stderr": 0.0010944129890059014, "rouge1_precision": 0.014929032882940911, "rouge1_precision_stderr": 0.001437252981426829, "rouge1_recall": 0.012605454696432882, "rouge1_recall_stderr": 0.0012515184538477262, "rouge2_fmeasure": 0.003203820995720566, "rouge2_fmeasure_stderr": 0.00042103771194584703, "rouge2_precision": 0.0041010832248453595, "rouge2_precision_stderr": 0.0005815077800226312, "rouge2_recall": 0.0034469110258534192, "rouge2_recall_stderr": 0.0004725161504235943, "rougeL_fmeasure": 0.009059396220908112, "rougeL_fmeasure_stderr": 0.0008608509358819988, "rougeL_precision": 0.011742235186401492, "rougeL_precision_stderr": 0.0011661092630625284, "rougeL_recall": 0.009919184883424567, "rougeL_recall_stderr": 0.0010141555758155579, "rougeLsum_fmeasure": 0.011096298465596519, "rougeLsum_fmeasure_stderr": 0.0010440702662319144, "rougeLsum_precision": 0.014255925532109718, "rougeLsum_precision_stderr": 0.001380098431266019, "rougeLsum_recall": 0.012021925711305772, "rougeLsum_recall_stderr": 0.001202720217240228}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.518228772851459, "bleu_stderr": 0.07988589142118296, "rouge1_fmeasure": 0.30892622395914193, "rouge1_fmeasure_stderr": 0.002362726614809891, "rouge1_precision": 0.2527585937479932, "rouge1_precision_stderr": 0.0019586691047001085, "rouge1_recall": 0.4532685157753863, "rouge1_recall_stderr": 0.0036911339160002263, "rouge2_fmeasure": 0.13957033665597848, "rouge2_fmeasure_stderr": 0.001587978757027655, "rouge2_precision": 0.10827294752994317, "rouge2_precision_stderr": 0.0013193546727602984, "rouge2_recall": 0.21206177532278597, "rouge2_recall_stderr": 0.0024697648114498094, "rougeL_fmeasure": 0.25201104517465034, "rougeL_fmeasure_stderr": 0.0018344182759923726, "rougeL_precision": 0.2074761734719538, "rougeL_precision_stderr": 0.0015484435087844478, "rougeL_recall": 0.3707049758227277, "rougeL_recall_stderr": 0.002992917144452656, "rougeLsum_fmeasure": 0.2721367746591264, "rougeLsum_fmeasure_stderr": 0.002175228967316158, "rougeLsum_precision": 0.22367271169207667, "rougeLsum_precision_stderr": 0.0018161195853035582, "rougeLsum_recall": 0.3994503148641263, "rougeLsum_recall_stderr": 0.0034245642845668594}}, "1": {"generate_text_restaurant": {"bleu": 11.762713013802438, "bleu_stderr": 0.09382576543199773, "rouge1_fmeasure": 0.4698359144705449, "rouge1_fmeasure_stderr": 0.0023428195562317463, "rouge1_precision": 0.5758411505584643, "rouge1_precision_stderr": 0.0032641614095776606, "rouge1_recall": 0.43699974124480134, "rouge1_recall_stderr": 0.0030360517355224794, "rouge2_fmeasure": 0.21971297989413593, "rouge2_fmeasure_stderr": 0.0020517231278393165, "rouge2_precision": 0.27339290904893143, "rouge2_precision_stderr": 0.0027333405330301377, "rouge2_recall": 0.20425825992656632, "rouge2_recall_stderr": 0.0021962926617281136, "rougeL_fmeasure": 0.3397129377482095, "rougeL_fmeasure_stderr": 0.0020808579737579473, "rougeL_precision": 0.4200242586753886, "rougeL_precision_stderr": 0.0030278428531837543, "rougeL_recall": 0.31470569352158173, "rougeL_recall_stderr": 0.002441278894908242, "rougeLsum_fmeasure": 0.3826629288347234, "rougeLsum_fmeasure_stderr": 0.0023388359390624446, "rougeLsum_precision": 0.4702024862705025, "rougeLsum_precision_stderr": 0.0032151475190549403, "rougeLsum_recall": 0.3555458059380285, "rougeLsum_recall_stderr": 0.0027838026083691135}}, "2": {"generate_text_restaurant": {"bleu": 14.028619911508963, "bleu_stderr": 0.17673898243267264, "rouge1_fmeasure": 0.4982696315856735, "rouge1_fmeasure_stderr": 0.002277178664730148, "rouge1_precision": 0.5880328986485642, "rouge1_precision_stderr": 0.0031633644314478966, "rouge1_recall": 0.4720615710167169, "rouge1_recall_stderr": 0.0030042579734499103, "rouge2_fmeasure": 0.2436824998963185, "rouge2_fmeasure_stderr": 0.0021479952999434505, "rouge2_precision": 0.2905416018050732, "rouge2_precision_stderr": 0.0027458331588556565, "rouge2_recall": 0.2313719045642725, "rouge2_recall_stderr": 0.002349650773654754, "rougeL_fmeasure": 0.36315580245879386, "rougeL_fmeasure_stderr": 0.0021103667472393136, "rougeL_precision": 0.4304826242102334, "rougeL_precision_stderr": 0.002921537898047091, "rougeL_recall": 0.34359634632916913, "rougeL_recall_stderr": 0.002538014950574236, "rougeLsum_fmeasure": 0.41230819320152157, "rougeLsum_fmeasure_stderr": 0.0023628359694983633, "rougeLsum_precision": 0.4866663175226494, "rougeLsum_precision_stderr": 0.003134734940667762, "rougeLsum_recall": 0.39070560600420967, "rougeLsum_recall_stderr": 0.002859660656155989}}, "3": {"generate_text_restaurant": {"bleu": 14.824429124321526, "bleu_stderr": 0.1257638219512965, "rouge1_fmeasure": 0.5070937995389966, "rouge1_fmeasure_stderr": 0.0021993659376666874, "rouge1_precision": 0.5903839264246126, "rouge1_precision_stderr": 0.003096593178519806, "rouge1_recall": 0.48303754455261133, "rouge1_recall_stderr": 0.0029313776925495253, "rouge2_fmeasure": 0.2526618416523279, "rouge2_fmeasure_stderr": 0.0021298129637362085, "rouge2_precision": 0.2971819596009762, "rouge2_precision_stderr": 0.0027204840509740583, "rouge2_recall": 0.24110048435208156, "rouge2_recall_stderr": 0.002348630487706421, "rougeL_fmeasure": 0.37097982222923104, "rougeL_fmeasure_stderr": 0.00214061071940383, "rougeL_precision": 0.43347382731719464, "rougeL_precision_stderr": 0.0029257357889719074, "rougeL_recall": 0.3529852168172252, "rougeL_recall_stderr": 0.002550242635255893, "rougeLsum_fmeasure": 0.42203281188344766, "rougeLsum_fmeasure_stderr": 0.0023652228936698657, "rougeLsum_precision": 0.49123394352656125, "rougeLsum_precision_stderr": 0.0031248395828196904, "rougeLsum_recall": 0.4020764458847546, "rougeLsum_recall_stderr": 0.0028446825862533704}}, "4": {"generate_text_restaurant": {"bleu": 15.1456115809235, "bleu_stderr": 0.1628753687758124, "rouge1_fmeasure": 0.5145793651843434, "rouge1_fmeasure_stderr": 0.002244489356919281, "rouge1_precision": 0.5945998345487046, "rouge1_precision_stderr": 0.003074613754297493, "rouge1_recall": 0.48854055206810637, "rouge1_recall_stderr": 0.0028822641580179367, "rouge2_fmeasure": 0.2559926229244319, "rouge2_fmeasure_stderr": 0.002186356460577112, "rouge2_precision": 0.29784097926829245, "rouge2_precision_stderr": 0.0026941678409656135, "rouge2_recall": 0.2436337287979087, "rouge2_recall_stderr": 0.0023716503152493335, "rougeL_fmeasure": 0.3747326640922719, "rougeL_fmeasure_stderr": 0.002148508706418628, "rougeL_precision": 0.433653082379793, "rougeL_precision_stderr": 0.002833488672483008, "rougeL_recall": 0.3558030676746678, "rougeL_recall_stderr": 0.0025159494686373797, "rougeLsum_fmeasure": 0.42740717670424816, "rougeLsum_fmeasure_stderr": 0.0023775373903584217, "rougeLsum_precision": 0.49357503577481815, "rougeLsum_precision_stderr": 0.0030663612183832903, "rougeLsum_recall": 0.4058018094790397, "rougeLsum_recall_stderr": 0.002791634703514705}}, "5": {"generate_text_restaurant": {"bleu": 15.184590176560544, "bleu_stderr": 0.17570671649096134, "rouge1_fmeasure": 0.5172310679190623, "rouge1_fmeasure_stderr": 0.002187662941341786, "rouge1_precision": 0.5980619294865127, "rouge1_precision_stderr": 0.003055147702862821, "rouge1_recall": 0.4899430329979205, "rouge1_recall_stderr": 0.002838541142993511, "rouge2_fmeasure": 0.259556048619835, "rouge2_fmeasure_stderr": 0.002143271876534031, "rouge2_precision": 0.3029600136699186, "rouge2_precision_stderr": 0.002703515351522382, "rouge2_recall": 0.24620276831802643, "rouge2_recall_stderr": 0.0023264378571411816, "rougeL_fmeasure": 0.37807698267399703, "rougeL_fmeasure_stderr": 0.002126725638562394, "rougeL_precision": 0.4380103729999666, "rougeL_precision_stderr": 0.0028424300466825983, "rougeL_recall": 0.35795640195755224, "rougeL_recall_stderr": 0.0024826420662082673, "rougeLsum_fmeasure": 0.4311732922825964, "rougeLsum_fmeasure_stderr": 0.0023482937755900097, "rougeLsum_precision": 0.4982942932710782, "rougeLsum_precision_stderr": 0.0030594319184573462, "rougeLsum_recall": 0.40852131422574206, "rougeLsum_recall_stderr": 0.0027745302682206834}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0840616150366835, "bleu_stderr": 0.10220605780455636, "rouge1_fmeasure": 0.12067679782231505, "rouge1_fmeasure_stderr": 0.003015872334492621, "rouge1_precision": 0.09160247708817207, "rouge1_precision_stderr": 0.002375172469912284, "rouge1_recall": 0.19613476599918667, "rouge1_recall_stderr": 0.005073318605110471, "rouge2_fmeasure": 0.021176795907655737, "rouge2_fmeasure_stderr": 0.0011603757977784665, "rouge2_precision": 0.015487063309988504, "rouge2_precision_stderr": 0.000848981732321504, "rouge2_recall": 0.0363837944571209, "rouge2_recall_stderr": 0.0020790710888273895, "rougeL_fmeasure": 0.09086726074181874, "rougeL_fmeasure_stderr": 0.0022256142858678965, "rougeL_precision": 0.06937634282139736, "rougeL_precision_stderr": 0.001813605739492327, "rougeL_recall": 0.1479963053053532, "rougeL_recall_stderr": 0.0038099944087791725, "rougeLsum_fmeasure": 0.09880097411831448, "rougeLsum_fmeasure_stderr": 0.0024614730448775535, "rougeLsum_precision": 0.07512656050190324, "rougeLsum_precision_stderr": 0.001959507720128778, "rougeLsum_recall": 0.1612082948452722, "rougeLsum_recall_stderr": 0.004219489865399336}}, "1": {"article_DOC_summary": {"bleu": 2.1144830791719307, "bleu_stderr": 0.11445499213864452, "rouge1_fmeasure": 0.22223912621847974, "rouge1_fmeasure_stderr": 0.003038126692025604, "rouge1_precision": 0.2003179694226108, "rouge1_precision_stderr": 0.003646294986793581, "rouge1_recall": 0.3082326819925303, "rouge1_recall_stderr": 0.004421626418186149, "rouge2_fmeasure": 0.050641256552544464, "rouge2_fmeasure_stderr": 0.002016315458850625, "rouge2_precision": 0.04617082546967428, "rouge2_precision_stderr": 0.0021567164563758494, "rouge2_recall": 0.07074171325045796, "rouge2_recall_stderr": 0.0026782882522819163, "rougeL_fmeasure": 0.16589957489115423, "rougeL_fmeasure_stderr": 0.0024349153804753905, "rougeL_precision": 0.15020466828774898, "rougeL_precision_stderr": 0.002974138334891651, "rougeL_recall": 0.23037723268498672, "rougeL_recall_stderr": 0.0034903634923284257, "rougeLsum_fmeasure": 0.17413975005815832, "rougeLsum_fmeasure_stderr": 0.0025334465295397523, "rougeLsum_precision": 0.156190697593295, "rougeLsum_precision_stderr": 0.0029737943854797597, "rougeLsum_recall": 0.2443326358730322, "rougeLsum_recall_stderr": 0.003851152935036917}}, "2": {"article_DOC_summary": {"bleu": 2.2306956500254795, "bleu_stderr": 0.1534426529740081, "rouge1_fmeasure": 0.22874247828238384, "rouge1_fmeasure_stderr": 0.003191868132841587, "rouge1_precision": 0.2167785998186487, "rouge1_precision_stderr": 0.0038703703581663036, "rouge1_recall": 0.29079354607238944, "rouge1_recall_stderr": 0.0042086230091149205, "rouge2_fmeasure": 0.05359901469303583, "rouge2_fmeasure_stderr": 0.002063273466751458, "rouge2_precision": 0.05175962394844965, "rouge2_precision_stderr": 0.002262457698273439, "rouge2_recall": 0.06767519675126271, "rouge2_recall_stderr": 0.002531468931801855, "rougeL_fmeasure": 0.17116883513373068, "rougeL_fmeasure_stderr": 0.002601631095778569, "rougeL_precision": 0.1626685190790419, "rougeL_precision_stderr": 0.003181540362928691, "rougeL_recall": 0.2176434055764335, "rougeL_recall_stderr": 0.0033468188991687773, "rougeLsum_fmeasure": 0.17781283216267912, "rougeLsum_fmeasure_stderr": 0.0026319963165001175, "rougeLsum_precision": 0.16752103628195428, "rougeLsum_precision_stderr": 0.0031476246457933605, "rougeLsum_recall": 0.2286412427632142, "rougeLsum_recall_stderr": 0.003582394216085152}}, "3": {"article_DOC_summary": {"bleu": 2.4283658434676942, "bleu_stderr": 0.1084231496409544, "rouge1_fmeasure": 0.22044943741927525, "rouge1_fmeasure_stderr": 0.003500198780987625, "rouge1_precision": 0.21592444335846092, "rouge1_precision_stderr": 0.004246983520225218, "rouge1_recall": 0.27255737663479357, "rouge1_recall_stderr": 0.00447437166346409, "rouge2_fmeasure": 0.05351615652078515, "rouge2_fmeasure_stderr": 0.0022197484992897607, "rouge2_precision": 0.05330214126583765, "rouge2_precision_stderr": 0.0025012370800294183, "rouge2_recall": 0.06515904722605939, "rouge2_recall_stderr": 0.0025535061286592504, "rougeL_fmeasure": 0.16421475353571632, "rougeL_fmeasure_stderr": 0.0028136325952485735, "rougeL_precision": 0.16172792138960806, "rougeL_precision_stderr": 0.003474612361597086, "rougeL_recall": 0.20248110828398325, "rougeL_recall_stderr": 0.0034801799909718704, "rougeLsum_fmeasure": 0.17052032174118803, "rougeLsum_fmeasure_stderr": 0.002883605135295133, "rougeLsum_precision": 0.16634670992387487, "rougeLsum_precision_stderr": 0.0034675307885691244, "rougeLsum_recall": 0.21304752670955557, "rougeLsum_recall_stderr": 0.003802772338811888}}, "4": {"article_DOC_summary": {"bleu": 0.40228156399700404, "bleu_stderr": 0.0795066489605694, "rouge1_fmeasure": 0.05798370899016291, "rouge1_fmeasure_stderr": 0.003309673469340258, "rouge1_precision": 0.06147088600466444, "rouge1_precision_stderr": 0.003807688121301648, "rouge1_recall": 0.06919835565602203, "rouge1_recall_stderr": 0.004080878740872531, "rouge2_fmeasure": 0.013641962174467587, "rouge2_fmeasure_stderr": 0.0012106481387976049, "rouge2_precision": 0.014228533929974868, "rouge2_precision_stderr": 0.0014047860954449383, "rouge2_recall": 0.0166314025722418, "rouge2_recall_stderr": 0.0014602317545186565, "rougeL_fmeasure": 0.043392440237132235, "rougeL_fmeasure_stderr": 0.002526920635642375, "rougeL_precision": 0.04649724897774529, "rougeL_precision_stderr": 0.0029768758298026627, "rougeL_recall": 0.05168291535676844, "rougeL_recall_stderr": 0.003094715210052394, "rougeLsum_fmeasure": 0.04537985187674532, "rougeLsum_fmeasure_stderr": 0.0026361408489140024, "rougeLsum_precision": 0.048035895047127686, "rougeLsum_precision_stderr": 0.003028118969042768, "rougeLsum_recall": 0.05464177271239397, "rougeLsum_recall_stderr": 0.0033104672738008553}}, "5": {"article_DOC_summary": {"bleu": 1.2194816597556972e-29, "bleu_stderr": 1.9673150803997765e-21, "rouge1_fmeasure": 0.0022410464211047414, "rouge1_fmeasure_stderr": 0.0006379815102748624, "rouge1_precision": 0.002305114359249611, "rouge1_precision_stderr": 0.0006855365150103617, "rouge1_recall": 0.002688750909485673, "rouge1_recall_stderr": 0.0008419301936781596, "rouge2_fmeasure": 0.0002428157678792438, "rouge2_fmeasure_stderr": 0.00010121603253679161, "rouge2_precision": 0.00023662681974673803, "rouge2_precision_stderr": 0.00010805221445763836, "rouge2_recall": 0.0003191847290509596, "rouge2_recall_stderr": 0.00013624996945101362, "rougeL_fmeasure": 0.001480383954453164, "rougeL_fmeasure_stderr": 0.0004335775279500401, "rougeL_precision": 0.0014967981731443785, "rougeL_precision_stderr": 0.00044282917196946015, "rougeL_recall": 0.0018089244853048499, "rougeL_recall_stderr": 0.0005955143349649119, "rougeLsum_fmeasure": 0.0014592568247182058, "rougeLsum_fmeasure_stderr": 0.00042833093506174205, "rougeLsum_precision": 0.0014611359517881007, "rougeLsum_precision_stderr": 0.0004268628859757995, "rougeLsum_recall": 0.0017987055420995693, "rougeLsum_recall_stderr": 0.0005977174922462606}}}} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ee0f1deb0df3054af8b27460aae743a7c68089d7 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.29175676576014364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0034471150363075184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2607907068748222, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002898135289732422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.23794097030152866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002188642411213601 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08536913371644045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0020895235460299017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0713925019948092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001635476891768791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06574644928335649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013625725128389179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.22160256648226004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0027745204374263316 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.19691795695851566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002270661716223799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1789951285071888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001673666200127008 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.27421900185303183, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003293382529360642 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24426464874342324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002735802329870702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22300315725329103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020678687486032563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.071985313483549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07738970826466988 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..985b095b0510648e4dd26c71d12abe57fd6269fb --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2571754504445173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037874180373320154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22046706474081984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032483649489955437 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20409132873325006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026059526618525918 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07716310931244513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021423793043653955 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06255207307386669, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001665836586847627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05803958336813696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013905246714186119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19917050763282695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031058368371087975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.16863998232504207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025563859107073333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1557150280292743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020175919818759087 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.24286851428671166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003627579238772214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2070808578907387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003069864948055298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19198507442281348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002473123474513908 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.256877193820952, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07385158775345343 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c30086afbaf2145db8305b936e78009d118e0d76 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.08608412704276916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0031073010197710128 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07306085845324418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026739004311174368 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06754366655458334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00231821642571142 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02477632495929693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013952242546163309 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.020963360092888926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011677531292808372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01891360756998624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009547131510927643 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06705402166283986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024891810743964266 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05633331544121819, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021123430958068772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05180637434259422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017937325092342497 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08098856968063572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002946288914451725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06822240197436494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002506023867308553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06316340092042942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021760431723604587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.11803096947131637, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.018616113127493047 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b64c8db74d68d11286eb32dff0d01badfcc7d618 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.014929032882940911, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001437252981426829 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.012605454696432882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012515184538477262 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01167038909001005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010944129890059014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0041010832248453595, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005815077800226312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0034469110258534192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004725161504235943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.003203820995720566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00042103771194584703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.011742235186401492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011661092630625284 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009919184883424567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010141555758155579 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.009059396220908112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008608509358819988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.014255925532109718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001380098431266019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012021925711305772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001202720217240228 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.011096298465596519, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010440702662319144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.860100637257344e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.0205550894986597e-10 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4a34580c21d9f7cc9657f475f162d01e2d1b4134 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.21592444335846092, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004246983520225218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27255737663479357, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00447437166346409 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22044943741927525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003500198780987625 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.05330214126583765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0025012370800294183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06515904722605939, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0025535061286592504 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05351615652078515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0022197484992897607 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.16172792138960806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003474612361597086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.20248110828398325, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034801799909718704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16421475353571632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028136325952485735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.16634670992387487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034675307885691244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.21304752670955557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003802772338811888 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17052032174118803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002883605135295133 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.4283658434676942, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1084231496409544 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..036e0c2da42da0d5b9f1d06c0db3106b59f32810 --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06147088600466444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.003807688121301648 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06919835565602203, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004080878740872531 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05798370899016291, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003309673469340258 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.014228533929974868, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014047860954449383 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0166314025722418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014602317545186565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013641962174467587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012106481387976049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.04649724897774529, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0029768758298026627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.05168291535676844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003094715210052394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.043392440237132235, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002526920635642375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.048035895047127686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003028118969042768 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.05464177271239397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0033104672738008553 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04537985187674532, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026361408489140024 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.40228156399700404, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0795066489605694 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e661c4fb67f5758197adffb8a1259c522750d --- /dev/null +++ b/8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002305114359249611, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006855365150103617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002688750909485673, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008419301936781596 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0022410464211047414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006379815102748624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00023662681974673803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00010805221445763836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0003191847290509596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00013624996945101362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0002428157678792438, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00010121603253679161 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0014967981731443785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00044282917196946015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0018089244853048499, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005955143349649119 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.001480383954453164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004335775279500401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0014611359517881007, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004268628859757995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017987055420995693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005977174922462606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0014592568247182058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00042833093506174205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.2194816597556972e-29, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.9673150803997765e-21 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..7e0016035d1ee2f0d600427fa618ee01122cc5f2 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.354,0.015129868238451773,0 +anli_r2,acc,0.331,0.01488827258820394,0 +anli_r3,acc,0.3458333333333333,0.01373624534231101,0 +arc_challenge,acc,0.2721843003412969,0.013006600406423707,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0 +arc_easy,acc,0.5707070707070707,0.010156678075911087,0 +arc_easy,acc_norm,0.5172558922558923,0.010253671674754631,0 +boolq,acc,0.5501529051987768,0.008700950643028801,1 +cb,acc,0.2857142857142857,0.06091449038731724,1 +cb,f1,0.30952380952380953,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.4360685122485561,0.004948824501355485,0 +hellaswag,acc_norm,0.5632344154550887,0.004949716368890496,0 +piqa,acc,0.7225244831338411,0.010446818281039959,0 +piqa,acc_norm,0.7317736670293797,0.010336761992404485,0 +rte,acc,0.5306859205776173,0.03003973059219781,0 +sciq,acc,0.848,0.011358918303475282,0 +sciq,acc_norm,0.758,0.013550631705555958,0 +storycloze_2016,acc,0.6969535008017104,0.010627613073376715,0 +winogrande,acc,0.5666929755327546,0.013926915052757347,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json deleted file mode 100644 index 8ce17533780c8f358c9be167337acd8af6dbb1a0..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.354, - "acc_stderr": 0.015129868238451773 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.01488827258820394 - }, - "anli_r3": { - "acc": 0.3458333333333333, - "acc_stderr": 0.01373624534231101 - }, - "cb": { - "acc": 0.2857142857142857, - "acc_stderr": 0.06091449038731724, - "f1": 0.30952380952380953 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.4360685122485561, - "acc_stderr": 0.004948824501355485, - "acc_norm": 0.5632344154550887, - "acc_norm_stderr": 0.004949716368890496 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.03003973059219781 - }, - "winogrande": { - "acc": 0.5666929755327546, - "acc_stderr": 0.013926915052757347 - }, - "storycloze_2016": { - "acc": 0.6969535008017104, - "acc_stderr": 0.010627613073376715 - }, - "boolq": { - "acc": 0.5501529051987768, - "acc_stderr": 0.008700950643028801 - }, - "arc_easy": { - "acc": 0.5707070707070707, - "acc_stderr": 0.010156678075911087, - "acc_norm": 0.5172558922558923, - "acc_norm_stderr": 0.010253671674754631 - }, - "arc_challenge": { - "acc": 0.2721843003412969, - "acc_stderr": 0.013006600406423707, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134575 - }, - "sciq": { - "acc": 0.848, - "acc_stderr": 0.011358918303475282, - "acc_norm": 0.758, - "acc_norm_stderr": 0.013550631705555958 - }, - "piqa": { - "acc": 0.7225244831338411, - "acc_stderr": 0.010446818281039959, - "acc_norm": 0.7317736670293797, - "acc_norm_stderr": 0.010336761992404485 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..e09b8254720fce0063a1fab117d93bee7b8afcac --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.326,0.014830507204541028,0 +anli_r3,acc,0.3541666666666667,0.01381193349957096,0 +arc_challenge,acc,0.27474402730375425,0.013044617212771227,0 +arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0 +arc_easy,acc,0.5968013468013468,0.010065668576794803,0 +arc_easy,acc_norm,0.5913299663299664,0.01008717449876288,0 +boolq,acc,0.5562691131498471,0.008689501105367413,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.36324786324786323,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4411471818362876,0.004955095096264714,0 +hellaswag,acc_norm,0.5774746066520613,0.004929517011508216,0 +piqa,acc,0.7295973884657236,0.010363167031620784,0 +piqa,acc_norm,0.7334058759521219,0.010316749863541365,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.887,0.010016552866696846,0 +sciq,acc_norm,0.882,0.01020686926438179,0 +storycloze_2016,acc,0.6830571886691609,0.010759650951452121,0 +winogrande,acc,0.5595895816890292,0.013952330311915603,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json deleted file mode 100644 index 7c5e0809944d832c5cf497fea1ecdc9b4c0a1da1..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.014830507204541028 - }, - "anli_r3": { - "acc": 0.3541666666666667, - "acc_stderr": 0.01381193349957096 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.36324786324786323 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4411471818362876, - "acc_stderr": 0.004955095096264714, - "acc_norm": 0.5774746066520613, - "acc_norm_stderr": 0.004929517011508216 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.5595895816890292, - "acc_stderr": 0.013952330311915603 - }, - "storycloze_2016": { - "acc": 0.6830571886691609, - "acc_stderr": 0.010759650951452121 - }, - "boolq": { - "acc": 0.5562691131498471, - "acc_stderr": 0.008689501105367413 - }, - "arc_easy": { - "acc": 0.5968013468013468, - "acc_stderr": 0.010065668576794803, - "acc_norm": 0.5913299663299664, - "acc_norm_stderr": 0.01008717449876288 - }, - "arc_challenge": { - "acc": 0.27474402730375425, - "acc_stderr": 0.013044617212771227, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.01343890918477876 - }, - "sciq": { - "acc": 0.887, - "acc_stderr": 0.010016552866696846, - "acc_norm": 0.882, - "acc_norm_stderr": 0.01020686926438179 - }, - "piqa": { - "acc": 0.7295973884657236, - "acc_stderr": 0.010363167031620784, - "acc_norm": 0.7334058759521219, - "acc_norm_stderr": 0.010316749863541365 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..9e870a8717df33901d0f5f58f38ac6e7e128fe93 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.343,0.015019206922356951,0 +anli_r2,acc,0.339,0.014976758771620349,0 +anli_r3,acc,0.33416666666666667,0.013622434813136774,0 +arc_challenge,acc,0.28924914675767915,0.013250012579393443,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053059,0 +arc_easy,acc,0.6077441077441077,0.010018744689650043,0 +arc_easy,acc_norm,0.6026936026936027,0.010041053078884286,0 +boolq,acc,0.5529051987767584,0.008695963064172717,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.30617283950617286,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4419438358892651,0.004956030970911519,0 +hellaswag,acc_norm,0.5717984465245967,0.004938068627349502,0 +piqa,acc,0.7295973884657236,0.010363167031620784,0 +piqa,acc_norm,0.735038084874864,0.010296557993316042,0 +rte,acc,0.4404332129963899,0.029882123363118726,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.908,0.009144376393151108,0 +storycloze_2016,acc,0.6862640299305185,0.01073017911931762,0 +winogrande,acc,0.5382794001578532,0.014011242594964115,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json deleted file mode 100644 index 34afc508d14f0aa5b4f51ff0ded9a129e1554170..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356951 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620349 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136774 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.30617283950617286 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4419438358892651, - "acc_stderr": 0.004956030970911519, - "acc_norm": 0.5717984465245967, - "acc_norm_stderr": 0.004938068627349502 - }, - "rte": { - "acc": 0.4404332129963899, - "acc_stderr": 0.029882123363118726 - }, - "winogrande": { - "acc": 0.5382794001578532, - "acc_stderr": 0.014011242594964115 - }, - "storycloze_2016": { - "acc": 0.6862640299305185, - "acc_stderr": 0.01073017911931762 - }, - "boolq": { - "acc": 0.5529051987767584, - "acc_stderr": 0.008695963064172717 - }, - "arc_easy": { - "acc": 0.6077441077441077, - "acc_stderr": 0.010018744689650043, - "acc_norm": 0.6026936026936027, - "acc_norm_stderr": 0.010041053078884286 - }, - "arc_challenge": { - "acc": 0.28924914675767915, - "acc_stderr": 0.013250012579393443, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053059 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.908, - "acc_norm_stderr": 0.009144376393151108 - }, - "piqa": { - "acc": 0.7295973884657236, - "acc_stderr": 0.010363167031620784, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316042 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..b82de9c1cdcc0a7fa394b791bff8712121c4b5c1 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408945,0 +anli_r2,acc,0.365,0.0152317762262649,0 +anli_r3,acc,0.3333333333333333,0.013613950010225612,0 +arc_challenge,acc,0.2858361774744027,0.013203196088537369,0 +arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0 +arc_easy,acc,0.6077441077441077,0.010018744689650043,0 +arc_easy,acc_norm,0.6022727272727273,0.010042861602178056,0 +boolq,acc,0.5314984709480123,0.00872768484861531,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.428030303030303,,1 +copa,acc,0.74,0.04408440022768079,0 +hellaswag,acc,0.4431388169687313,0.004957410545559414,0 +hellaswag,acc_norm,0.58105954989046,0.004923772581848488,0 +piqa,acc,0.7323177366702938,0.010330111189370429,0 +piqa,acc_norm,0.735038084874864,0.010296557993316044,0 +rte,acc,0.48736462093862815,0.030086851767188564,0 +sciq,acc,0.913,0.008916866630745923,0 +sciq,acc_norm,0.911,0.009008893392651518,0 +storycloze_2016,acc,0.6916087653661144,0.010679734445487801,0 +winogrande,acc,0.5556432517758485,0.013965196769083555,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json deleted file mode 100644 index 0129ce153d4301d7475f23bcd0f438c2b74ad82b..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408945 - }, - "anli_r2": { - "acc": 0.365, - "acc_stderr": 0.0152317762262649 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.013613950010225612 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.428030303030303 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768079 - }, - "hellaswag": { - "acc": 0.4431388169687313, - "acc_stderr": 0.004957410545559414, - "acc_norm": 0.58105954989046, - "acc_norm_stderr": 0.004923772581848488 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5556432517758485, - "acc_stderr": 0.013965196769083555 - }, - "storycloze_2016": { - "acc": 0.6916087653661144, - "acc_stderr": 0.010679734445487801 - }, - "boolq": { - "acc": 0.5314984709480123, - "acc_stderr": 0.00872768484861531 - }, - "arc_easy": { - "acc": 0.6077441077441077, - "acc_stderr": 0.010018744689650043, - "acc_norm": 0.6022727272727273, - "acc_norm_stderr": 0.010042861602178056 - }, - "arc_challenge": { - "acc": 0.2858361774744027, - "acc_stderr": 0.013203196088537369, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.01343890918477876 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.008916866630745923, - "acc_norm": 0.911, - "acc_norm_stderr": 0.009008893392651518 - }, - "piqa": { - "acc": 0.7323177366702938, - "acc_stderr": 0.010330111189370429, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316044 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..0e54f2795ecab14030186c2d69ca5e1d1e9f36a6 --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.363,0.015213890444671283,0 +anli_r2,acc,0.362,0.0152048409129195,0 +anli_r3,acc,0.3516666666666667,0.013789711695404806,0 +arc_challenge,acc,0.27559726962457337,0.013057169655761838,0 +arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0 +arc_easy,acc,0.6203703703703703,0.009958037725468565,0 +arc_easy,acc_norm,0.6085858585858586,0.010014917532627824,0 +boolq,acc,0.5162079510703363,0.008740459157499082,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.3340305010893247,,1 +copa,acc,0.74,0.04408440022768078,0 +hellaswag,acc,0.44064927305317664,0.004954503606471609,0 +hellaswag,acc_norm,0.5764787890858395,0.004931065434173691,0 +piqa,acc,0.7285092491838956,0.010376251176596135,0 +piqa,acc_norm,0.7393906420021763,0.010241826155811632,0 +rte,acc,0.44765342960288806,0.029931070362939526,0 +sciq,acc,0.91,0.009054390204866444,0 +sciq,acc_norm,0.914,0.008870325962594766,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.5501183898973955,0.013981711904049732,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json deleted file mode 100644 index c883ab101ddcbfa8f37bb65ab04cae9858e4b663..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.363, - "acc_stderr": 0.015213890444671283 - }, - "anli_r2": { - "acc": 0.362, - "acc_stderr": 0.0152048409129195 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404806 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3340305010893247 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768078 - }, - "hellaswag": { - "acc": 0.44064927305317664, - "acc_stderr": 0.004954503606471609, - "acc_norm": 0.5764787890858395, - "acc_norm_stderr": 0.004931065434173691 - }, - "rte": { - "acc": 0.44765342960288806, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5501183898973955, - "acc_stderr": 0.013981711904049732 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5162079510703363, - "acc_stderr": 0.008740459157499082 - }, - "arc_easy": { - "acc": 0.6203703703703703, - "acc_stderr": 0.009958037725468565, - "acc_norm": 0.6085858585858586, - "acc_norm_stderr": 0.010014917532627824 - }, - "arc_challenge": { - "acc": 0.27559726962457337, - "acc_stderr": 0.013057169655761838, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623501 - }, - "sciq": { - "acc": 0.91, - "acc_stderr": 0.009054390204866444, - "acc_norm": 0.914, - "acc_norm_stderr": 0.008870325962594766 - }, - "piqa": { - "acc": 0.7285092491838956, - "acc_stderr": 0.010376251176596135, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.010241826155811632 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv b/8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..050e0b9b9385c1c9e115f914791f88966981568f --- /dev/null +++ b/8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.359,0.015177264224798601,0 +anli_r2,acc,0.335,0.014933117490932573,0 +anli_r3,acc,0.3258333333333333,0.013535422043417454,0 +arc_challenge,acc,0.2832764505119454,0.013167478735134575,0 +arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0 +arc_easy,acc,0.6094276094276094,0.010011059112064243,0 +arc_easy,acc_norm,0.6119528619528619,0.009999295905750666,0 +boolq,acc,0.519571865443425,0.008738352682962235,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.38723751912112364,,1 +copa,acc,0.78,0.04163331998932262,0 +hellaswag,acc,0.44343756223859787,0.0049577508971529426,0 +hellaswag,acc_norm,0.5806612228639714,0.004924424018073683,0 +piqa,acc,0.7247007616974973,0.01042142927736953,0 +piqa,acc_norm,0.7393906420021763,0.010241826155811632,0 +rte,acc,0.48014440433212996,0.0300727231673172,0 +sciq,acc,0.913,0.008916866630745925,0 +sciq,acc_norm,0.917,0.00872852720607479,0 +storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0 +winogrande,acc,0.5540647198105761,0.013970093482330697,0 diff --git a/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json b/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json deleted file mode 100644 index 76f82522b12f592b48bb6adbe292a33f8ac74bb4..0000000000000000000000000000000000000000 --- a/8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.359, - "acc_stderr": 0.015177264224798601 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r3": { - "acc": 0.3258333333333333, - "acc_stderr": 0.013535422043417454 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.38723751912112364 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.44343756223859787, - "acc_stderr": 0.0049577508971529426, - "acc_norm": 0.5806612228639714, - "acc_norm_stderr": 0.004924424018073683 - }, - "rte": { - "acc": 0.48014440433212996, - "acc_stderr": 0.0300727231673172 - }, - "winogrande": { - "acc": 0.5540647198105761, - "acc_stderr": 0.013970093482330697 - }, - "storycloze_2016": { - "acc": 0.6937466595403528, - "acc_stderr": 0.010659088460112754 - }, - "boolq": { - "acc": 0.519571865443425, - "acc_stderr": 0.008738352682962235 - }, - "arc_easy": { - "acc": 0.6094276094276094, - "acc_stderr": 0.010011059112064243, - "acc_norm": 0.6119528619528619, - "acc_norm_stderr": 0.009999295905750666 - }, - "arc_challenge": { - "acc": 0.2832764505119454, - "acc_stderr": 0.013167478735134575, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.01359243151906808 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.008916866630745925, - "acc_norm": 0.917, - "acc_norm_stderr": 0.00872852720607479 - }, - "piqa": { - "acc": 0.7247007616974973, - "acc_stderr": 0.01042142927736953, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.010241826155811632 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/8b7178b178b/evaluation/8b7178b178b_1_babi.json b/8b7178b178b/evaluation/8b7178b178b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..b2ef6a86f3a5243b9eaf20ffa411bbce9db7b4d7 --- /dev/null +++ b/8b7178b178b/evaluation/8b7178b178b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.10766666666666666, + "em_stderr": 0.005659993848227298 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b178b/evaluation/8b7178b178b_2_babi.json b/8b7178b178b/evaluation/8b7178b178b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..0c071085bb040441f08d60ea53da4470f365ca37 --- /dev/null +++ b/8b7178b178b/evaluation/8b7178b178b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.208, + "em_stderr": 0.007411498505927842 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b178b/evaluation/8b7178b178b_3_babi.json b/8b7178b178b/evaluation/8b7178b178b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..1412cef22fbee22d9aa068117ce8dbba3b2c115e --- /dev/null +++ b/8b7178b178b/evaluation/8b7178b178b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.2713333333333333, + "em_stderr": 0.008119472096605799 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b178b/evaluation/8b7178b178b_4_babi.json b/8b7178b178b/evaluation/8b7178b178b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..8cd30a3a6bcfb2a324bf049c980a90789f05a557 --- /dev/null +++ b/8b7178b178b/evaluation/8b7178b178b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.309, + "em_stderr": 0.008437815608561314 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b178b/evaluation/8b7178b178b_5_babi.json b/8b7178b178b/evaluation/8b7178b178b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..019febd269e36d4c05942efcda8d64d7ea8cb2dc --- /dev/null +++ b/8b7178b178b/evaluation/8b7178b178b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.3273333333333333, + "em_stderr": 0.008568540173271721 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_0_babi.json b/8b7178b25b/evaluation/8b7178b25b_0_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..a769e6d217a781dc6103a76b922b0e940ae16405 --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_0_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.0, + "em_stderr": 0.0 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 0, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_1_babi.json b/8b7178b25b/evaluation/8b7178b25b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..4a4fb1d2ac17e8c38890c5aa77d401571944b550 --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.118, + "em_stderr": 0.005890973421765812 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_2_babi.json b/8b7178b25b/evaluation/8b7178b25b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..2f0b5be1568c88dc840b2f4af6efdb6bd6880d4b --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.24333333333333335, + "em_stderr": 0.007835466732772215 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_3_babi.json b/8b7178b25b/evaluation/8b7178b25b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..a84c95f03d9b919a0eb6f530fe9ff4de63071ffb --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.2833333333333333, + "em_stderr": 0.008228472181192749 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_4_babi.json b/8b7178b25b/evaluation/8b7178b25b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..9c6a5a683d871cfa17a57902c0447e15fa118dfe --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.30766666666666664, + "em_stderr": 0.008427710547037915 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/8b7178b25b_5_babi.json b/8b7178b25b/evaluation/8b7178b25b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..d634bf0caf7c419682bb46ab47d43d9215f0cf27 --- /dev/null +++ b/8b7178b25b/evaluation/8b7178b25b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.3273333333333333, + "em_stderr": 0.008568540173271721 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_0_babi.json b/8b7178b35b/evaluation/8b7178b35b_0_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..8c938b8b8b0f79e4afcb65846aa1e39830f8e386 --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_0_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.0, + "em_stderr": 0.0 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 0, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_1_babi.json b/8b7178b35b/evaluation/8b7178b35b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..74bf263bb5bbdca00cba636892f711074c9d8c7f --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.099, + "em_stderr": 0.00545370647402016 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_2_babi.json b/8b7178b35b/evaluation/8b7178b35b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..f7055725dc0dd5cdfe07b34214567eb3a6dec286 --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.21933333333333332, + "em_stderr": 0.007556086214902187 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_3_babi.json b/8b7178b35b/evaluation/8b7178b35b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..97722a84b59e0b4862c51bed9f99113798175831 --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.2823333333333333, + "em_stderr": 0.00821966716987605 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_4_babi.json b/8b7178b35b/evaluation/8b7178b35b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..201036ada308fd2df93b30dd11f3be290300cb20 --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.322, + "em_stderr": 0.008532072750616232 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/8b7178b35b_5_babi.json b/8b7178b35b/evaluation/8b7178b35b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..d403f91bb81a1b24b21a9870e13dc42dc5cf0f6e --- /dev/null +++ b/8b7178b35b/evaluation/8b7178b35b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.344, + "em_stderr": 0.008674469175219012 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_0_babi.json b/8b7178b44b/evaluation/8b7178b44b_0_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f0bfc8d41f7a3a60db7e50d36b8fbf4711b875 --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_0_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.0, + "em_stderr": 0.0 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 0, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_1_babi.json b/8b7178b44b/evaluation/8b7178b44b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..328efc86cf5a258dcfed9294f67ea41acbb10bcb --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.11366666666666667, + "em_stderr": 0.00579597989812855 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_2_babi.json b/8b7178b44b/evaluation/8b7178b44b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..28d549d8c0084b1e3e5167fd202e452c14e35700 --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.229, + "em_stderr": 0.007672849810081522 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_3_babi.json b/8b7178b44b/evaluation/8b7178b44b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..d5c37a8b3436327cb4929cfb909b2948e874a16e --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.288, + "em_stderr": 0.008268905102684231 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_4_babi.json b/8b7178b44b/evaluation/8b7178b44b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd27b6ad561360df2d38794cf58744de9430c71 --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.32466666666666666, + "em_stderr": 0.008550464561773717 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/8b7178b44b_5_babi.json b/8b7178b44b/evaluation/8b7178b44b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..4f1ef07e6390b359e61c3bc61fd5f9526feb5af4 --- /dev/null +++ b/8b7178b44b/evaluation/8b7178b44b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.3303333333333333, + "em_stderr": 0.00858849976325721 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/8b7178b4b_1_babi.json b/8b7178b4b/evaluation/8b7178b4b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..45a0a203574161715054545b97162f16495561b6 --- /dev/null +++ b/8b7178b4b/evaluation/8b7178b4b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.06333333333333334, + "em_stderr": 0.0044475462482082085 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/8b7178b4b_2_babi.json b/8b7178b4b/evaluation/8b7178b4b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..8a321032167a4aea225e492ad6be2fbcf7e73eda --- /dev/null +++ b/8b7178b4b/evaluation/8b7178b4b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.11633333333333333, + "em_stderr": 0.005854746420484682 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/8b7178b4b_3_babi.json b/8b7178b4b/evaluation/8b7178b4b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..ca1333fee554f0720f082317c4c4a9780fbab210 --- /dev/null +++ b/8b7178b4b/evaluation/8b7178b4b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.14266666666666666, + "em_stderr": 0.006386278870795312 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/8b7178b4b_4_babi.json b/8b7178b4b/evaluation/8b7178b4b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..31dfd9cde1fda290d3456d38081e331373c88fc6 --- /dev/null +++ b/8b7178b4b/evaluation/8b7178b4b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.15533333333333332, + "em_stderr": 0.006614343969360118 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/8b7178b4b_5_babi.json b/8b7178b4b/evaluation/8b7178b4b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..b7cb416e78523f1dc44500054946d0f60d616e24 --- /dev/null +++ b/8b7178b4b/evaluation/8b7178b4b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.16333333333333333, + "em_stderr": 0.006750336352025223 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aea840512e642e6ff595b7c4d760689c7ba29c3c --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2148865105675037, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002739851744730188}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24425778319951902, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027078004695038444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19860742023540653, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019669822898704605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.047729902956845004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013778455296036068}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05132654178756232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013072982733870482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.042199560827095785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009984994712996336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.15746129883842613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021243165104716925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.178676546411626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020463162920555584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14373246146905902, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013823332704265676}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.201582658871862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025882580846911967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2298482831292535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025699798075345995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1862201529400756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018350815379350304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.285316312600506, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07318703724203124}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a87336a464c14800d24e374d4d31abcd11463291 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.19123547521805817, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030573229896881894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2015864889648793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003002796407976632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1673664610062123, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022414488889316436}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04120238523634148, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013513476132338967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0425607593666781, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012769091257706392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.035103874107299336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009353728071065693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14158935872924355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024049999199382307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.14757071761127052, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022268964035777597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12162349534045766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015927292615778431}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.17867034243558017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028775711712036097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.18884833907646806, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028264748023701333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15630488036874124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002088106219230967}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.952652551469228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08453362226769602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b9903a74928fa7351b1c2adbf114cf3db8675995 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06543156657179054, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002495986380075559}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.0672266515655875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002521383841330882}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.055242233660189904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019491440817189802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.015701101053758496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010978015281121545}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.015657153764832166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009749437105549986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.012417968712803235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007243737975246956}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.049422424802369726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019329518383636773}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.04994410733045717, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001901335194922695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04087216675333205, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014372727294585936}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.061083176462803424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023412533367671035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06257047928104469, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023392750560332614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.0513939982614947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018096253619321696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.1432851499362598, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018545752951036856}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..79fae5db279a606f1bb39a6fc607723650a5e825 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.01142443298407895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001168348600049017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.010393653713974759, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010577641578929015}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009113409021214056, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008812650320045362}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002752646600307901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004564370123056176}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0025596977500778532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003650435541065097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0021937463666516443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003047993992105857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.008694317655831221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009202143979895897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.007818331013501685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008075435288949127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0067854192472234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006485341464909711}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01065083686424177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011007522060213029}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.009678223252509898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009885816669464665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008458807054678006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008168184748216672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.950848606024447e-11, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.311478808939334e-10}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_3.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..384d14585ab274238fca8e157df7f3cba828073f --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.16259516777001937, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030728200086154447}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.269289410465233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00428240074281679}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18708591173867786, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028078435646932145}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.030072444877442605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001480222722747046}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.04913570989408663, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021386327088809545}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03405049278930678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014753683227482124}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11908382903479889, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023534660363485516}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19655219494376092, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0031758709154479396}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1363637318729216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002077616686086357}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1257413085245787, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023846570386209945}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2121332309440453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0035883574015232677}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.14547126169694422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00221176726638642}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.3354579627853345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0787551002497137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_4.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f148bf236602a79c6aca443e9c3fa999fb519e82 --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04717115702117185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027458547990898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06854685849970572, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004034335498605078}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05005387987888931, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027866581571753205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.008039252239171141, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008177591347774397}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.012871828788783901, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012947257895368077}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00886282988047597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008457399189072509}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03560333982666727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021361702503072603}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.050735327935754304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003012711453091767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.037042998151035164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020607318812993535}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03692496706333677, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002192842113184995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05337510829486164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031922745643679297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0387408600461143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021593263516489917}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.4649759375914046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11179707603605749}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_5.json b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb77f6b25fe17ddd49c527982334fa2c52ca7ebe --- /dev/null +++ b/8b7178b4b/evaluation/generation/agg.8b7178b4b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0016915297589821948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00048713143022114703}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002562936486956815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007574911873960877}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0018881021406329268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005285584147199992}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00016030467176019738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.30332967873648e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003171427458769009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00014300870446452385}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00019917211727136963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.609231087472974e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0013520925095504633, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00037757052914657154}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0021249179272997214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006316914645536129}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0015342199902319133, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004241729776677898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0012885165827922138, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000365613793335358}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0020249663088333218, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006080031860077729}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0014609154009965347, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004094371256569771}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.5081996494898117e-23, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.560689042664534e-18}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c8cd837f42a619e173847e3d748491747ef269f2 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18c3afe45b8ddfefb18189689f85e674edbb2df281c0cd13a79956d2562f641f +size 18710208 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e4a78456a5d23bf1c108bda9ad0981ff46115908 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff191e099539d8665f7960142e3068ab17bc90bf18e6d35d97232b3d9b54cd55 +size 24152546 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..efa7a4f26616eca1a0a7f74380c647fa5db749cf 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c5c2bac61cf5134fcf31126be336867f0cddb5f43c7b57c8b2ac9e374ffd4df +size 29409598 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..eac2e87d9d71976e9b0d20c94368ee3978730b82 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ec6448227400f9aeb000569866b30b543b1f2a50ffd5592de1e5015d5b15d9 +size 34789561 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7bba17e34235097bd29687bb199b9a6def9b94c9 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2db65822d73e2f4f211eee8c864f74c1fbacd2e309585090700ea3de44898acc +size 9576964 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2e4f0cd422d68f04b3848422a277489810503871 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6737fc7837c87ea10d35101dce149e60c2cd86fe7e0f65bce17f5457a91653 +size 11657891 diff --git a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5e7732e63e643e344c3cf7a0883970148ba257af 100644 --- a/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b4b/evaluation/generation/examples.8b7178b4b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df124e70d00bf1899c70d77a353edf34f0e57984519c5ed29e38b79f6f5b50f3 +size 13898550 diff --git a/8b7178b4b/evaluation/generation/merged.csv b/8b7178b4b/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..ea6604cc84a582f66ff51ca5c1aec789bf3e71e6 --- /dev/null +++ b/8b7178b4b/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.03042186977591868 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.03042186977591868 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.18077934814330301 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.18077934814330301 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19532447735079128 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.19532447735079128 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19990423001151691 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19990423001151691 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20150390158816628 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20150390158816628 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19983825989563073 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19983825989563073 +e2e_nlg_cleaned,5,average,multiple,0.16796201446088782 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0350647220803466 +gem_xsum,0,median,rouge2_fmeasure,0.0350647220803466 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.032271789589786035 +gem_xsum,1,median,rouge2_fmeasure,0.032271789589786035 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.033326130922902826 +gem_xsum,2,median,rouge2_fmeasure,0.033326130922902826 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03405049278930678 +gem_xsum,3,median,rouge2_fmeasure,0.03405049278930678 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.00886282988047597 +gem_xsum,4,median,rouge2_fmeasure,0.00886282988047597 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00019917211727136963 +gem_xsum,5,median,rouge2_fmeasure,0.00019917211727136963 +gem_xsum,5,average,multiple,0.023962522896681597 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.033752443331005515 +web_nlg_en,0,median,rouge2_fmeasure,0.033752443331005515 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04323640759220055 +web_nlg_en,1,median,rouge2_fmeasure,0.04323640759220055 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05777470204170961 +web_nlg_en,2,median,rouge2_fmeasure,0.05777470204170961 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05726772787270205 +web_nlg_en,3,median,rouge2_fmeasure,0.05726772787270205 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.058213068413064326 +web_nlg_en,4,median,rouge2_fmeasure,0.058213068413064326 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05969397137655782 +web_nlg_en,5,median,rouge2_fmeasure,0.05969397137655782 +web_nlg_en,5,average,multiple,0.051656386771206646 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033836058888840864 +wiki_lingua_en,0,median,rouge2_fmeasure,0.033836058888840864 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03663313230489769 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03663313230489769 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.042199560827095785 +wiki_lingua_en,2,median,rouge2_fmeasure,0.042199560827095785 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.035103874107299336 +wiki_lingua_en,3,median,rouge2_fmeasure,0.035103874107299336 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012417968712803235 +wiki_lingua_en,4,median,rouge2_fmeasure,0.012417968712803235 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0021937463666516443 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0021937463666516443 +wiki_lingua_en,5,average,multiple,0.027064056867931424 diff --git a/8b7178b4b/evaluation/generation/merged.json b/8b7178b4b/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..6cee9deead9bda3302480a8f8740e5daf3e444bd --- /dev/null +++ b/8b7178b4b/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4803306200633949, "bleu_stderr": 0.03922665187840482, "rouge1_fmeasure": 0.0888874783392969, "rouge1_fmeasure_stderr": 0.002307169699507323, "rouge1_precision": 0.09749321578563235, "rouge1_precision_stderr": 0.004124607823022986, "rouge1_recall": 0.22115727913442573, "rouge1_recall_stderr": 0.0050978117694942685, "rouge2_fmeasure": 0.033752443331005515, "rouge2_fmeasure_stderr": 0.0013600177462277827, "rouge2_precision": 0.04063067381458356, "rouge2_precision_stderr": 0.002733828953206739, "rouge2_recall": 0.08090403229418475, "rouge2_recall_stderr": 0.0027170770238036275, "rougeL_fmeasure": 0.07654187125556941, "rougeL_fmeasure_stderr": 0.001983666786078678, "rougeL_precision": 0.08671383574091343, "rougeL_precision_stderr": 0.0038475242033281618, "rougeL_recall": 0.1903877587871982, "rougeL_recall_stderr": 0.0043704846432708454, "rougeLsum_fmeasure": 0.08201086825686166, "rougeLsum_fmeasure_stderr": 0.0020987019677804358, "rougeLsum_precision": 0.09107192409144173, "rougeLsum_precision_stderr": 0.003912839871748979, "rougeLsum_recall": 0.20423975142854273, "rougeLsum_recall_stderr": 0.004708987216605135}}, "1": {"PALM_prompt": {"bleu": 0.465697232207848, "bleu_stderr": 0.03534355530426256, "rouge1_fmeasure": 0.10725647240526642, "rouge1_fmeasure_stderr": 0.0026437266698279512, "rouge1_precision": 0.12432716261670505, "rouge1_precision_stderr": 0.004322960235339739, "rouge1_recall": 0.20273474090920957, "rouge1_recall_stderr": 0.004670403351552389, "rouge2_fmeasure": 0.04323640759220055, "rouge2_fmeasure_stderr": 0.0016133525846048075, "rouge2_precision": 0.05159050607380015, "rouge2_precision_stderr": 0.0027210678933735815, "rouge2_recall": 0.08117469428374954, "rouge2_recall_stderr": 0.002704734869165859, "rougeL_fmeasure": 0.09346609534458801, "rougeL_fmeasure_stderr": 0.002306522228638754, "rougeL_precision": 0.1113668929957849, "rougeL_precision_stderr": 0.004043291317242354, "rougeL_recall": 0.17661426908038205, "rougeL_recall_stderr": 0.0040557849963649304, "rougeLsum_fmeasure": 0.09920590503684244, "rougeLsum_fmeasure_stderr": 0.0024409617183076453, "rougeLsum_precision": 0.116809464228312, "rougeLsum_precision_stderr": 0.004158408301421081, "rougeLsum_recall": 0.1875285811767148, "rougeLsum_recall_stderr": 0.00431164508206456}}, "2": {"PALM_prompt": {"bleu": 0.6103102492809146, "bleu_stderr": 0.02356171040986765, "rouge1_fmeasure": 0.13414128458830274, "rouge1_fmeasure_stderr": 0.0029984112072223607, "rouge1_precision": 0.14738537883933334, "rouge1_precision_stderr": 0.0044727724401315075, "rouge1_recall": 0.2507529143198021, "rouge1_recall_stderr": 0.005031280576422003, "rouge2_fmeasure": 0.05777470204170961, "rouge2_fmeasure_stderr": 0.0019127299451402037, "rouge2_precision": 0.06597103492628528, "rouge2_precision_stderr": 0.0029646723987533184, "rouge2_recall": 0.10691813370348757, "rouge2_recall_stderr": 0.003120430759212716, "rougeL_fmeasure": 0.11608855020499782, "rougeL_fmeasure_stderr": 0.002571881305673696, "rougeL_precision": 0.13069187555646603, "rougeL_precision_stderr": 0.004120235967265702, "rougeL_recall": 0.21777403694382114, "rougeL_recall_stderr": 0.004378238262258649, "rougeLsum_fmeasure": 0.1229596925873642, "rougeLsum_fmeasure_stderr": 0.0026888967207341083, "rougeLsum_precision": 0.1367822736828381, "rougeLsum_precision_stderr": 0.0042172621655552295, "rougeLsum_recall": 0.23110070419427364, "rougeLsum_recall_stderr": 0.004580544215458525}}, "3": {"PALM_prompt": {"bleu": 0.5759982325181904, "bleu_stderr": 0.03527399325206937, "rouge1_fmeasure": 0.13521047194045263, "rouge1_fmeasure_stderr": 0.002946666885970357, "rouge1_precision": 0.13728366988693166, "rouge1_precision_stderr": 0.004178415613322461, "rouge1_recall": 0.26401851266062687, "rouge1_recall_stderr": 0.004860743796395868, "rouge2_fmeasure": 0.05726772787270205, "rouge2_fmeasure_stderr": 0.0019419549346478611, "rouge2_precision": 0.06043304953115908, "rouge2_precision_stderr": 0.0027840963736740568, "rouge2_recall": 0.11060391739306118, "rouge2_recall_stderr": 0.0031505455023900977, "rougeL_fmeasure": 0.11644101786871884, "rougeL_fmeasure_stderr": 0.002552425341599467, "rougeL_precision": 0.12047800800814502, "rougeL_precision_stderr": 0.003815761870962405, "rougeL_recall": 0.22821304414780252, "rougeL_recall_stderr": 0.004206291399993297, "rougeLsum_fmeasure": 0.12339106867125381, "rougeLsum_fmeasure_stderr": 0.002641425885894831, "rougeLsum_precision": 0.1263917463627956, "rougeLsum_precision_stderr": 0.0039042563333160697, "rougeLsum_recall": 0.2425977982336775, "rougeLsum_recall_stderr": 0.0044346343445859976}}, "4": {"PALM_prompt": {"bleu": 0.5802809506773131, "bleu_stderr": 0.025615601301598487, "rouge1_fmeasure": 0.13671898110048547, "rouge1_fmeasure_stderr": 0.0028952468126031902, "rouge1_precision": 0.13230778253839034, "rouge1_precision_stderr": 0.003884437321031469, "rouge1_recall": 0.27790387337477124, "rouge1_recall_stderr": 0.004980688671328301, "rouge2_fmeasure": 0.058213068413064326, "rouge2_fmeasure_stderr": 0.0018668447313456471, "rouge2_precision": 0.058991661796684115, "rouge2_precision_stderr": 0.0025815087906904998, "rouge2_recall": 0.11818030987062483, "rouge2_recall_stderr": 0.003237712515808151, "rougeL_fmeasure": 0.11636850879508512, "rougeL_fmeasure_stderr": 0.0024710668224032523, "rougeL_precision": 0.11432856518176837, "rougeL_precision_stderr": 0.0034885395539469973, "rougeL_recall": 0.23807284258939296, "rougeL_recall_stderr": 0.0042856985659966325, "rougeLsum_fmeasure": 0.123832993107892, "rougeLsum_fmeasure_stderr": 0.0025812067745189238, "rougeLsum_precision": 0.12104142772513969, "rougeLsum_precision_stderr": 0.003609978991916031, "rougeLsum_recall": 0.25291366271666127, "rougeLsum_recall_stderr": 0.004522821728166606}}, "5": {"PALM_prompt": {"bleu": 0.5232654630013782, "bleu_stderr": 0.03271211394726812, "rouge1_fmeasure": 0.1397323283797567, "rouge1_fmeasure_stderr": 0.0030406297754937976, "rouge1_precision": 0.134681789984105, "rouge1_precision_stderr": 0.004035522030055344, "rouge1_recall": 0.2837323351731113, "rouge1_recall_stderr": 0.005030933216110844, "rouge2_fmeasure": 0.05969397137655782, "rouge2_fmeasure_stderr": 0.0020540436888755523, "rouge2_precision": 0.06040913734494076, "rouge2_precision_stderr": 0.0027210403294652174, "rouge2_recall": 0.11771805532151657, "rouge2_recall_stderr": 0.0032283126968023984, "rougeL_fmeasure": 0.11934468101285645, "rougeL_fmeasure_stderr": 0.0026176434668687075, "rougeL_precision": 0.11697019622096808, "rougeL_precision_stderr": 0.0036597376960975565, "rougeL_recall": 0.24267161814100727, "rougeL_recall_stderr": 0.004320464958041179, "rougeLsum_fmeasure": 0.12663501281616962, "rougeLsum_fmeasure_stderr": 0.0027062791991269526, "rougeLsum_precision": 0.12297542267548232, "rougeLsum_precision_stderr": 0.0037324972302743187, "rougeLsum_recall": 0.2590855849671594, "rougeLsum_recall_stderr": 0.004587447515119942}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6135686793887178, "bleu_stderr": 0.04971316918117406, "rouge1_fmeasure": 0.17665965443882184, "rouge1_fmeasure_stderr": 0.002014130246248338, "rouge1_precision": 0.16000268038564175, "rouge1_precision_stderr": 0.0021800774640497273, "rouge1_recall": 0.24345519983015207, "rouge1_recall_stderr": 0.002829441088525956, "rouge2_fmeasure": 0.033836058888840864, "rouge2_fmeasure_stderr": 0.0008213620182318017, "rouge2_precision": 0.03086659328640516, "rouge2_precision_stderr": 0.0008452391827844442, "rouge2_recall": 0.04840841521099736, "rouge2_recall_stderr": 0.0013238724956839457, "rougeL_fmeasure": 0.12729000803179405, "rougeL_fmeasure_stderr": 0.0013713478976488888, "rougeL_precision": 0.11484025855748327, "rougeL_precision_stderr": 0.0015362475046763462, "rougeL_recall": 0.17951598044474718, "rougeL_recall_stderr": 0.0021522131234445233, "rougeLsum_fmeasure": 0.16553902757578193, "rougeLsum_fmeasure_stderr": 0.0018824490145159468, "rougeLsum_precision": 0.14995043724037405, "rougeLsum_precision_stderr": 0.0020495091991268337, "rougeLsum_recall": 0.22849120959581118, "rougeLsum_recall_stderr": 0.0026738153161479645}}, "1": {"tldr_en": {"bleu": 1.9707762278111534, "bleu_stderr": 0.0696813647483855, "rouge1_fmeasure": 0.18431658076230925, "rouge1_fmeasure_stderr": 0.002026089256728527, "rouge1_precision": 0.1922268571354721, "rouge1_precision_stderr": 0.0025647688718406286, "rouge1_recall": 0.23236483151018725, "rouge1_recall_stderr": 0.0027657271740794, "rouge2_fmeasure": 0.03663313230489769, "rouge2_fmeasure_stderr": 0.0009211534108775115, "rouge2_precision": 0.03917874124344432, "rouge2_precision_stderr": 0.0011712262946686964, "rouge2_recall": 0.04678999104420162, "rouge2_recall_stderr": 0.0012769588178389568, "rougeL_fmeasure": 0.1321704020131924, "rougeL_fmeasure_stderr": 0.001370383174763437, "rougeL_precision": 0.13957223403441069, "rougeL_precision_stderr": 0.001910857884625806, "rougeL_recall": 0.16898108095060202, "rougeL_recall_stderr": 0.002038746570765352, "rougeLsum_fmeasure": 0.17295031855502907, "rougeLsum_fmeasure_stderr": 0.0018851344906411418, "rougeLsum_precision": 0.18050979698101216, "rougeLsum_precision_stderr": 0.0024077640439710032, "rougeLsum_recall": 0.2183674188032807, "rougeLsum_recall_stderr": 0.0025986661661154395}}, "2": {"tldr_en": {"bleu": 2.285316312600506, "bleu_stderr": 0.07318703724203124, "rouge1_fmeasure": 0.19860742023540653, "rouge1_fmeasure_stderr": 0.0019669822898704605, "rouge1_precision": 0.2148865105675037, "rouge1_precision_stderr": 0.002739851744730188, "rouge1_recall": 0.24425778319951902, "rouge1_recall_stderr": 0.0027078004695038444, "rouge2_fmeasure": 0.042199560827095785, "rouge2_fmeasure_stderr": 0.0009984994712996336, "rouge2_precision": 0.047729902956845004, "rouge2_precision_stderr": 0.0013778455296036068, "rouge2_recall": 0.05132654178756232, "rouge2_recall_stderr": 0.0013072982733870482, "rougeL_fmeasure": 0.14373246146905902, "rougeL_fmeasure_stderr": 0.0013823332704265676, "rougeL_precision": 0.15746129883842613, "rougeL_precision_stderr": 0.0021243165104716925, "rougeL_recall": 0.178676546411626, "rougeL_recall_stderr": 0.0020463162920555584, "rougeLsum_fmeasure": 0.1862201529400756, "rougeLsum_fmeasure_stderr": 0.0018350815379350304, "rougeLsum_precision": 0.201582658871862, "rougeLsum_precision_stderr": 0.0025882580846911967, "rougeLsum_recall": 0.2298482831292535, "rougeLsum_recall_stderr": 0.0025699798075345995}}, "3": {"tldr_en": {"bleu": 1.952652551469228, "bleu_stderr": 0.08453362226769602, "rouge1_fmeasure": 0.1673664610062123, "rouge1_fmeasure_stderr": 0.0022414488889316436, "rouge1_precision": 0.19123547521805817, "rouge1_precision_stderr": 0.0030573229896881894, "rouge1_recall": 0.2015864889648793, "rouge1_recall_stderr": 0.003002796407976632, "rouge2_fmeasure": 0.035103874107299336, "rouge2_fmeasure_stderr": 0.0009353728071065693, "rouge2_precision": 0.04120238523634148, "rouge2_precision_stderr": 0.0013513476132338967, "rouge2_recall": 0.0425607593666781, "rouge2_recall_stderr": 0.0012769091257706392, "rougeL_fmeasure": 0.12162349534045766, "rougeL_fmeasure_stderr": 0.0015927292615778431, "rougeL_precision": 0.14158935872924355, "rougeL_precision_stderr": 0.0024049999199382307, "rougeL_recall": 0.14757071761127052, "rougeL_recall_stderr": 0.0022268964035777597, "rougeLsum_fmeasure": 0.15630488036874124, "rougeLsum_fmeasure_stderr": 0.002088106219230967, "rougeLsum_precision": 0.17867034243558017, "rougeLsum_precision_stderr": 0.0028775711712036097, "rougeLsum_recall": 0.18884833907646806, "rougeLsum_recall_stderr": 0.0028264748023701333}}, "4": {"tldr_en": {"bleu": 0.1432851499362598, "bleu_stderr": 0.018545752951036856, "rouge1_fmeasure": 0.055242233660189904, "rouge1_fmeasure_stderr": 0.0019491440817189802, "rouge1_precision": 0.06543156657179054, "rouge1_precision_stderr": 0.002495986380075559, "rouge1_recall": 0.0672266515655875, "rouge1_recall_stderr": 0.002521383841330882, "rouge2_fmeasure": 0.012417968712803235, "rouge2_fmeasure_stderr": 0.0007243737975246956, "rouge2_precision": 0.015701101053758496, "rouge2_precision_stderr": 0.0010978015281121545, "rouge2_recall": 0.015657153764832166, "rouge2_recall_stderr": 0.0009749437105549986, "rougeL_fmeasure": 0.04087216675333205, "rougeL_fmeasure_stderr": 0.0014372727294585936, "rougeL_precision": 0.049422424802369726, "rougeL_precision_stderr": 0.0019329518383636773, "rougeL_recall": 0.04994410733045717, "rougeL_recall_stderr": 0.001901335194922695, "rougeLsum_fmeasure": 0.0513939982614947, "rougeLsum_fmeasure_stderr": 0.0018096253619321696, "rougeLsum_precision": 0.061083176462803424, "rougeLsum_precision_stderr": 0.0023412533367671035, "rougeLsum_recall": 0.06257047928104469, "rougeLsum_recall_stderr": 0.0023392750560332614}}, "5": {"tldr_en": {"bleu": 6.950848606024447e-11, "bleu_stderr": 5.311478808939334e-10, "rouge1_fmeasure": 0.009113409021214056, "rouge1_fmeasure_stderr": 0.0008812650320045362, "rouge1_precision": 0.01142443298407895, "rouge1_precision_stderr": 0.001168348600049017, "rouge1_recall": 0.010393653713974759, "rouge1_recall_stderr": 0.0010577641578929015, "rouge2_fmeasure": 0.0021937463666516443, "rouge2_fmeasure_stderr": 0.0003047993992105857, "rouge2_precision": 0.002752646600307901, "rouge2_precision_stderr": 0.0004564370123056176, "rouge2_recall": 0.0025596977500778532, "rouge2_recall_stderr": 0.0003650435541065097, "rougeL_fmeasure": 0.0067854192472234, "rougeL_fmeasure_stderr": 0.0006485341464909711, "rougeL_precision": 0.008694317655831221, "rougeL_precision_stderr": 0.0009202143979895897, "rougeL_recall": 0.007818331013501685, "rougeL_recall_stderr": 0.0008075435288949127, "rougeLsum_fmeasure": 0.008458807054678006, "rougeLsum_fmeasure_stderr": 0.0008168184748216672, "rougeLsum_precision": 0.01065083686424177, "rougeLsum_precision_stderr": 0.0011007522060213029, "rougeLsum_recall": 0.009678223252509898, "rougeLsum_recall_stderr": 0.0009885816669464665}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.6236635427104145, "bleu_stderr": 0.044000094823062975, "rouge1_fmeasure": 0.14178833842033947, "rouge1_fmeasure_stderr": 0.001366625858965996, "rouge1_precision": 0.13847763686627435, "rouge1_precision_stderr": 0.0016031339752114486, "rouge1_recall": 0.1670883161814056, "rouge1_recall_stderr": 0.001801269645905829, "rouge2_fmeasure": 0.03042186977591868, "rouge2_fmeasure_stderr": 0.0008025297811097743, "rouge2_precision": 0.03298433134895783, "rouge2_precision_stderr": 0.0009227541327962853, "rouge2_recall": 0.032152570390450726, "rouge2_recall_stderr": 0.0008807998538373043, "rougeL_fmeasure": 0.11479687580745425, "rougeL_fmeasure_stderr": 0.0010565693706085712, "rougeL_precision": 0.11440682005524515, "rougeL_precision_stderr": 0.0013890926638196878, "rougeL_recall": 0.13321300262366384, "rougeL_recall_stderr": 0.0013422075252029256, "rougeLsum_fmeasure": 0.12928537009000582, "rougeLsum_fmeasure_stderr": 0.0012449807807250097, "rougeLsum_precision": 0.12720508529865068, "rougeLsum_precision_stderr": 0.001519793803598763, "rougeLsum_recall": 0.1514931756280118, "rougeLsum_recall_stderr": 0.0016134722302035285}}, "1": {"generate_text_restaurant": {"bleu": 10.22257849977036, "bleu_stderr": 0.1358773638028331, "rouge1_fmeasure": 0.41278812145675686, "rouge1_fmeasure_stderr": 0.0024542911795773087, "rouge1_precision": 0.4871448147910172, "rouge1_precision_stderr": 0.0034540622067210382, "rouge1_recall": 0.3987305040433544, "rouge1_recall_stderr": 0.002896454163975001, "rouge2_fmeasure": 0.18077934814330301, "rouge2_fmeasure_stderr": 0.0019771106322196753, "rouge2_precision": 0.21605717614093567, "rouge2_precision_stderr": 0.002540642134225938, "rouge2_recall": 0.17434524724384645, "rouge2_recall_stderr": 0.002112670905580596, "rougeL_fmeasure": 0.3015488704235409, "rougeL_fmeasure_stderr": 0.00208792846393685, "rougeL_precision": 0.357592025526917, "rougeL_precision_stderr": 0.0029334285195078027, "rougeL_recall": 0.29097647428250184, "rougeL_recall_stderr": 0.002366681346446512, "rougeLsum_fmeasure": 0.33923274789901076, "rougeLsum_fmeasure_stderr": 0.002346302825076595, "rougeLsum_precision": 0.4009694444861299, "rougeLsum_precision_stderr": 0.003222987271894662, "rougeLsum_recall": 0.3276179281858076, "rougeLsum_recall_stderr": 0.002657379832165628}}, "2": {"generate_text_restaurant": {"bleu": 11.608172890812009, "bleu_stderr": 0.12546448007454764, "rouge1_fmeasure": 0.43403646272310575, "rouge1_fmeasure_stderr": 0.0023707708307059527, "rouge1_precision": 0.5003601755604831, "rouge1_precision_stderr": 0.0033676872823734986, "rouge1_recall": 0.4224578333820757, "rouge1_recall_stderr": 0.0028045070268289812, "rouge2_fmeasure": 0.19532447735079128, "rouge2_fmeasure_stderr": 0.002033890919626222, "rouge2_precision": 0.22765934598915802, "rouge2_precision_stderr": 0.002592272575619685, "rouge2_recall": 0.19017170459485505, "rouge2_recall_stderr": 0.0021754234261874283, "rougeL_fmeasure": 0.3195774711632576, "rougeL_fmeasure_stderr": 0.002087385018113282, "rougeL_precision": 0.36920005590300864, "rougeL_precision_stderr": 0.002899070105347367, "rougeL_recall": 0.3114300325584566, "rougeL_recall_stderr": 0.002386844149919298, "rougeLsum_fmeasure": 0.36092995445305726, "rougeLsum_fmeasure_stderr": 0.0023382463299368317, "rougeLsum_precision": 0.41565591824610326, "rougeLsum_precision_stderr": 0.003147349719498411, "rougeLsum_recall": 0.351656400499525, "rougeLsum_recall_stderr": 0.002666360477811218}}, "3": {"generate_text_restaurant": {"bleu": 12.148832814244063, "bleu_stderr": 0.2164254842540946, "rouge1_fmeasure": 0.43959025401077473, "rouge1_fmeasure_stderr": 0.0023469842550572265, "rouge1_precision": 0.5008629024842906, "rouge1_precision_stderr": 0.0032839465674907808, "rouge1_recall": 0.4289465657143499, "rouge1_recall_stderr": 0.002777856343721575, "rouge2_fmeasure": 0.19990423001151691, "rouge2_fmeasure_stderr": 0.002037750197374315, "rouge2_precision": 0.22932657191685374, "rouge2_precision_stderr": 0.002528173782009391, "rouge2_recall": 0.19569435790052012, "rouge2_recall_stderr": 0.0021939311172643567, "rougeL_fmeasure": 0.3262013378339417, "rougeL_fmeasure_stderr": 0.002088667798905091, "rougeL_precision": 0.3717564783038866, "rougeL_precision_stderr": 0.002815724825172542, "rougeL_recall": 0.31887734903335685, "rougeL_recall_stderr": 0.002394042476762719, "rougeLsum_fmeasure": 0.3663962667016505, "rougeLsum_fmeasure_stderr": 0.002307416103414695, "rougeLsum_precision": 0.41698686055117057, "rougeLsum_precision_stderr": 0.0030578982044504274, "rougeLsum_recall": 0.35794312261047967, "rougeLsum_recall_stderr": 0.0026356000638634524}}, "4": {"generate_text_restaurant": {"bleu": 12.188725337520205, "bleu_stderr": 0.17040691934318267, "rouge1_fmeasure": 0.44097522244142057, "rouge1_fmeasure_stderr": 0.002334987306347262, "rouge1_precision": 0.502843542346716, "rouge1_precision_stderr": 0.00324447586790352, "rouge1_recall": 0.42990043605232275, "rouge1_recall_stderr": 0.002805781003187939, "rouge2_fmeasure": 0.20150390158816628, "rouge2_fmeasure_stderr": 0.0020384007824099408, "rouge2_precision": 0.23136872267930247, "rouge2_precision_stderr": 0.0025298808409077124, "rouge2_recall": 0.1973380939042856, "rouge2_recall_stderr": 0.002214525838202308, "rougeL_fmeasure": 0.327709572922888, "rougeL_fmeasure_stderr": 0.0021175190612403457, "rougeL_precision": 0.37410338028287227, "rougeL_precision_stderr": 0.002845203480442862, "rougeL_recall": 0.3200022925635597, "rougeL_recall_stderr": 0.0024526386808870855, "rougeLsum_fmeasure": 0.3674167528986386, "rougeLsum_fmeasure_stderr": 0.0023145102818660965, "rougeLsum_precision": 0.4186841470296783, "rougeLsum_precision_stderr": 0.0030553849862408862, "rougeLsum_recall": 0.35847541210008427, "rougeLsum_recall_stderr": 0.0026604144939812438}}, "5": {"generate_text_restaurant": {"bleu": 12.052100189721047, "bleu_stderr": 0.17360816368667822, "rouge1_fmeasure": 0.44146484161672395, "rouge1_fmeasure_stderr": 0.0023599489046435985, "rouge1_precision": 0.5019578287458755, "rouge1_precision_stderr": 0.0032272955767237103, "rouge1_recall": 0.42766415620933046, "rouge1_recall_stderr": 0.002744722668822767, "rouge2_fmeasure": 0.19983825989563073, "rouge2_fmeasure_stderr": 0.0020273573992455273, "rouge2_precision": 0.22902089530886868, "rouge2_precision_stderr": 0.0025011174263965337, "rouge2_recall": 0.19404225653307416, "rouge2_recall_stderr": 0.00215085924091555, "rougeL_fmeasure": 0.3281387348358797, "rougeL_fmeasure_stderr": 0.0021203465141945137, "rougeL_precision": 0.3738498200188826, "rougeL_precision_stderr": 0.002838412885924971, "rougeL_recall": 0.3182575623661255, "rougeL_recall_stderr": 0.0023900815036475517, "rougeLsum_fmeasure": 0.36738120300377797, "rougeLsum_fmeasure_stderr": 0.0023163926735012027, "rougeLsum_precision": 0.417786727400929, "rougeLsum_precision_stderr": 0.003045398532883966, "rougeLsum_recall": 0.3560753985580499, "rougeLsum_recall_stderr": 0.002599451532045705}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.33149272564263, "bleu_stderr": 0.07974232529299086, "rouge1_fmeasure": 0.1896955125144789, "rouge1_fmeasure_stderr": 0.002340734957979406, "rouge1_precision": 0.13847616804662938, "rouge1_precision_stderr": 0.0018041764637102017, "rouge1_recall": 0.31904415227123595, "rouge1_recall_stderr": 0.004027572489989212, "rouge2_fmeasure": 0.0350647220803466, "rouge2_fmeasure_stderr": 0.0012715374921812529, "rouge2_precision": 0.02527327375436006, "rouge2_precision_stderr": 0.0009222971597167988, "rouge2_recall": 0.06045335036044256, "rouge2_recall_stderr": 0.002236755769451187, "rougeL_fmeasure": 0.13729471996457188, "rougeL_fmeasure_stderr": 0.0017170183133137199, "rougeL_precision": 0.10009153837626086, "rougeL_precision_stderr": 0.0013137776399014384, "rougeL_recall": 0.23188685927978395, "rougeL_recall_stderr": 0.0030451674669926395, "rougeLsum_fmeasure": 0.15117279935922695, "rougeLsum_fmeasure_stderr": 0.0019096515187809484, "rougeLsum_precision": 0.11024472368549436, "rougeLsum_precision_stderr": 0.001462750379843851, "rougeLsum_recall": 0.25488971591019244, "rougeLsum_recall_stderr": 0.003324154120048496}}, "1": {"article_DOC_summary": {"bleu": 1.1046533969464718, "bleu_stderr": 0.053683410087760904, "rouge1_fmeasure": 0.18889972012376904, "rouge1_fmeasure_stderr": 0.002436900710118824, "rouge1_precision": 0.1500240974795399, "rouge1_precision_stderr": 0.002401446918020248, "rouge1_recall": 0.29519675212489427, "rouge1_recall_stderr": 0.00400603167240481, "rouge2_fmeasure": 0.032271789589786035, "rouge2_fmeasure_stderr": 0.001293116981841995, "rouge2_precision": 0.025406891183942397, "rouge2_precision_stderr": 0.0011229153052462752, "rouge2_recall": 0.05217227337690051, "rouge2_recall_stderr": 0.0021538130410213197, "rougeL_fmeasure": 0.13723442742418349, "rougeL_fmeasure_stderr": 0.0017965589357707824, "rougeL_precision": 0.10924063995276528, "rougeL_precision_stderr": 0.0018050333433786028, "rougeL_recall": 0.21467779315037425, "rougeL_recall_stderr": 0.0029621577935301045, "rougeLsum_fmeasure": 0.1492200983245774, "rougeLsum_fmeasure_stderr": 0.001928942301625471, "rougeLsum_precision": 0.11782664845951525, "rougeLsum_precision_stderr": 0.0018412590388832792, "rougeLsum_recall": 0.2355786664105277, "rougeLsum_recall_stderr": 0.0033433492088470215}}, "2": {"article_DOC_summary": {"bleu": 1.2080604500122374, "bleu_stderr": 0.10600976050797593, "rouge1_fmeasure": 0.19480821111333477, "rouge1_fmeasure_stderr": 0.002548447408227861, "rouge1_precision": 0.16192738299665674, "rouge1_precision_stderr": 0.002724764920181696, "rouge1_recall": 0.29136949862152195, "rouge1_recall_stderr": 0.003939866240417818, "rouge2_fmeasure": 0.033326130922902826, "rouge2_fmeasure_stderr": 0.0013703895012450048, "rouge2_precision": 0.02780170866977056, "rouge2_precision_stderr": 0.0012902491443052238, "rouge2_recall": 0.050657763179210766, "rouge2_recall_stderr": 0.002057586234641573, "rougeL_fmeasure": 0.14308346408293485, "rougeL_fmeasure_stderr": 0.0019084194795786107, "rougeL_precision": 0.11906174906039903, "rougeL_precision_stderr": 0.0020626179284994545, "rougeL_recall": 0.2147686682396263, "rougeL_recall_stderr": 0.002992971735313024, "rougeLsum_fmeasure": 0.15265351502550148, "rougeLsum_fmeasure_stderr": 0.0020343408387009567, "rougeLsum_precision": 0.12595270006500264, "rougeLsum_precision_stderr": 0.002089256582233344, "rougeLsum_recall": 0.23121752027673836, "rougeLsum_recall_stderr": 0.003357739806618804}}, "3": {"article_DOC_summary": {"bleu": 1.3354579627853345, "bleu_stderr": 0.0787551002497137, "rouge1_fmeasure": 0.18708591173867786, "rouge1_fmeasure_stderr": 0.0028078435646932145, "rouge1_precision": 0.16259516777001937, "rouge1_precision_stderr": 0.0030728200086154447, "rouge1_recall": 0.269289410465233, "rouge1_recall_stderr": 0.00428240074281679, "rouge2_fmeasure": 0.03405049278930678, "rouge2_fmeasure_stderr": 0.0014753683227482124, "rouge2_precision": 0.030072444877442605, "rouge2_precision_stderr": 0.001480222722747046, "rouge2_recall": 0.04913570989408663, "rouge2_recall_stderr": 0.0021386327088809545, "rougeL_fmeasure": 0.1363637318729216, "rougeL_fmeasure_stderr": 0.002077616686086357, "rougeL_precision": 0.11908382903479889, "rougeL_precision_stderr": 0.0023534660363485516, "rougeL_recall": 0.19655219494376092, "rougeL_recall_stderr": 0.0031758709154479396, "rougeLsum_fmeasure": 0.14547126169694422, "rougeLsum_fmeasure_stderr": 0.00221176726638642, "rougeLsum_precision": 0.1257413085245787, "rougeLsum_precision_stderr": 0.0023846570386209945, "rougeLsum_recall": 0.2121332309440453, "rougeLsum_recall_stderr": 0.0035883574015232677}}, "4": {"article_DOC_summary": {"bleu": 0.4649759375914046, "bleu_stderr": 0.11179707603605749, "rouge1_fmeasure": 0.05005387987888931, "rouge1_fmeasure_stderr": 0.0027866581571753205, "rouge1_precision": 0.04717115702117185, "rouge1_precision_stderr": 0.0027458547990898, "rouge1_recall": 0.06854685849970572, "rouge1_recall_stderr": 0.004034335498605078, "rouge2_fmeasure": 0.00886282988047597, "rouge2_fmeasure_stderr": 0.0008457399189072509, "rouge2_precision": 0.008039252239171141, "rouge2_precision_stderr": 0.0008177591347774397, "rouge2_recall": 0.012871828788783901, "rouge2_recall_stderr": 0.0012947257895368077, "rougeL_fmeasure": 0.037042998151035164, "rougeL_fmeasure_stderr": 0.0020607318812993535, "rougeL_precision": 0.03560333982666727, "rougeL_precision_stderr": 0.0021361702503072603, "rougeL_recall": 0.050735327935754304, "rougeL_recall_stderr": 0.003012711453091767, "rougeLsum_fmeasure": 0.0387408600461143, "rougeLsum_fmeasure_stderr": 0.0021593263516489917, "rougeLsum_precision": 0.03692496706333677, "rougeLsum_precision_stderr": 0.002192842113184995, "rougeLsum_recall": 0.05337510829486164, "rougeLsum_recall_stderr": 0.0031922745643679297}}, "5": {"article_DOC_summary": {"bleu": 1.5081996494898117e-23, "bleu_stderr": 3.560689042664534e-18, "rouge1_fmeasure": 0.0018881021406329268, "rouge1_fmeasure_stderr": 0.0005285584147199992, "rouge1_precision": 0.0016915297589821948, "rouge1_precision_stderr": 0.00048713143022114703, "rouge1_recall": 0.002562936486956815, "rouge1_recall_stderr": 0.0007574911873960877, "rouge2_fmeasure": 0.00019917211727136963, "rouge2_fmeasure_stderr": 8.609231087472974e-05, "rouge2_precision": 0.00016030467176019738, "rouge2_precision_stderr": 7.30332967873648e-05, "rouge2_recall": 0.0003171427458769009, "rouge2_recall_stderr": 0.00014300870446452385, "rougeL_fmeasure": 0.0015342199902319133, "rougeL_fmeasure_stderr": 0.0004241729776677898, "rougeL_precision": 0.0013520925095504633, "rougeL_precision_stderr": 0.00037757052914657154, "rougeL_recall": 0.0021249179272997214, "rougeL_recall_stderr": 0.0006316914645536129, "rougeLsum_fmeasure": 0.0014609154009965347, "rougeLsum_fmeasure_stderr": 0.0004094371256569771, "rougeLsum_precision": 0.0012885165827922138, "rougeLsum_precision_stderr": 0.000365613793335358, "rougeLsum_recall": 0.0020249663088333218, "rougeLsum_recall_stderr": 0.0006080031860077729}}}} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d91b1f5bc020d4e413a6dc8cf1e67336777a57ef --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2148865105675037, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002739851744730188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24425778319951902, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027078004695038444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.19860742023540653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019669822898704605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.047729902956845004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013778455296036068 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05132654178756232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013072982733870482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.042199560827095785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009984994712996336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.15746129883842613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021243165104716925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.178676546411626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020463162920555584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14373246146905902, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013823332704265676 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.201582658871862, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0025882580846911967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2298482831292535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025699798075345995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1862201529400756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018350815379350304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.285316312600506, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07318703724203124 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..fd9f46cb76717777e33e9b8d01cef9ca65a39049 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19123547521805817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0030573229896881894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2015864889648793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003002796407976632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1673664610062123, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022414488889316436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04120238523634148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013513476132338967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0425607593666781, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012769091257706392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.035103874107299336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009353728071065693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14158935872924355, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0024049999199382307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.14757071761127052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022268964035777597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12162349534045766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015927292615778431 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.17867034243558017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028775711712036097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.18884833907646806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028264748023701333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15630488036874124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002088106219230967 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.952652551469228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08453362226769602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c97f1a1b9e639d9a4ed5f1eddb1f3037fb1a0549 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06543156657179054, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002495986380075559 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.0672266515655875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002521383841330882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.055242233660189904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019491440817189802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.015701101053758496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010978015281121545 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.015657153764832166, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009749437105549986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.012417968712803235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007243737975246956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.049422424802369726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0019329518383636773 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.04994410733045717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001901335194922695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04087216675333205, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014372727294585936 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.061083176462803424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023412533367671035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06257047928104469, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023392750560332614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.0513939982614947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018096253619321696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.1432851499362598, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.018545752951036856 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f2cf4b8307dea91bfc8a7908aee5a633086047aa --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.01142443298407895, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001168348600049017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.010393653713974759, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0010577641578929015 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009113409021214056, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008812650320045362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002752646600307901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0004564370123056176 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0025596977500778532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003650435541065097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0021937463666516443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003047993992105857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.008694317655831221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009202143979895897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.007818331013501685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008075435288949127 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0067854192472234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006485341464909711 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01065083686424177, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011007522060213029 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009678223252509898, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0009885816669464665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008458807054678006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008168184748216672 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 6.950848606024447e-11, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.311478808939334e-10 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_3.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4dd2dae6fa93ba78c64781b3ec2d681ec2096f16 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.16259516777001937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0030728200086154447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.269289410465233, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00428240074281679 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18708591173867786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028078435646932145 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.030072444877442605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001480222722747046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.04913570989408663, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0021386327088809545 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03405049278930678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014753683227482124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11908382903479889, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0023534660363485516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19655219494376092, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0031758709154479396 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1363637318729216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002077616686086357 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1257413085245787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0023846570386209945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2121332309440453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035883574015232677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14547126169694422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00221176726638642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.3354579627853345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0787551002497137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_4.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..72e09939ab19e3623b2d8bfaf17d6eee6d9470f3 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04717115702117185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0027458547990898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06854685849970572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004034335498605078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05005387987888931, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027866581571753205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.008039252239171141, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008177591347774397 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.012871828788783901, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012947257895368077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00886282988047597, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0008457399189072509 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03560333982666727, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021361702503072603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.050735327935754304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003012711453091767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.037042998151035164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020607318812993535 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03692496706333677, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002192842113184995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.05337510829486164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031922745643679297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0387408600461143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021593263516489917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.4649759375914046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11179707603605749 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_5.json b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..787428e5518e925df8b4ba62b2c9548d4b029639 --- /dev/null +++ b/8b7178b4b/evaluation/generation/slim.8b7178b4b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0016915297589821948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00048713143022114703 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002562936486956815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007574911873960877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0018881021406329268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005285584147199992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00016030467176019738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 7.30332967873648e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0003171427458769009, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00014300870446452385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00019917211727136963, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 8.609231087472974e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0013520925095504633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00037757052914657154 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0021249179272997214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0006316914645536129 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0015342199902319133, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004241729776677898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0012885165827922138, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000365613793335358 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0020249663088333218, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006080031860077729 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0014609154009965347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004094371256569771 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5081996494898117e-23, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.560689042664534e-18 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b4b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 8, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_0.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..f69f4d34a80e91840cdc82e145f77772d38c3e41 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411249,0 +anli_r2,acc,0.346,0.015050266127564441,0 +anli_r3,acc,0.325,0.013526454480351018,0 +arc_challenge,acc,0.23464163822525597,0.012383873560768675,0 +arc_challenge,acc_norm,0.2593856655290102,0.012808273573927094,0 +arc_easy,acc,0.494949494949495,0.010259260102565858,0 +arc_easy,acc_norm,0.4515993265993266,0.010211600726405215,0 +boolq,acc,0.4883792048929664,0.008742692742551265,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2151416122004357,,1 +copa,acc,0.74,0.04408440022768077,0 +hellaswag,acc,0.39494124676359293,0.00487839022659172,0 +hellaswag,acc_norm,0.483469428400717,0.004987053652540279,0 +piqa,acc,0.6931447225244831,0.010760295070580359,0 +piqa,acc_norm,0.6893362350380848,0.01079707893372768,0 +rte,acc,0.4657039711191336,0.030025579819366426,0 +sciq,acc,0.69,0.014632638658632893,0 +sciq,acc_norm,0.633,0.015249378464171756,0 +storycloze_2016,acc,0.649919828968466,0.011030440255782963,0 +winogrande,acc,0.49329123914759276,0.014051220692330342,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json index e8d08e01cd7a9d14b8071e48a6511aeb223d0b40..c366b7e8968305170177c975cfea88e128ba6bfc 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_0.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.74, "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.39494124676359293, + "acc_stderr": 0.00487839022659172, + "acc_norm": 0.483469428400717, + "acc_norm_stderr": 0.004987053652540279 + }, + "rte": { + "acc": 0.4657039711191336, + "acc_stderr": 0.030025579819366426 + }, + "winogrande": { + "acc": 0.49329123914759276, + "acc_stderr": 0.014051220692330342 + }, + "storycloze_2016": { + "acc": 0.649919828968466, + "acc_stderr": 0.011030440255782963 + }, + "boolq": { + "acc": 0.4883792048929664, + "acc_stderr": 0.008742692742551265 + }, + "arc_easy": { + "acc": 0.494949494949495, + "acc_stderr": 0.010259260102565858, + "acc_norm": 0.4515993265993266, + "acc_norm_stderr": 0.010211600726405215 + }, + "arc_challenge": { + "acc": 0.23464163822525597, + "acc_stderr": 0.012383873560768675, + "acc_norm": 0.2593856655290102, + "acc_norm_stderr": 0.012808273573927094 + }, + "sciq": { + "acc": 0.69, + "acc_stderr": 0.014632638658632893, + "acc_norm": 0.633, + "acc_norm_stderr": 0.015249378464171756 + }, + "piqa": { + "acc": 0.6931447225244831, + "acc_stderr": 0.010760295070580359, + "acc_norm": 0.6893362350380848, + "acc_norm_stderr": 0.01079707893372768 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-16-16-30-48_0shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-16-16-30-48_0shots_backup.json deleted file mode 100644 index e8d08e01cd7a9d14b8071e48a6511aeb223d0b40..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_0_lm-eval_global_step84877_2023-05-16-16-30-48_0shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411249 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564441 - }, - "anli_r3": { - "acc": 0.325, - "acc_stderr": 0.013526454480351018 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2151416122004357 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768077 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_1.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..e8f987ab1ec352dd98bd039c48f5da888950eed9 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.322,0.01478291360099667,0 +anli_r2,acc,0.35,0.015090650341444233,0 +anli_r3,acc,0.3308333333333333,0.013588208070708997,0 +arc_challenge,acc,0.24573378839590443,0.01258103345373011,0 +arc_challenge,acc_norm,0.2841296928327645,0.013179442447653887,0 +arc_easy,acc,0.494949494949495,0.01025926010256586,0 +arc_easy,acc_norm,0.48358585858585856,0.0102542535659293,0 +boolq,acc,0.5207951070336392,0.008737488341370728,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.289480167528948,,1 +copa,acc,0.74,0.04408440022768077,0 +hellaswag,acc,0.39384584744074885,0.004876028037941944,0 +hellaswag,acc_norm,0.484564827723561,0.004987403268345011,0 +piqa,acc,0.6855277475516867,0.010833009065106574,0 +piqa,acc_norm,0.6877040261153428,0.010812581599154424,0 +rte,acc,0.4620938628158845,0.030009848912529117,0 +sciq,acc,0.732,0.014013292702729486,0 +sciq,acc_norm,0.704,0.014442734941575018,0 +storycloze_2016,acc,0.6536611437733832,0.01100287402644642,0 +winogrande,acc,0.49329123914759276,0.014051220692330342,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json index 2c56437cac7e0fe5435e6972fe39d38ccfff2ca3..466cfd257204280364e025c095c7e7df4351d9ec 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_1.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.74, "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.39384584744074885, + "acc_stderr": 0.004876028037941944, + "acc_norm": 0.484564827723561, + "acc_norm_stderr": 0.004987403268345011 + }, + "rte": { + "acc": 0.4620938628158845, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.49329123914759276, + "acc_stderr": 0.014051220692330342 + }, + "storycloze_2016": { + "acc": 0.6536611437733832, + "acc_stderr": 0.01100287402644642 + }, + "boolq": { + "acc": 0.5207951070336392, + "acc_stderr": 0.008737488341370728 + }, + "arc_easy": { + "acc": 0.494949494949495, + "acc_stderr": 0.01025926010256586, + "acc_norm": 0.48358585858585856, + "acc_norm_stderr": 0.0102542535659293 + }, + "arc_challenge": { + "acc": 0.24573378839590443, + "acc_stderr": 0.01258103345373011, + "acc_norm": 0.2841296928327645, + "acc_norm_stderr": 0.013179442447653887 + }, + "sciq": { + "acc": 0.732, + "acc_stderr": 0.014013292702729486, + "acc_norm": 0.704, + "acc_norm_stderr": 0.014442734941575018 + }, + "piqa": { + "acc": 0.6855277475516867, + "acc_stderr": 0.010833009065106574, + "acc_norm": 0.6877040261153428, + "acc_norm_stderr": 0.010812581599154424 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-16-16-30-48_1shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-16-16-30-48_1shots_backup.json deleted file mode 100644 index 2c56437cac7e0fe5435e6972fe39d38ccfff2ca3..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_1_lm-eval_global_step84877_2023-05-16-16-30-48_1shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.322, - "acc_stderr": 0.01478291360099667 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070708997 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.289480167528948 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768077 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_2.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..3c601d9173ce0a769dbe92f3f36910b5ccde6963 --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095526,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.31833333333333336,0.013452948996996292,0 +arc_challenge,acc,0.24744027303754265,0.012610352663292673,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601341,0 +arc_easy,acc,0.4970538720538721,0.010259605416237575,0 +arc_easy,acc_norm,0.4802188552188552,0.010251751199542726,0 +boolq,acc,0.5137614678899083,0.00874174210687866,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.291852955787382,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.39822744473212507,0.004885323175701679,0 +hellaswag,acc_norm,0.4847639912368054,0.004987464257999317,0 +piqa,acc,0.6931447225244831,0.01076029507058036,0 +piqa,acc_norm,0.6996735582154516,0.010695225308183138,0 +rte,acc,0.4693140794223827,0.03003973059219781,0 +sciq,acc,0.732,0.014013292702729486,0 +sciq,acc_norm,0.708,0.014385511563477343,0 +storycloze_2016,acc,0.6435061464457509,0.011075964871050996,0 +winogrande,acc,0.4909234411996843,0.014050170094497697,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json index 0d48ab11124611d0def01a35abc7f1138440020b..666a40df7e1846ca96d40da3666b2dc9c8b9e1d7 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_2.json @@ -7,10 +7,81 @@ "anli_r2": { "acc": 0.34, "acc_stderr": 0.014987482264363937 + }, + "anli_r3": { + "acc": 0.31833333333333336, + "acc_stderr": 0.013452948996996292 + }, + "cb": { + "acc": 0.42857142857142855, + "acc_stderr": 0.06672848092813058, + "f1": 0.291852955787382 + }, + "copa": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316 + }, + "hellaswag": { + "acc": 0.39822744473212507, + "acc_stderr": 0.004885323175701679, + "acc_norm": 0.4847639912368054, + "acc_norm_stderr": 0.004987464257999317 + }, + "rte": { + "acc": 0.4693140794223827, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.4909234411996843, + "acc_stderr": 0.014050170094497697 + }, + "storycloze_2016": { + "acc": 0.6435061464457509, + "acc_stderr": 0.011075964871050996 + }, + "boolq": { + "acc": 0.5137614678899083, + "acc_stderr": 0.00874174210687866 + }, + "arc_easy": { + "acc": 0.4970538720538721, + "acc_stderr": 0.010259605416237575, + "acc_norm": 0.4802188552188552, + "acc_norm_stderr": 0.010251751199542726 + }, + "arc_challenge": { + "acc": 0.24744027303754265, + "acc_stderr": 0.012610352663292673, + "acc_norm": 0.2790102389078498, + "acc_norm_stderr": 0.013106784883601341 + }, + "sciq": { + "acc": 0.732, + "acc_stderr": 0.014013292702729486, + "acc_norm": 0.708, + "acc_norm_stderr": 0.014385511563477343 + }, + "piqa": { + "acc": 0.6931447225244831, + "acc_stderr": 0.01076029507058036, + "acc_norm": 0.6996735582154516, + "acc_norm_stderr": 0.010695225308183138 } }, "versions": { "anli_r1": 0, - "anli_r2": 0 + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-16-16-30-48_2shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-16-16-30-48_2shots_backup.json deleted file mode 100644 index 0d48ab11124611d0def01a35abc7f1138440020b..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_2_lm-eval_global_step84877_2023-05-16-16-30-48_2shots_backup.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0 - } -} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_3.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..09e036749307562cc8c8850af5b8bfb9dbeca64c --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.014876872027456729,0 +anli_r2,acc,0.323,0.014794927843348639,0 +anli_r3,acc,0.3375,0.013655897185463653,0 +arc_challenge,acc,0.2354948805460751,0.012399451855004748,0 +arc_challenge,acc_norm,0.257679180887372,0.012780770562768409,0 +arc_easy,acc,0.49284511784511786,0.010258733022446368,0 +arc_easy,acc_norm,0.4823232323232323,0.01025336980569897,0 +boolq,acc,0.5314984709480123,0.00872768484861531,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.33534439416792355,,1 +copa,acc,0.68,0.046882617226215034,0 +hellaswag,acc,0.3972316271659032,0.004883246579496658,0 +hellaswag,acc_norm,0.48028281218880703,0.004985900172317692,0 +piqa,acc,0.6877040261153428,0.010812581599154424,0 +piqa,acc_norm,0.6964091403699674,0.010728079893076364,0 +rte,acc,0.4548736462093863,0.029973636495415252,0 +sciq,acc,0.743,0.013825416526895033,0 +sciq,acc_norm,0.71,0.014356395999905684,0 +storycloze_2016,acc,0.6376269374665954,0.011115793699210296,0 +winogrande,acc,0.494869771112865,0.014051745961790513,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json index af9dcec64f4b7a720594ff8e16ddfd07747da94d..784b45fdeada4cf06683621bf3fbe864a00fff4a 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_3.json @@ -3,9 +3,85 @@ "anli_r1": { "acc": 0.33, "acc_stderr": 0.014876872027456729 + }, + "anli_r2": { + "acc": 0.323, + "acc_stderr": 0.014794927843348639 + }, + "anli_r3": { + "acc": 0.3375, + "acc_stderr": 0.013655897185463653 + }, + "cb": { + "acc": 0.5178571428571429, + "acc_stderr": 0.06737697508644647, + "f1": 0.33534439416792355 + }, + "copa": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034 + }, + "hellaswag": { + "acc": 0.3972316271659032, + "acc_stderr": 0.004883246579496658, + "acc_norm": 0.48028281218880703, + "acc_norm_stderr": 0.004985900172317692 + }, + "rte": { + "acc": 0.4548736462093863, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.494869771112865, + "acc_stderr": 0.014051745961790513 + }, + "storycloze_2016": { + "acc": 0.6376269374665954, + "acc_stderr": 0.011115793699210296 + }, + "boolq": { + "acc": 0.5314984709480123, + "acc_stderr": 0.00872768484861531 + }, + "arc_easy": { + "acc": 0.49284511784511786, + "acc_stderr": 0.010258733022446368, + "acc_norm": 0.4823232323232323, + "acc_norm_stderr": 0.01025336980569897 + }, + "arc_challenge": { + "acc": 0.2354948805460751, + "acc_stderr": 0.012399451855004748, + "acc_norm": 0.257679180887372, + "acc_norm_stderr": 0.012780770562768409 + }, + "sciq": { + "acc": 0.743, + "acc_stderr": 0.013825416526895033, + "acc_norm": 0.71, + "acc_norm_stderr": 0.014356395999905684 + }, + "piqa": { + "acc": 0.6877040261153428, + "acc_stderr": 0.010812581599154424, + "acc_norm": 0.6964091403699674, + "acc_norm_stderr": 0.010728079893076364 } }, "versions": { - "anli_r1": 0 + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-16-16-30-48_3shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-16-16-30-48_3shots_backup.json deleted file mode 100644 index af9dcec64f4b7a720594ff8e16ddfd07747da94d..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_3_lm-eval_global_step84877_2023-05-16-16-30-48_3shots_backup.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.014876872027456729 - } - }, - "versions": { - "anli_r1": 0 - } -} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_4.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..b82edadb6cee5d2afbf2e94bcb3cc6f165dbff0b --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.01487687202745673,0 +anli_r2,acc,0.337,0.014955087918653612,0 +anli_r3,acc,0.3416666666666667,0.013696658778002512,0 +arc_challenge,acc,0.24232081911262798,0.012521593295800115,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5012626262626263,0.010259750807991068,0 +arc_easy,acc_norm,0.502104377104377,0.010259692651537035,0 +boolq,acc,0.5443425076452599,0.00871059702108126,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.3263888888888889,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.39822744473212507,0.004885323175701676,0 +hellaswag,acc_norm,0.4894443337980482,0.004988669343786957,0 +piqa,acc,0.676822633297062,0.01091197412428213,0 +piqa,acc_norm,0.6833514689880305,0.010853160531978484,0 +rte,acc,0.44404332129963897,0.029907396333795994,0 +sciq,acc,0.749,0.01371813351688892,0 +sciq,acc_norm,0.74,0.013877773329774166,0 +storycloze_2016,acc,0.6419027258150721,0.011087006809925712,0 +winogrande,acc,0.5130228887134964,0.014047718393997667,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json index 78bf4e69f65866891bc6e9e356f89b224690c5a0..1bd7c75761dae3843de4aebd47c3c7c3f91acff7 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_4.json @@ -3,9 +3,85 @@ "anli_r1": { "acc": 0.33, "acc_stderr": 0.01487687202745673 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653612 + }, + "anli_r3": { + "acc": 0.3416666666666667, + "acc_stderr": 0.013696658778002512 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.3263888888888889 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.39822744473212507, + "acc_stderr": 0.004885323175701676, + "acc_norm": 0.4894443337980482, + "acc_norm_stderr": 0.004988669343786957 + }, + "rte": { + "acc": 0.44404332129963897, + "acc_stderr": 0.029907396333795994 + }, + "winogrande": { + "acc": 0.5130228887134964, + "acc_stderr": 0.014047718393997667 + }, + "storycloze_2016": { + "acc": 0.6419027258150721, + "acc_stderr": 0.011087006809925712 + }, + "boolq": { + "acc": 0.5443425076452599, + "acc_stderr": 0.00871059702108126 + }, + "arc_easy": { + "acc": 0.5012626262626263, + "acc_stderr": 0.010259750807991068, + "acc_norm": 0.502104377104377, + "acc_norm_stderr": 0.010259692651537035 + }, + "arc_challenge": { + "acc": 0.24232081911262798, + "acc_stderr": 0.012521593295800115, + "acc_norm": 0.27559726962457337, + "acc_norm_stderr": 0.013057169655761838 + }, + "sciq": { + "acc": 0.749, + "acc_stderr": 0.01371813351688892, + "acc_norm": 0.74, + "acc_norm_stderr": 0.013877773329774166 + }, + "piqa": { + "acc": 0.676822633297062, + "acc_stderr": 0.01091197412428213, + "acc_norm": 0.6833514689880305, + "acc_norm_stderr": 0.010853160531978484 } }, "versions": { - "anli_r1": 0 + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-16-16-30-48_4shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-16-16-30-48_4shots_backup.json deleted file mode 100644 index 78bf4e69f65866891bc6e9e356f89b224690c5a0..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_4_lm-eval_global_step84877_2023-05-16-16-30-48_4shots_backup.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - } - }, - "versions": { - "anli_r1": 0 - } -} \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_5.csv b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..9d92afd466cbd4796cd5ad7a4ede6eb32317a15c --- /dev/null +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.348,0.015070604603768408,0 +anli_r2,acc,0.325,0.014818724459095526,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378607,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601341,0 +arc_easy,acc,0.5058922558922558,0.010259071083844221,0 +arc_easy,acc_norm,0.4970538720538721,0.010259605416237574,0 +boolq,acc,0.5235474006116208,0.008735351675636606,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3080848777867311,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.3981278629755029,0.004885116465550268,0 +hellaswag,acc_norm,0.4869547898824935,0.00498808282521327,0 +piqa,acc,0.6735582154515778,0.0109404670461773,0 +piqa,acc_norm,0.6833514689880305,0.010853160531978484,0 +rte,acc,0.4693140794223827,0.030039730592197812,0 +sciq,acc,0.761,0.013493000446937594,0 +sciq,acc_norm,0.745,0.013790038620872833,0 +storycloze_2016,acc,0.6451095670764297,0.01106478765990412,0 +winogrande,acc,0.4980268350434096,0.01405237625922564,0 diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json index 1f412fc6531f0989861223f8b27959c5860067d2..6b5445473802156607ffa75b9e9423a40b9be394 100644 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json +++ b/8b7178b4b/evaluation/rankeval/8b7178b4b_5.json @@ -3,9 +3,85 @@ "anli_r1": { "acc": 0.348, "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.3433333333333333, + "acc_stderr": 0.01371263383046586 + }, + "cb": { + "acc": 0.5, + "acc_stderr": 0.06741998624632421, + "f1": 0.3080848777867311 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.3981278629755029, + "acc_stderr": 0.004885116465550268, + "acc_norm": 0.4869547898824935, + "acc_norm_stderr": 0.00498808282521327 + }, + "rte": { + "acc": 0.4693140794223827, + "acc_stderr": 0.030039730592197812 + }, + "winogrande": { + "acc": 0.4980268350434096, + "acc_stderr": 0.01405237625922564 + }, + "storycloze_2016": { + "acc": 0.6451095670764297, + "acc_stderr": 0.01106478765990412 + }, + "boolq": { + "acc": 0.5235474006116208, + "acc_stderr": 0.008735351675636606 + }, + "arc_easy": { + "acc": 0.5058922558922558, + "acc_stderr": 0.010259071083844221, + "acc_norm": 0.4970538720538721, + "acc_norm_stderr": 0.010259605416237574 + }, + "arc_challenge": { + "acc": 0.25341296928327645, + "acc_stderr": 0.012710896778378607, + "acc_norm": 0.2790102389078498, + "acc_norm_stderr": 0.013106784883601341 + }, + "sciq": { + "acc": 0.761, + "acc_stderr": 0.013493000446937594, + "acc_norm": 0.745, + "acc_norm_stderr": 0.013790038620872833 + }, + "piqa": { + "acc": 0.6735582154515778, + "acc_stderr": 0.0109404670461773, + "acc_norm": 0.6833514689880305, + "acc_norm_stderr": 0.010853160531978484 } }, "versions": { - "anli_r1": 0 + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-16-16-30-48_5shots_backup.json b/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-16-16-30-48_5shots_backup.json deleted file mode 100644 index 1f412fc6531f0989861223f8b27959c5860067d2..0000000000000000000000000000000000000000 --- a/8b7178b4b/evaluation/rankeval/8b7178b4b_5_lm-eval_global_step84877_2023-05-16-16-30-48_5shots_backup.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.348, - "acc_stderr": 0.015070604603768408 - } - }, - "versions": { - "anli_r1": 0 - } -} \ No newline at end of file diff --git a/8b7178b58b/evaluation/8b7178b58b_1_babi.json b/8b7178b58b/evaluation/8b7178b58b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..6f674964e27933fbbe17bcccfaaa3a020cc0d126 --- /dev/null +++ b/8b7178b58b/evaluation/8b7178b58b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.11066666666666666, + "em_stderr": 0.0057286523433593035 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b58b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/8b7178b58b_2_babi.json b/8b7178b58b/evaluation/8b7178b58b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..b845903f6d24827c0b492a671627add1ac14a4e8 --- /dev/null +++ b/8b7178b58b/evaluation/8b7178b58b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.241, + "em_stderr": 0.00780982273232697 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b58b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/8b7178b58b_3_babi.json b/8b7178b58b/evaluation/8b7178b58b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..7e3ca905655453f550457c982078fffc92d4ea86 --- /dev/null +++ b/8b7178b58b/evaluation/8b7178b58b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.2926666666666667, + "em_stderr": 0.00830826729216739 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b58b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/8b7178b58b_4_babi.json b/8b7178b58b/evaluation/8b7178b58b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e4a5735146749b53b2c066669acf590b811da0 --- /dev/null +++ b/8b7178b58b/evaluation/8b7178b58b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.323, + "em_stderr": 0.0085390068633735 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b58b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/8b7178b58b_5_babi.json b/8b7178b58b/evaluation/8b7178b58b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c0f1091ed76ea1c9588fc174dc8855d8013625 --- /dev/null +++ b/8b7178b58b/evaluation/8b7178b58b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.3363333333333333, + "em_stderr": 0.008627236935801566 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b58b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/8b7178b88b_1_babi.json b/8b7178b88b/evaluation/8b7178b88b_1_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..e78cdb6a5b642b5aea8063138236abe3bb199b98 --- /dev/null +++ b/8b7178b88b/evaluation/8b7178b88b_1_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.104, + "em_stderr": 0.005574198647655273 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers", + "num_fewshot": 1, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/8b7178b88b_2_babi.json b/8b7178b88b/evaluation/8b7178b88b_2_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..5056fe9894033915091395c75842444b4ee44164 --- /dev/null +++ b/8b7178b88b/evaluation/8b7178b88b_2_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.22033333333333333, + "em_stderr": 0.007568439663816854 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers", + "num_fewshot": 2, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/8b7178b88b_3_babi.json b/8b7178b88b/evaluation/8b7178b88b_3_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..2db803c0ded66215d8921320be3805ecde1f5968 --- /dev/null +++ b/8b7178b88b/evaluation/8b7178b88b_3_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.271, + "em_stderr": 0.008116338972679679 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers", + "num_fewshot": 3, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/8b7178b88b_4_babi.json b/8b7178b88b/evaluation/8b7178b88b_4_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..112df709cd3c60c9758960d71e53f61ff17fe6fb --- /dev/null +++ b/8b7178b88b/evaluation/8b7178b88b_4_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.31266666666666665, + "em_stderr": 0.008465181264372679 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers", + "num_fewshot": 4, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/8b7178b88b_5_babi.json b/8b7178b88b/evaluation/8b7178b88b_5_babi.json new file mode 100644 index 0000000000000000000000000000000000000000..deb6593e674fa2255eaa16a31edd834c9014e7d6 --- /dev/null +++ b/8b7178b88b/evaluation/8b7178b88b_5_babi.json @@ -0,0 +1,22 @@ +{ + "results": { + "babi": { + "em": 0.326, + "em_stderr": 0.008559541766458567 + } + }, + "versions": { + "babi": 0 + }, + "config": { + "model": "gpt2", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers", + "num_fewshot": 5, + "batch_size": null, + "device": null, + "no_cache": true, + "limit": 3000, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file