laiviet committed on
Commit
d1253a8
·
1 Parent(s): 82b9918

Init commit with simple table

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +110 -0
  2. evals/arc-challenge/arc_ar_challenge_bloom-1b7.json +23 -0
  3. evals/arc-challenge/arc_ar_challenge_bloom-560.json +23 -0
  4. evals/arc-challenge/arc_ar_challenge_bloom-7b1.json +23 -0
  5. evals/arc-challenge/arc_ar_challenge_gpt2-large.json +23 -0
  6. evals/arc-challenge/arc_ar_challenge_gpt2-medium.json +23 -0
  7. evals/arc-challenge/arc_ar_challenge_gpt2.json +23 -0
  8. evals/arc-challenge/arc_ar_challenge_llama-7B.json +23 -0
  9. evals/arc-challenge/arc_bn_challenge_bloom-1b7.json +23 -0
  10. evals/arc-challenge/arc_bn_challenge_bloom-560.json +23 -0
  11. evals/arc-challenge/arc_bn_challenge_bloom-7b1.json +23 -0
  12. evals/arc-challenge/arc_bn_challenge_gpt2-large.json +23 -0
  13. evals/arc-challenge/arc_bn_challenge_gpt2-medium.json +23 -0
  14. evals/arc-challenge/arc_bn_challenge_gpt2.json +23 -0
  15. evals/arc-challenge/arc_bn_challenge_llama-7B.json +23 -0
  16. evals/arc-challenge/arc_ca_challenge_bloom-1b7.json +23 -0
  17. evals/arc-challenge/arc_ca_challenge_bloom-560.json +23 -0
  18. evals/arc-challenge/arc_ca_challenge_bloom-7b1.json +23 -0
  19. evals/arc-challenge/arc_ca_challenge_gpt2-large.json +23 -0
  20. evals/arc-challenge/arc_ca_challenge_gpt2-medium.json +23 -0
  21. evals/arc-challenge/arc_ca_challenge_gpt2.json +23 -0
  22. evals/arc-challenge/arc_ca_challenge_llama-7B.json +23 -0
  23. evals/arc-challenge/arc_da_challenge_bloom-1b7.json +23 -0
  24. evals/arc-challenge/arc_da_challenge_bloom-560.json +23 -0
  25. evals/arc-challenge/arc_da_challenge_bloom-7b1.json +23 -0
  26. evals/arc-challenge/arc_da_challenge_gpt2-large.json +23 -0
  27. evals/arc-challenge/arc_da_challenge_gpt2-medium.json +23 -0
  28. evals/arc-challenge/arc_da_challenge_gpt2.json +23 -0
  29. evals/arc-challenge/arc_da_challenge_llama-7B.json +23 -0
  30. evals/arc-challenge/arc_de_challenge_bloom-1b7.json +23 -0
  31. evals/arc-challenge/arc_de_challenge_bloom-560.json +23 -0
  32. evals/arc-challenge/arc_de_challenge_bloom-7b1.json +23 -0
  33. evals/arc-challenge/arc_de_challenge_gpt2-large.json +23 -0
  34. evals/arc-challenge/arc_de_challenge_gpt2-medium.json +23 -0
  35. evals/arc-challenge/arc_de_challenge_gpt2.json +23 -0
  36. evals/arc-challenge/arc_de_challenge_llama-7B.json +23 -0
  37. evals/arc-challenge/arc_es_challenge_bloom-1b7.json +23 -0
  38. evals/arc-challenge/arc_es_challenge_bloom-560.json +23 -0
  39. evals/arc-challenge/arc_es_challenge_bloom-7b1.json +23 -0
  40. evals/arc-challenge/arc_es_challenge_gpt2-large.json +23 -0
  41. evals/arc-challenge/arc_es_challenge_gpt2-medium.json +23 -0
  42. evals/arc-challenge/arc_es_challenge_gpt2.json +23 -0
  43. evals/arc-challenge/arc_es_challenge_llama-7B.json +23 -0
  44. evals/arc-challenge/arc_eu_challenge_bloom-1b7.json +23 -0
  45. evals/arc-challenge/arc_eu_challenge_bloom-560.json +23 -0
  46. evals/arc-challenge/arc_eu_challenge_bloom-7b1.json +23 -0
  47. evals/arc-challenge/arc_eu_challenge_gpt2-large.json +23 -0
  48. evals/arc-challenge/arc_eu_challenge_gpt2-medium.json +23 -0
  49. evals/arc-challenge/arc_eu_challenge_gpt2.json +23 -0
  50. evals/arc-challenge/arc_eu_challenge_llama-7B.json +23 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ from collections import defaultdict
5
+ import gradio as gr
6
+
7
+ import glob
8
+
9
+ ARC = "arc_challenge"
10
+ HELLASWAG = "hellaswag"
11
+ MMLU = "mmlu"
12
+ TRUTHFULQA = "truthfulqa-mc"
13
+ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
14
+
15
+ METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
16
+
17
+
18
+ def collect_results():
19
+ performance_dict = defaultdict(dict)
20
+ pretrained_models = set()
21
+ for file in glob.glob('evals/*/*.json'):
22
+ with open(file, 'r') as f:
23
+ data = json.load(f)
24
+ if 'results' not in data:
25
+ continue
26
+ if 'config' not in data:
27
+ continue
28
+ results = data['results']
29
+ config = data['config']
30
+ if 'model_args' not in config:
31
+ continue
32
+
33
+ model_args = config['model_args'].split(',')
34
+ pretrained = [x for x in model_args if x.startswith('pretrained=')]
35
+ if len(pretrained) != 1:
36
+ continue
37
+ pretrained = pretrained[0].split('=')[1]
38
+ pretrained = pretrained.split('/')[-1]
39
+ pretrained_models.add(pretrained)
40
+
41
+ for lang_task, perfs in results.items():
42
+ if lang_task.startswith('arc_') and lang_task.endswith('_challenge'):
43
+ lang = lang_task.split('_')[1]
44
+ task = ARC
45
+ elif lang_task.startswith('hellaswag_'):
46
+ _, lang = lang_task.split('_')
47
+ task = HELLASWAG
48
+ elif lang_task.startswith('mmlu_'):
49
+ _, lang = lang_task.split('_')
50
+ task = MMLU
51
+ elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'):
52
+ lang = lang_task.split('_')[1]
53
+ task = TRUTHFULQA
54
+
55
+ if lang and task:
56
+ metric = METRICS[BENCHMARKS.index(task)]
57
+ p = round(perfs[metric] * 100, 1)
58
+ performance_dict[(pretrained, lang)][task] = p
59
+ return performance_dict, pretrained_models
60
+
61
+
62
def get_leaderboard_df(performance_dict, pretrained_models):
    """Turn the per-(model, language) score table into leaderboard rows.

    Each row is ``[model, language, average, arc, hellaswag, mmlu,
    truthfulqa]``. A (model, language) pair is left out when any of the four
    benchmark scores is absent or equal to zero. ``pretrained_models`` is
    accepted but not used here.
    """
    rows = []
    for (model_name, language), scores in performance_dict.items():
        arc, hellaswag, mmlu, truthfulqa = (
            scores.get(benchmark, 0.0)
            for benchmark in (ARC, HELLASWAG, MMLU, TRUTHFULQA)
        )

        # A zero product means at least one benchmark is missing (defaulted
        # to 0.0), so the average would not be comparable across rows.
        if arc * hellaswag * mmlu * truthfulqa == 0:
            continue

        mean_score = round((arc + hellaswag + mmlu + truthfulqa) / 4, 1)
        rows.append(
            [model_name, language, mean_score, arc, hellaswag, mmlu, truthfulqa]
        )
    return rows
76
+
77
+
78
# Column headers of the leaderboard table. The MMLU header previously carried
# a stray ")" and, like the HellaSwag header, an invisible U+FE0F character.
MODEL_COL = "Model"
LANG_COL = "Language"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"

COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# Datatype of each column, in the same order as COLS.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
88
+
89
# Read all evaluation results once at start-up and flatten them into rows.
args = collect_results()
leaderboard_df = get_leaderboard_df(*args)

# Page layout: title, introduction, a search box and the results table.
with gr.Blocks() as demo:
    gr.HTML('Open Multilingual Large Language Model Evaluation Leaderboard')
    gr.Markdown('INTRODUCTION TEXT', elem_classes="markdown-text")

    with gr.Box():
        # NOTE(review): the search bar is created but no event handler is
        # attached to it in this block.
        search_bar = gr.Textbox(
            placeholder="Search models...",
            show_label=False,
            elem_id="search-bar",
        )

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df,
        headers=COLS,
        datatype=TYPES,
        max_rows=5,
        elem_id="leaderboard-table",
    )

demo.launch()
evals/arc-challenge/arc_ar_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.22818791946308725,
5
+ "acc_stderr": 0.02435139725761051,
6
+ "acc_norm": 0.2516778523489933,
7
+ "acc_norm_stderr": 0.025181904610615872
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.2550335570469799,
5
+ "acc_stderr": 0.025292327380712708,
6
+ "acc_norm": 0.2550335570469799,
7
+ "acc_norm_stderr": 0.025292327380712708
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.28187919463087246,
5
+ "acc_stderr": 0.026106703750007426,
6
+ "acc_norm": 0.3087248322147651,
7
+ "acc_norm_stderr": 0.026806063072940547
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.20134228187919462,
5
+ "acc_stderr": 0.023268565767685306,
6
+ "acc_norm": 0.21476510067114093,
7
+ "acc_norm_stderr": 0.023828868848284352
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.19463087248322147,
5
+ "acc_stderr": 0.022973392306598162,
6
+ "acc_norm": 0.21140939597315436,
7
+ "acc_norm_stderr": 0.02369243605357901
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.20134228187919462,
5
+ "acc_stderr": 0.023268565767685313,
6
+ "acc_norm": 0.22483221476510068,
7
+ "acc_norm_stderr": 0.024224169829650755
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ar_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ar_challenge": {
4
+ "acc": 0.22483221476510068,
5
+ "acc_stderr": 0.02422416982965075,
6
+ "acc_norm": 0.24161073825503357,
7
+ "acc_norm_stderr": 0.024838535108028477
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ar_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.20945945945945946,
5
+ "acc_stderr": 0.023691963473475724,
6
+ "acc_norm": 0.2533783783783784,
7
+ "acc_norm_stderr": 0.025323518629100008
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.22972972972972974,
5
+ "acc_stderr": 0.024491712953916975,
6
+ "acc_norm": 0.24662162162162163,
7
+ "acc_norm_stderr": 0.025096383517594287
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.23986486486486486,
5
+ "acc_stderr": 0.02486094967084638,
6
+ "acc_norm": 0.28040540540540543,
7
+ "acc_norm_stderr": 0.026153277917823237
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.2195945945945946,
5
+ "acc_stderr": 0.024102381106046785,
6
+ "acc_norm": 0.2668918918918919,
7
+ "acc_norm_stderr": 0.025753762926257924
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.20608108108108109,
5
+ "acc_stderr": 0.02355028295929425,
6
+ "acc_norm": 0.24662162162162163,
7
+ "acc_norm_stderr": 0.02509638351759427
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.22635135135135134,
5
+ "acc_stderr": 0.024364215012920555,
6
+ "acc_norm": 0.2668918918918919,
7
+ "acc_norm_stderr": 0.025753762926257917
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_bn_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_bn_challenge": {
4
+ "acc": 0.22635135135135134,
5
+ "acc_stderr": 0.024364215012920565,
6
+ "acc_norm": 0.26013513513513514,
7
+ "acc_norm_stderr": 0.02554257639364025
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_bn_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.2356902356902357,
5
+ "acc_stderr": 0.02466946003490763,
6
+ "acc_norm": 0.27946127946127947,
7
+ "acc_norm_stderr": 0.026082164400369843
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.2053872053872054,
5
+ "acc_stderr": 0.02348110951859932,
6
+ "acc_norm": 0.23232323232323232,
7
+ "acc_norm_stderr": 0.02454650495612789
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.3164983164983165,
5
+ "acc_stderr": 0.02703395838420779,
6
+ "acc_norm": 0.3434343434343434,
7
+ "acc_norm_stderr": 0.0276003816062635
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.20875420875420875,
5
+ "acc_stderr": 0.02362258775627148,
6
+ "acc_norm": 0.22895622895622897,
7
+ "acc_norm_stderr": 0.02442136264227106
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.20875420875420875,
5
+ "acc_stderr": 0.023622587756271473,
6
+ "acc_norm": 0.21212121212121213,
7
+ "acc_norm_stderr": 0.023761611918761673
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.21885521885521886,
5
+ "acc_stderr": 0.024032467624412215,
6
+ "acc_norm": 0.21885521885521886,
7
+ "acc_norm_stderr": 0.02403246762441221
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_ca_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_ca_challenge": {
4
+ "acc": 0.29292929292929293,
5
+ "acc_stderr": 0.026452514969665927,
6
+ "acc_norm": 0.29292929292929293,
7
+ "acc_norm_stderr": 0.02645251496966592
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_ca_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.2255892255892256,
5
+ "acc_stderr": 0.02429399929295737,
6
+ "acc_norm": 0.26262626262626265,
7
+ "acc_norm_stderr": 0.02557802773320011
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.25925925925925924,
5
+ "acc_stderr": 0.025471492792791667,
6
+ "acc_norm": 0.24579124579124578,
7
+ "acc_norm_stderr": 0.025025521384235284
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.24242424242424243,
5
+ "acc_stderr": 0.02490893747050877,
6
+ "acc_norm": 0.24915824915824916,
7
+ "acc_norm_stderr": 0.025140041284626418
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.23232323232323232,
5
+ "acc_stderr": 0.02454650495612789,
6
+ "acc_norm": 0.24242424242424243,
7
+ "acc_norm_stderr": 0.024908937470508753
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.24579124579124578,
5
+ "acc_stderr": 0.0250255213842353,
6
+ "acc_norm": 0.2727272727272727,
7
+ "acc_norm_stderr": 0.025886127156886297
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.2222222222222222,
5
+ "acc_stderr": 0.02416437978893547,
6
+ "acc_norm": 0.23905723905723905,
7
+ "acc_norm_stderr": 0.024790260423468984
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_da_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_da_challenge": {
4
+ "acc": 0.3063973063973064,
5
+ "acc_stderr": 0.026794891419479452,
6
+ "acc_norm": 0.3367003367003367,
7
+ "acc_norm_stderr": 0.02746823841289221
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_da_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.24496644295302014,
5
+ "acc_stderr": 0.024955035980898946,
6
+ "acc_norm": 0.2953020134228188,
7
+ "acc_norm_stderr": 0.026470155629081085
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.2348993288590604,
5
+ "acc_stderr": 0.024599255015999244,
6
+ "acc_norm": 0.28187919463087246,
7
+ "acc_norm_stderr": 0.026106703750007426
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.2684563758389262,
5
+ "acc_stderr": 0.0257145395148175,
6
+ "acc_norm": 0.2684563758389262,
7
+ "acc_norm_stderr": 0.0257145395148175
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.23825503355704697,
5
+ "acc_stderr": 0.024719951493159625,
6
+ "acc_norm": 0.27181208053691275,
7
+ "acc_norm_stderr": 0.025815342279487567
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.23825503355704697,
5
+ "acc_stderr": 0.024719951493159625,
6
+ "acc_norm": 0.28859060402684567,
7
+ "acc_norm_stderr": 0.026291942108676806
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.22483221476510068,
5
+ "acc_stderr": 0.02422416982965075,
6
+ "acc_norm": 0.21140939597315436,
7
+ "acc_norm_stderr": 0.02369243605357901
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_de_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_de_challenge": {
4
+ "acc": 0.2785234899328859,
5
+ "acc_stderr": 0.0260114035784859,
6
+ "acc_norm": 0.348993288590604,
7
+ "acc_norm_stderr": 0.027658144793750224
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_de_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.2356902356902357,
5
+ "acc_stderr": 0.02466946003490763,
6
+ "acc_norm": 0.2895622895622896,
7
+ "acc_norm_stderr": 0.026362594432681956
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.2255892255892256,
5
+ "acc_stderr": 0.024293999292957367,
6
+ "acc_norm": 0.2356902356902357,
7
+ "acc_norm_stderr": 0.02466946003490764
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.3265993265993266,
5
+ "acc_stderr": 0.027258287015652305,
6
+ "acc_norm": 0.3602693602693603,
7
+ "acc_norm_stderr": 0.02790399493827167
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.2222222222222222,
5
+ "acc_stderr": 0.024164379788935483,
6
+ "acc_norm": 0.26262626262626265,
7
+ "acc_norm_stderr": 0.02557802773320012
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.1919191919191919,
5
+ "acc_stderr": 0.022889733897083934,
6
+ "acc_norm": 0.25252525252525254,
7
+ "acc_norm_stderr": 0.02525252525252536
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.19865319865319866,
5
+ "acc_stderr": 0.023190610381322127,
6
+ "acc_norm": 0.24579124579124578,
7
+ "acc_norm_stderr": 0.0250255213842353
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_es_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_es_challenge": {
4
+ "acc": 0.3501683501683502,
5
+ "acc_stderr": 0.027726370308831506,
6
+ "acc_norm": 0.3602693602693603,
7
+ "acc_norm_stderr": 0.02790399493827167
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_es_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_bloom-1b7.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.22377622377622378,
5
+ "acc_stderr": 0.02468755105337312,
6
+ "acc_norm": 0.2517482517482518,
7
+ "acc_norm_stderr": 0.02570896966075011
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-1b7",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_bloom-560.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.24475524475524477,
5
+ "acc_stderr": 0.02546756553847068,
6
+ "acc_norm": 0.19230769230769232,
7
+ "acc_norm_stderr": 0.023345268410264786
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=bigscience/bloom-560m",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_bloom-7b1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.23076923076923078,
5
+ "acc_stderr": 0.024957141712425013,
6
+ "acc_norm": 0.24125874125874125,
7
+ "acc_norm_stderr": 0.025343462496583764
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_gpt2-large.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.25874125874125875,
5
+ "acc_stderr": 0.02594151450124707,
6
+ "acc_norm": 0.24125874125874125,
7
+ "acc_norm_stderr": 0.025343462496583737
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-large",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_gpt2-medium.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.2762237762237762,
5
+ "acc_stderr": 0.026485626798716442,
6
+ "acc_norm": 0.25874125874125875,
7
+ "acc_norm_stderr": 0.025941514501247064
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2-medium",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_gpt2.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.2762237762237762,
5
+ "acc_stderr": 0.026485626798716456,
6
+ "acc_norm": 0.24825174825174826,
7
+ "acc_norm_stderr": 0.025589390464738234
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=gpt2",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc-challenge/arc_eu_challenge_llama-7B.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_eu_challenge": {
4
+ "acc": 0.26223776223776224,
5
+ "acc_stderr": 0.026054539173797044,
6
+ "acc_norm": 0.23426573426573427,
7
+ "acc_norm_stderr": 0.02508828621716978
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_eu_challenge": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
+ "batch_size": "1",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }