yuchenlin commited on
Commit
8db13c5
·
1 Parent(s): 983bc41

update results

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -43,6 +43,17 @@
43
  "Total Puzzles": 1000,
44
  "Reason Lens": "439.96"
45
  },
 
 
 
 
 
 
 
 
 
 
 
46
  {
47
  "Model": "gpt-4o-2024-05-13",
48
  "Mode": "sampling",
@@ -54,6 +65,28 @@
54
  "Total Puzzles": 1000,
55
  "Reason Lens": "1549.74"
56
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  {
58
  "Model": "Mistral-Large-2",
59
  "Mode": "greedy",
@@ -120,6 +153,17 @@
120
  "Total Puzzles": 1000,
121
  "Reason Lens": "1165.90"
122
  },
 
 
 
 
 
 
 
 
 
 
 
123
  {
124
  "Model": "Meta-Llama-3.1-70B-Instruct",
125
  "Mode": "greedy",
@@ -142,6 +186,17 @@
142
  "Total Puzzles": 1000,
143
  "Reason Lens": "1260.23"
144
  },
 
 
 
 
 
 
 
 
 
 
 
145
  {
146
  "Model": "Qwen2-72B-Instruct",
147
  "Mode": "greedy",
@@ -483,6 +538,17 @@
483
  "Total Puzzles": 1000,
484
  "Reason Lens": "1473.23"
485
  },
 
 
 
 
 
 
 
 
 
 
 
486
  {
487
  "Model": "gemma-2-2b-it",
488
  "Mode": "greedy",
 
43
  "Total Puzzles": 1000,
44
  "Reason Lens": "439.96"
45
  },
46
+ {
47
+ "Model": "gpt-4o-2024-08-06",
48
+ "Mode": "greedy",
49
+ "Puzzle Acc": "31.70",
50
+ "Cell Acc": "50.34",
51
+ "No answer": "3.60",
52
+ "Easy Puzzle Acc": "84.64",
53
+ "Hard Puzzle Acc": "11.11",
54
+ "Total Puzzles": 1000,
55
+ "Reason Lens": "1106.51"
56
+ },
57
  {
58
  "Model": "gpt-4o-2024-05-13",
59
  "Mode": "sampling",
 
65
  "Total Puzzles": 1000,
66
  "Reason Lens": "1549.74"
67
  },
68
+ {
69
+ "Model": "gemini-1.5-pro-exp-0827",
70
+ "Mode": "greedy",
71
+ "Puzzle Acc": "30.50",
72
+ "Cell Acc": "50.84",
73
+ "No answer": "0.80",
74
+ "Easy Puzzle Acc": "79.64",
75
+ "Hard Puzzle Acc": "11.39",
76
+ "Total Puzzles": 1000,
77
+ "Reason Lens": "1594.47"
78
+ },
79
+ {
80
+ "Model": "chatgpt-4o-latest-24-09-07",
81
+ "Mode": "greedy",
82
+ "Puzzle Acc": "29.90",
83
+ "Cell Acc": "48.83",
84
+ "No answer": "4.20",
85
+ "Easy Puzzle Acc": "81.43",
86
+ "Hard Puzzle Acc": "9.86",
87
+ "Total Puzzles": 1000,
88
+ "Reason Lens": "1539.99"
89
+ },
90
  {
91
  "Model": "Mistral-Large-2",
92
  "Mode": "greedy",
 
153
  "Total Puzzles": 1000,
154
  "Reason Lens": "1165.90"
155
  },
156
+ {
157
+ "Model": "gemini-1.5-pro-exp-0801",
158
+ "Mode": "greedy",
159
+ "Puzzle Acc": "25.20",
160
+ "Cell Acc": "48.50",
161
+ "No answer": "0.00",
162
+ "Easy Puzzle Acc": "72.50",
163
+ "Hard Puzzle Acc": "6.81",
164
+ "Total Puzzles": 1000,
165
+ "Reason Lens": "1389.75"
166
+ },
167
  {
168
  "Model": "Meta-Llama-3.1-70B-Instruct",
169
  "Mode": "greedy",
 
186
  "Total Puzzles": 1000,
187
  "Reason Lens": "1260.23"
188
  },
189
+ {
190
+ "Model": "deepseek-v2.5-0908",
191
+ "Mode": "greedy",
192
+ "Puzzle Acc": "22.10",
193
+ "Cell Acc": "38.01",
194
+ "No answer": "12.70",
195
+ "Easy Puzzle Acc": "68.21",
196
+ "Hard Puzzle Acc": "4.17",
197
+ "Total Puzzles": 1000,
198
+ "Reason Lens": "1294.46"
199
+ },
200
  {
201
  "Model": "Qwen2-72B-Instruct",
202
  "Mode": "greedy",
 
538
  "Total Puzzles": 1000,
539
  "Reason Lens": "1473.23"
540
  },
541
+ {
542
+ "Model": "Phi-3.5-mini-instruct",
543
+ "Mode": "greedy",
544
+ "Puzzle Acc": "6.40",
545
+ "Cell Acc": "5.98",
546
+ "No answer": "80.60",
547
+ "Easy Puzzle Acc": "21.79",
548
+ "Hard Puzzle Acc": "0.42",
549
+ "Total Puzzles": 1000,
550
+ "Reason Lens": "718.43"
551
+ },
552
  {
553
  "Model": "gemma-2-2b-it",
554
  "Mode": "greedy",