yuchenlin commited on
Commit
8ad2c31
·
1 Parent(s): 8db13c5

update o1 results

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -1,4 +1,15 @@
1
  [
 
 
 
 
 
 
 
 
 
 
 
2
  {
3
  "Model": "claude-3-5-sonnet-20240620",
4
  "Mode": "greedy",
@@ -22,7 +33,7 @@
22
  "Reason Lens": "1153.83"
23
  },
24
  {
25
- "Model": "Llama-3.1-405B-Instruct-Turbo",
26
  "Mode": "greedy",
27
  "Puzzle Acc": "32.60",
28
  "Cell Acc": "45.80",
@@ -33,7 +44,7 @@
33
  "Reason Lens": "314.66"
34
  },
35
  {
36
- "Model": "Llama-3.1-405B-Instruct-Turbo",
37
  "Mode": "sampling",
38
  "Puzzle Acc": "32.60",
39
  "Cell Acc": "47.04",
@@ -76,6 +87,17 @@
76
  "Total Puzzles": 1000,
77
  "Reason Lens": "1594.47"
78
  },
 
 
 
 
 
 
 
 
 
 
 
79
  {
80
  "Model": "chatgpt-4o-latest-24-09-07",
81
  "Mode": "greedy",
@@ -164,6 +186,28 @@
164
  "Total Puzzles": 1000,
165
  "Reason Lens": "1389.75"
166
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  {
168
  "Model": "Meta-Llama-3.1-70B-Instruct",
169
  "Mode": "greedy",
@@ -176,7 +220,7 @@
176
  "Reason Lens": "1483.68"
177
  },
178
  {
179
- "Model": "deepseek-chat",
180
  "Mode": "greedy",
181
  "Puzzle Acc": "22.70",
182
  "Cell Acc": "42.46",
@@ -209,7 +253,7 @@
209
  "Reason Lens": "1813.82"
210
  },
211
  {
212
- "Model": "deepseek-coder",
213
  "Mode": "greedy",
214
  "Puzzle Acc": "21.10",
215
  "Cell Acc": "41.58",
@@ -220,7 +264,7 @@
220
  "Reason Lens": "1324.55"
221
  },
222
  {
223
- "Model": "DeepSeek-Coder-V2-0724",
224
  "Mode": "greedy",
225
  "Puzzle Acc": "20.50",
226
  "Cell Acc": "42.35",
@@ -352,7 +396,7 @@
352
  "Reason Lens": "391.19"
353
  },
354
  {
355
- "Model": "gemma-2-27b-it@nvidia",
356
  "Mode": "greedy",
357
  "Puzzle Acc": "16.30",
358
  "Cell Acc": "41.18",
@@ -407,7 +451,7 @@
407
  "Reason Lens": "1043.90"
408
  },
409
  {
410
- "Model": "gemma-2-9b-it@nvidia",
411
  "Mode": "greedy",
412
  "Puzzle Acc": "12.80",
413
  "Cell Acc": "36.79",
 
1
  [
2
+ {
3
+ "Model": "o1-mini-2024-09-12",
4
+ "Mode": "greedy",
5
+ "Puzzle Acc": "52.60",
6
+ "Cell Acc": "52.29",
7
+ "No answer": "0.80",
8
+ "Easy Puzzle Acc": "87.14",
9
+ "Hard Puzzle Acc": "39.17",
10
+ "Total Puzzles": 1000,
11
+ "Reason Lens": "993.28"
12
+ },
13
  {
14
  "Model": "claude-3-5-sonnet-20240620",
15
  "Mode": "greedy",
 
33
  "Reason Lens": "1153.83"
34
  },
35
  {
36
+ "Model": "Llama-3.1-405B-Inst-fp8@together",
37
  "Mode": "greedy",
38
  "Puzzle Acc": "32.60",
39
  "Cell Acc": "45.80",
 
44
  "Reason Lens": "314.66"
45
  },
46
  {
47
+ "Model": "Llama-3.1-405B-Inst-fp8@together",
48
  "Mode": "sampling",
49
  "Puzzle Acc": "32.60",
50
  "Cell Acc": "47.04",
 
87
  "Total Puzzles": 1000,
88
  "Reason Lens": "1594.47"
89
  },
90
+ {
91
+ "Model": "Llama-3.1-405B-Inst@sambanova",
92
+ "Mode": "greedy",
93
+ "Puzzle Acc": "30.10",
94
+ "Cell Acc": "39.06",
95
+ "No answer": "24.70",
96
+ "Easy Puzzle Acc": "84.64",
97
+ "Hard Puzzle Acc": "8.89",
98
+ "Total Puzzles": 1000,
99
+ "Reason Lens": "2001.12"
100
+ },
101
  {
102
  "Model": "chatgpt-4o-latest-24-09-07",
103
  "Mode": "greedy",
 
186
  "Total Puzzles": 1000,
187
  "Reason Lens": "1389.75"
188
  },
189
+ {
190
+ "Model": "Llama-3.1-405B-Inst@hyperbolic",
191
+ "Mode": "greedy",
192
+ "Puzzle Acc": "25.00",
193
+ "Cell Acc": "46.62",
194
+ "No answer": "6.25",
195
+ "Easy Puzzle Acc": "66.67",
196
+ "Hard Puzzle Acc": "15.38",
197
+ "Total Puzzles": 16,
198
+ "Reason Lens": "1517.13"
199
+ },
200
+ {
201
+ "Model": "gemini-1.5-flash-exp-0827",
202
+ "Mode": "greedy",
203
+ "Puzzle Acc": "25.00",
204
+ "Cell Acc": "43.56",
205
+ "No answer": "8.50",
206
+ "Easy Puzzle Acc": "70.71",
207
+ "Hard Puzzle Acc": "7.22",
208
+ "Total Puzzles": 1000,
209
+ "Reason Lens": "1705.11"
210
+ },
211
  {
212
  "Model": "Meta-Llama-3.1-70B-Instruct",
213
  "Mode": "greedy",
 
220
  "Reason Lens": "1483.68"
221
  },
222
  {
223
+ "Model": "deepseek-v2-chat-0628",
224
  "Mode": "greedy",
225
  "Puzzle Acc": "22.70",
226
  "Cell Acc": "42.46",
 
253
  "Reason Lens": "1813.82"
254
  },
255
  {
256
+ "Model": "deepseek-v2-coder-0614",
257
  "Mode": "greedy",
258
  "Puzzle Acc": "21.10",
259
  "Cell Acc": "41.58",
 
264
  "Reason Lens": "1324.55"
265
  },
266
  {
267
+ "Model": "deepseek-v2-coder-0724",
268
  "Mode": "greedy",
269
  "Puzzle Acc": "20.50",
270
  "Cell Acc": "42.35",
 
396
  "Reason Lens": "391.19"
397
  },
398
  {
399
+ "Model": "gemma-2-27b-it",
400
  "Mode": "greedy",
401
  "Puzzle Acc": "16.30",
402
  "Cell Acc": "41.18",
 
451
  "Reason Lens": "1043.90"
452
  },
453
  {
454
+ "Model": "gemma-2-9b-it",
455
  "Mode": "greedy",
456
  "Puzzle Acc": "12.80",
457
  "Cell Acc": "36.79",