Spaces:
Running
Running
update o1 results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -1,4 +1,15 @@
|
|
1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
{
|
3 |
"Model": "claude-3-5-sonnet-20240620",
|
4 |
"Mode": "greedy",
|
@@ -22,7 +33,7 @@
|
|
22 |
"Reason Lens": "1153.83"
|
23 |
},
|
24 |
{
|
25 |
-
"Model": "Llama-3.1-405B-
|
26 |
"Mode": "greedy",
|
27 |
"Puzzle Acc": "32.60",
|
28 |
"Cell Acc": "45.80",
|
@@ -33,7 +44,7 @@
|
|
33 |
"Reason Lens": "314.66"
|
34 |
},
|
35 |
{
|
36 |
-
"Model": "Llama-3.1-405B-
|
37 |
"Mode": "sampling",
|
38 |
"Puzzle Acc": "32.60",
|
39 |
"Cell Acc": "47.04",
|
@@ -76,6 +87,17 @@
|
|
76 |
"Total Puzzles": 1000,
|
77 |
"Reason Lens": "1594.47"
|
78 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
{
|
80 |
"Model": "chatgpt-4o-latest-24-09-07",
|
81 |
"Mode": "greedy",
|
@@ -164,6 +186,28 @@
|
|
164 |
"Total Puzzles": 1000,
|
165 |
"Reason Lens": "1389.75"
|
166 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
{
|
168 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
169 |
"Mode": "greedy",
|
@@ -176,7 +220,7 @@
|
|
176 |
"Reason Lens": "1483.68"
|
177 |
},
|
178 |
{
|
179 |
-
"Model": "deepseek-chat",
|
180 |
"Mode": "greedy",
|
181 |
"Puzzle Acc": "22.70",
|
182 |
"Cell Acc": "42.46",
|
@@ -209,7 +253,7 @@
|
|
209 |
"Reason Lens": "1813.82"
|
210 |
},
|
211 |
{
|
212 |
-
"Model": "deepseek-coder",
|
213 |
"Mode": "greedy",
|
214 |
"Puzzle Acc": "21.10",
|
215 |
"Cell Acc": "41.58",
|
@@ -220,7 +264,7 @@
|
|
220 |
"Reason Lens": "1324.55"
|
221 |
},
|
222 |
{
|
223 |
-
"Model": "
|
224 |
"Mode": "greedy",
|
225 |
"Puzzle Acc": "20.50",
|
226 |
"Cell Acc": "42.35",
|
@@ -352,7 +396,7 @@
|
|
352 |
"Reason Lens": "391.19"
|
353 |
},
|
354 |
{
|
355 |
-
"Model": "gemma-2-27b-it
|
356 |
"Mode": "greedy",
|
357 |
"Puzzle Acc": "16.30",
|
358 |
"Cell Acc": "41.18",
|
@@ -407,7 +451,7 @@
|
|
407 |
"Reason Lens": "1043.90"
|
408 |
},
|
409 |
{
|
410 |
-
"Model": "gemma-2-9b-it
|
411 |
"Mode": "greedy",
|
412 |
"Puzzle Acc": "12.80",
|
413 |
"Cell Acc": "36.79",
|
|
|
1 |
[
|
2 |
+
{
|
3 |
+
"Model": "o1-mini-2024-09-12",
|
4 |
+
"Mode": "greedy",
|
5 |
+
"Puzzle Acc": "52.60",
|
6 |
+
"Cell Acc": "52.29",
|
7 |
+
"No answer": "0.80",
|
8 |
+
"Easy Puzzle Acc": "87.14",
|
9 |
+
"Hard Puzzle Acc": "39.17",
|
10 |
+
"Total Puzzles": 1000,
|
11 |
+
"Reason Lens": "993.28"
|
12 |
+
},
|
13 |
{
|
14 |
"Model": "claude-3-5-sonnet-20240620",
|
15 |
"Mode": "greedy",
|
|
|
33 |
"Reason Lens": "1153.83"
|
34 |
},
|
35 |
{
|
36 |
+
"Model": "Llama-3.1-405B-Inst-fp8@together",
|
37 |
"Mode": "greedy",
|
38 |
"Puzzle Acc": "32.60",
|
39 |
"Cell Acc": "45.80",
|
|
|
44 |
"Reason Lens": "314.66"
|
45 |
},
|
46 |
{
|
47 |
+
"Model": "Llama-3.1-405B-Inst-fp8@together",
|
48 |
"Mode": "sampling",
|
49 |
"Puzzle Acc": "32.60",
|
50 |
"Cell Acc": "47.04",
|
|
|
87 |
"Total Puzzles": 1000,
|
88 |
"Reason Lens": "1594.47"
|
89 |
},
|
90 |
+
{
|
91 |
+
"Model": "Llama-3.1-405B-Inst@sambanova",
|
92 |
+
"Mode": "greedy",
|
93 |
+
"Puzzle Acc": "30.10",
|
94 |
+
"Cell Acc": "39.06",
|
95 |
+
"No answer": "24.70",
|
96 |
+
"Easy Puzzle Acc": "84.64",
|
97 |
+
"Hard Puzzle Acc": "8.89",
|
98 |
+
"Total Puzzles": 1000,
|
99 |
+
"Reason Lens": "2001.12"
|
100 |
+
},
|
101 |
{
|
102 |
"Model": "chatgpt-4o-latest-24-09-07",
|
103 |
"Mode": "greedy",
|
|
|
186 |
"Total Puzzles": 1000,
|
187 |
"Reason Lens": "1389.75"
|
188 |
},
|
189 |
+
{
|
190 |
+
"Model": "Llama-3.1-405B-Inst@hyperbolic",
|
191 |
+
"Mode": "greedy",
|
192 |
+
"Puzzle Acc": "25.00",
|
193 |
+
"Cell Acc": "46.62",
|
194 |
+
"No answer": "6.25",
|
195 |
+
"Easy Puzzle Acc": "66.67",
|
196 |
+
"Hard Puzzle Acc": "15.38",
|
197 |
+
"Total Puzzles": 16,
|
198 |
+
"Reason Lens": "1517.13"
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"Model": "gemini-1.5-flash-exp-0827",
|
202 |
+
"Mode": "greedy",
|
203 |
+
"Puzzle Acc": "25.00",
|
204 |
+
"Cell Acc": "43.56",
|
205 |
+
"No answer": "8.50",
|
206 |
+
"Easy Puzzle Acc": "70.71",
|
207 |
+
"Hard Puzzle Acc": "7.22",
|
208 |
+
"Total Puzzles": 1000,
|
209 |
+
"Reason Lens": "1705.11"
|
210 |
+
},
|
211 |
{
|
212 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
213 |
"Mode": "greedy",
|
|
|
220 |
"Reason Lens": "1483.68"
|
221 |
},
|
222 |
{
|
223 |
+
"Model": "deepseek-v2-chat-0628",
|
224 |
"Mode": "greedy",
|
225 |
"Puzzle Acc": "22.70",
|
226 |
"Cell Acc": "42.46",
|
|
|
253 |
"Reason Lens": "1813.82"
|
254 |
},
|
255 |
{
|
256 |
+
"Model": "deepseek-v2-coder-0614",
|
257 |
"Mode": "greedy",
|
258 |
"Puzzle Acc": "21.10",
|
259 |
"Cell Acc": "41.58",
|
|
|
264 |
"Reason Lens": "1324.55"
|
265 |
},
|
266 |
{
|
267 |
+
"Model": "deepseek-v2-coder-0724",
|
268 |
"Mode": "greedy",
|
269 |
"Puzzle Acc": "20.50",
|
270 |
"Cell Acc": "42.35",
|
|
|
396 |
"Reason Lens": "391.19"
|
397 |
},
|
398 |
{
|
399 |
+
"Model": "gemma-2-27b-it",
|
400 |
"Mode": "greedy",
|
401 |
"Puzzle Acc": "16.30",
|
402 |
"Cell Acc": "41.18",
|
|
|
451 |
"Reason Lens": "1043.90"
|
452 |
},
|
453 |
{
|
454 |
+
"Model": "gemma-2-9b-it",
|
455 |
"Mode": "greedy",
|
456 |
"Puzzle Acc": "12.80",
|
457 |
"Cell Acc": "36.79",
|