Spaces:
Running
Running
update results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -43,6 +43,17 @@
|
|
43 |
"Total Puzzles": 1000,
|
44 |
"Reason Lens": "439.96"
|
45 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
{
|
47 |
"Model": "gpt-4o-2024-05-13",
|
48 |
"Mode": "sampling",
|
@@ -54,6 +65,28 @@
|
|
54 |
"Total Puzzles": 1000,
|
55 |
"Reason Lens": "1549.74"
|
56 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
{
|
58 |
"Model": "Mistral-Large-2",
|
59 |
"Mode": "greedy",
|
@@ -120,6 +153,17 @@
|
|
120 |
"Total Puzzles": 1000,
|
121 |
"Reason Lens": "1165.90"
|
122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
{
|
124 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
125 |
"Mode": "greedy",
|
@@ -142,6 +186,17 @@
|
|
142 |
"Total Puzzles": 1000,
|
143 |
"Reason Lens": "1260.23"
|
144 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
{
|
146 |
"Model": "Qwen2-72B-Instruct",
|
147 |
"Mode": "greedy",
|
@@ -483,6 +538,17 @@
|
|
483 |
"Total Puzzles": 1000,
|
484 |
"Reason Lens": "1473.23"
|
485 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
486 |
{
|
487 |
"Model": "gemma-2-2b-it",
|
488 |
"Mode": "greedy",
|
|
|
43 |
"Total Puzzles": 1000,
|
44 |
"Reason Lens": "439.96"
|
45 |
},
|
46 |
+
{
|
47 |
+
"Model": "gpt-4o-2024-08-06",
|
48 |
+
"Mode": "greedy",
|
49 |
+
"Puzzle Acc": "31.70",
|
50 |
+
"Cell Acc": "50.34",
|
51 |
+
"No answer": "3.60",
|
52 |
+
"Easy Puzzle Acc": "84.64",
|
53 |
+
"Hard Puzzle Acc": "11.11",
|
54 |
+
"Total Puzzles": 1000,
|
55 |
+
"Reason Lens": "1106.51"
|
56 |
+
},
|
57 |
{
|
58 |
"Model": "gpt-4o-2024-05-13",
|
59 |
"Mode": "sampling",
|
|
|
65 |
"Total Puzzles": 1000,
|
66 |
"Reason Lens": "1549.74"
|
67 |
},
|
68 |
+
{
|
69 |
+
"Model": "gemini-1.5-pro-exp-0827",
|
70 |
+
"Mode": "greedy",
|
71 |
+
"Puzzle Acc": "30.50",
|
72 |
+
"Cell Acc": "50.84",
|
73 |
+
"No answer": "0.80",
|
74 |
+
"Easy Puzzle Acc": "79.64",
|
75 |
+
"Hard Puzzle Acc": "11.39",
|
76 |
+
"Total Puzzles": 1000,
|
77 |
+
"Reason Lens": "1594.47"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"Model": "chatgpt-4o-latest-24-09-07",
|
81 |
+
"Mode": "greedy",
|
82 |
+
"Puzzle Acc": "29.90",
|
83 |
+
"Cell Acc": "48.83",
|
84 |
+
"No answer": "4.20",
|
85 |
+
"Easy Puzzle Acc": "81.43",
|
86 |
+
"Hard Puzzle Acc": "9.86",
|
87 |
+
"Total Puzzles": 1000,
|
88 |
+
"Reason Lens": "1539.99"
|
89 |
+
},
|
90 |
{
|
91 |
"Model": "Mistral-Large-2",
|
92 |
"Mode": "greedy",
|
|
|
153 |
"Total Puzzles": 1000,
|
154 |
"Reason Lens": "1165.90"
|
155 |
},
|
156 |
+
{
|
157 |
+
"Model": "gemini-1.5-pro-exp-0801",
|
158 |
+
"Mode": "greedy",
|
159 |
+
"Puzzle Acc": "25.20",
|
160 |
+
"Cell Acc": "48.50",
|
161 |
+
"No answer": "0.00",
|
162 |
+
"Easy Puzzle Acc": "72.50",
|
163 |
+
"Hard Puzzle Acc": "6.81",
|
164 |
+
"Total Puzzles": 1000,
|
165 |
+
"Reason Lens": "1389.75"
|
166 |
+
},
|
167 |
{
|
168 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
169 |
"Mode": "greedy",
|
|
|
186 |
"Total Puzzles": 1000,
|
187 |
"Reason Lens": "1260.23"
|
188 |
},
|
189 |
+
{
|
190 |
+
"Model": "deepseek-v2.5-0908",
|
191 |
+
"Mode": "greedy",
|
192 |
+
"Puzzle Acc": "22.10",
|
193 |
+
"Cell Acc": "38.01",
|
194 |
+
"No answer": "12.70",
|
195 |
+
"Easy Puzzle Acc": "68.21",
|
196 |
+
"Hard Puzzle Acc": "4.17",
|
197 |
+
"Total Puzzles": 1000,
|
198 |
+
"Reason Lens": "1294.46"
|
199 |
+
},
|
200 |
{
|
201 |
"Model": "Qwen2-72B-Instruct",
|
202 |
"Mode": "greedy",
|
|
|
538 |
"Total Puzzles": 1000,
|
539 |
"Reason Lens": "1473.23"
|
540 |
},
|
541 |
+
{
|
542 |
+
"Model": "Phi-3.5-mini-instruct",
|
543 |
+
"Mode": "greedy",
|
544 |
+
"Puzzle Acc": "6.40",
|
545 |
+
"Cell Acc": "5.98",
|
546 |
+
"No answer": "80.60",
|
547 |
+
"Easy Puzzle Acc": "21.79",
|
548 |
+
"Hard Puzzle Acc": "0.42",
|
549 |
+
"Total Puzzles": 1000,
|
550 |
+
"Reason Lens": "718.43"
|
551 |
+
},
|
552 |
{
|
553 |
"Model": "gemma-2-2b-it",
|
554 |
"Mode": "greedy",
|