Spaces:
Running
Running
[ | |
{ | |
"Model": "claude-3-5-sonnet-20240620", | |
"Mode": "greedy", | |
"Puzzle Acc": "33.40", | |
"Cell Acc": "54.34", | |
"No answer": "0.00", | |
"Easy Puzzle Acc": "87.50", | |
"Hard Puzzle Acc": "12.36", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1141.94" | |
}, | |
{ | |
"Model": "claude-3-5-sonnet-20240620", | |
"Mode": "sampling", | |
"Puzzle Acc": "33.40", | |
"Cell Acc": "53.01", | |
"No answer": "0.10", | |
"Easy Puzzle Acc": "88.21", | |
"Hard Puzzle Acc": "12.08", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1153.83" | |
}, | |
{ | |
"Model": "Llama-3.1-405B-Instruct-Turbo", | |
"Mode": "greedy", | |
"Puzzle Acc": "32.60", | |
"Cell Acc": "45.80", | |
"No answer": "12.50", | |
"Easy Puzzle Acc": "87.14", | |
"Hard Puzzle Acc": "11.39", | |
"Total Puzzles": 1000, | |
"Reason Lens": "314.66" | |
}, | |
{ | |
"Model": "Llama-3.1-405B-Instruct-Turbo", | |
"Mode": "sampling", | |
"Puzzle Acc": "32.60", | |
"Cell Acc": "47.04", | |
"No answer": "10.80", | |
"Easy Puzzle Acc": "86.07", | |
"Hard Puzzle Acc": "11.81", | |
"Total Puzzles": 1000, | |
"Reason Lens": "439.96" | |
}, | |
{ | |
"Model": "gpt-4o-2024-05-13", | |
"Mode": "sampling", | |
"Puzzle Acc": "30.80", | |
"Cell Acc": "46.19", | |
"No answer": "6.60", | |
"Easy Puzzle Acc": "81.07", | |
"Hard Puzzle Acc": "11.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1549.74" | |
}, | |
{ | |
"Model": "Mistral-Large-2", | |
"Mode": "greedy", | |
"Puzzle Acc": "29.00", | |
"Cell Acc": "47.64", | |
"No answer": "1.70", | |
"Easy Puzzle Acc": "80.36", | |
"Hard Puzzle Acc": "9.03", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1592.39" | |
}, | |
{ | |
"Model": "gpt-4-turbo-2024-04-09", | |
"Mode": "greedy", | |
"Puzzle Acc": "28.40", | |
"Cell Acc": "47.90", | |
"No answer": "0.10", | |
"Easy Puzzle Acc": "80.71", | |
"Hard Puzzle Acc": "8.06", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1148.46" | |
}, | |
{ | |
"Model": "gpt-4o-2024-05-13", | |
"Mode": "greedy", | |
"Puzzle Acc": "28.20", | |
"Cell Acc": "38.72", | |
"No answer": "19.30", | |
"Easy Puzzle Acc": "77.86", | |
"Hard Puzzle Acc": "8.89", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1643.51" | |
}, | |
{ | |
"Model": "gpt-4-0314", | |
"Mode": "greedy", | |
"Puzzle Acc": "27.10", | |
"Cell Acc": "47.43", | |
"No answer": "0.20", | |
"Easy Puzzle Acc": "77.14", | |
"Hard Puzzle Acc": "7.64", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1203.17" | |
}, | |
{ | |
"Model": "claude-3-opus-20240229", | |
"Mode": "greedy", | |
"Puzzle Acc": "27.00", | |
"Cell Acc": "48.91", | |
"No answer": "0.00", | |
"Easy Puzzle Acc": "78.21", | |
"Hard Puzzle Acc": "7.08", | |
"Total Puzzles": 1000, | |
"Reason Lens": "855.72" | |
}, | |
{ | |
"Model": "gpt-4-turbo-2024-04-09", | |
"Mode": "sampling", | |
"Puzzle Acc": "26.40", | |
"Cell Acc": "47.93", | |
"No answer": "0.00", | |
"Easy Puzzle Acc": "74.29", | |
"Hard Puzzle Acc": "7.78", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1165.90" | |
}, | |
{ | |
"Model": "Meta-Llama-3.1-70B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "24.90", | |
"Cell Acc": "27.98", | |
"No answer": "43.00", | |
"Easy Puzzle Acc": "73.57", | |
"Hard Puzzle Acc": "5.97", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1483.68" | |
}, | |
{ | |
"Model": "deepseek-chat", | |
"Mode": "greedy", | |
"Puzzle Acc": "22.70", | |
"Cell Acc": "42.46", | |
"No answer": "5.20", | |
"Easy Puzzle Acc": "68.57", | |
"Hard Puzzle Acc": "4.86", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1260.23" | |
}, | |
{ | |
"Model": "Qwen2-72B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "21.40", | |
"Cell Acc": "38.32", | |
"No answer": "10.20", | |
"Easy Puzzle Acc": "63.93", | |
"Hard Puzzle Acc": "4.86", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1813.82" | |
}, | |
{ | |
"Model": "deepseek-coder", | |
"Mode": "greedy", | |
"Puzzle Acc": "21.10", | |
"Cell Acc": "41.58", | |
"No answer": "4.90", | |
"Easy Puzzle Acc": "64.64", | |
"Hard Puzzle Acc": "4.17", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1324.55" | |
}, | |
{ | |
"Model": "DeepSeek-Coder-V2-0724", | |
"Mode": "greedy", | |
"Puzzle Acc": "20.50", | |
"Cell Acc": "42.35", | |
"No answer": "3.40", | |
"Easy Puzzle Acc": "61.79", | |
"Hard Puzzle Acc": "4.44", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1230.63" | |
}, | |
{ | |
"Model": "gpt-4o-mini-2024-07-18", | |
"Mode": "greedy", | |
"Puzzle Acc": "20.10", | |
"Cell Acc": "41.26", | |
"No answer": "0.10", | |
"Easy Puzzle Acc": "62.50", | |
"Hard Puzzle Acc": "3.61", | |
"Total Puzzles": 1000, | |
"Reason Lens": "943.52" | |
}, | |
{ | |
"Model": "gemini-1.5-pro", | |
"Mode": "sampling", | |
"Puzzle Acc": "19.70", | |
"Cell Acc": "45.24", | |
"No answer": "0.40", | |
"Easy Puzzle Acc": "60.00", | |
"Hard Puzzle Acc": "4.03", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1356.77" | |
}, | |
{ | |
"Model": "gemini-1.5-flash", | |
"Mode": "greedy", | |
"Puzzle Acc": "19.40", | |
"Cell Acc": "31.77", | |
"No answer": "22.70", | |
"Easy Puzzle Acc": "59.29", | |
"Hard Puzzle Acc": "3.89", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1538.18" | |
}, | |
{ | |
"Model": "gemini-1.5-pro", | |
"Mode": "greedy", | |
"Puzzle Acc": "19.40", | |
"Cell Acc": "44.59", | |
"No answer": "0.80", | |
"Easy Puzzle Acc": "55.71", | |
"Hard Puzzle Acc": "5.28", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1336.17" | |
}, | |
{ | |
"Model": "yi-large-preview", | |
"Mode": "greedy", | |
"Puzzle Acc": "18.90", | |
"Cell Acc": "42.61", | |
"No answer": "1.40", | |
"Easy Puzzle Acc": "58.93", | |
"Hard Puzzle Acc": "3.33", | |
"Total Puzzles": 1000, | |
"Reason Lens": "833.36" | |
}, | |
{ | |
"Model": "yi-large", | |
"Mode": "greedy", | |
"Puzzle Acc": "18.80", | |
"Cell Acc": "39.83", | |
"No answer": "1.80", | |
"Easy Puzzle Acc": "58.21", | |
"Hard Puzzle Acc": "3.47", | |
"Total Puzzles": 1000, | |
"Reason Lens": "757.01" | |
}, | |
{ | |
"Model": "claude-3-sonnet-20240229", | |
"Mode": "greedy", | |
"Puzzle Acc": "18.70", | |
"Cell Acc": "43.66", | |
"No answer": "0.00", | |
"Easy Puzzle Acc": "58.93", | |
"Hard Puzzle Acc": "3.06", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1095.37" | |
}, | |
{ | |
"Model": "Qwen2-72B-Instruct", | |
"Mode": "sampling", | |
"Puzzle Acc": "18.70", | |
"Cell Acc": "40.57", | |
"No answer": "3.20", | |
"Easy Puzzle Acc": "57.50", | |
"Hard Puzzle Acc": "3.61", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1894.72" | |
}, | |
{ | |
"Model": "gemini-1.5-flash", | |
"Mode": "sampling", | |
"Puzzle Acc": "18.40", | |
"Cell Acc": "36.03", | |
"No answer": "12.80", | |
"Easy Puzzle Acc": "57.86", | |
"Hard Puzzle Acc": "3.06", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1713.03" | |
}, | |
{ | |
"Model": "Meta-Llama-3-70B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "16.80", | |
"Cell Acc": "42.31", | |
"No answer": "0.20", | |
"Easy Puzzle Acc": "52.86", | |
"Hard Puzzle Acc": "2.78", | |
"Total Puzzles": 1000, | |
"Reason Lens": "809.95" | |
}, | |
{ | |
"Model": "Athene-70B", | |
"Mode": "greedy", | |
"Puzzle Acc": "16.70", | |
"Cell Acc": "32.98", | |
"No answer": "21.10", | |
"Easy Puzzle Acc": "52.50", | |
"Hard Puzzle Acc": "2.78", | |
"Total Puzzles": 1000, | |
"Reason Lens": "391.19" | |
}, | |
{ | |
"Model": "gemma-2-27b-it@nvidia", | |
"Mode": "greedy", | |
"Puzzle Acc": "16.30", | |
"Cell Acc": "41.18", | |
"No answer": "1.10", | |
"Easy Puzzle Acc": "50.71", | |
"Hard Puzzle Acc": "2.92", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1014.56" | |
}, | |
{ | |
"Model": "claude-3-haiku-20240307", | |
"Mode": "greedy", | |
"Puzzle Acc": "14.30", | |
"Cell Acc": "37.87", | |
"No answer": "0.10", | |
"Easy Puzzle Acc": "47.86", | |
"Hard Puzzle Acc": "1.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1015.06" | |
}, | |
{ | |
"Model": "command-r-plus", | |
"Mode": "greedy", | |
"Puzzle Acc": "13.90", | |
"Cell Acc": "39.01", | |
"No answer": "0.20", | |
"Easy Puzzle Acc": "44.64", | |
"Hard Puzzle Acc": "1.94", | |
"Total Puzzles": 1000, | |
"Reason Lens": "810.53" | |
}, | |
{ | |
"Model": "reka-core-20240501", | |
"Mode": "greedy", | |
"Puzzle Acc": "13.00", | |
"Cell Acc": "33.88", | |
"No answer": "4.00", | |
"Easy Puzzle Acc": "43.21", | |
"Hard Puzzle Acc": "1.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1078.29" | |
}, | |
{ | |
"Model": "Meta-Llama-3.1-8B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "12.80", | |
"Cell Acc": "13.68", | |
"No answer": "61.50", | |
"Easy Puzzle Acc": "43.57", | |
"Hard Puzzle Acc": "0.83", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1043.90" | |
}, | |
{ | |
"Model": "gemma-2-9b-it@nvidia", | |
"Mode": "greedy", | |
"Puzzle Acc": "12.80", | |
"Cell Acc": "36.79", | |
"No answer": "0.00", | |
"Easy Puzzle Acc": "41.79", | |
"Hard Puzzle Acc": "1.53", | |
"Total Puzzles": 1000, | |
"Reason Lens": "849.84" | |
}, | |
{ | |
"Model": "Meta-Llama-3-8B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "11.90", | |
"Cell Acc": "23.70", | |
"No answer": "29.20", | |
"Easy Puzzle Acc": "40.71", | |
"Hard Puzzle Acc": "0.69", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1216.40" | |
}, | |
{ | |
"Model": "Mistral-Nemo-Instruct-2407", | |
"Mode": "greedy", | |
"Puzzle Acc": "11.80", | |
"Cell Acc": "34.93", | |
"No answer": "1.60", | |
"Easy Puzzle Acc": "38.93", | |
"Hard Puzzle Acc": "1.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "925.88" | |
}, | |
{ | |
"Model": "Phi-3-mini-4k-instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "11.60", | |
"Cell Acc": "13.50", | |
"No answer": "59.00", | |
"Easy Puzzle Acc": "38.21", | |
"Hard Puzzle Acc": "1.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "790.29" | |
}, | |
{ | |
"Model": "Yi-1.5-34B-Chat", | |
"Mode": "greedy", | |
"Puzzle Acc": "11.50", | |
"Cell Acc": "32.73", | |
"No answer": "4.40", | |
"Easy Puzzle Acc": "37.50", | |
"Hard Puzzle Acc": "1.39", | |
"Total Puzzles": 1000, | |
"Reason Lens": "869.65" | |
}, | |
{ | |
"Model": "Meta-Llama-3-8B-Instruct", | |
"Mode": "sampling", | |
"Puzzle Acc": "11.00", | |
"Cell Acc": "26.11", | |
"No answer": "22.30", | |
"Easy Puzzle Acc": "36.79", | |
"Hard Puzzle Acc": "0.97", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1282.40" | |
}, | |
{ | |
"Model": "gpt-3.5-turbo-0125", | |
"Mode": "greedy", | |
"Puzzle Acc": "10.10", | |
"Cell Acc": "33.06", | |
"No answer": "0.10", | |
"Easy Puzzle Acc": "33.57", | |
"Hard Puzzle Acc": "0.97", | |
"Total Puzzles": 1000, | |
"Reason Lens": "820.66" | |
}, | |
{ | |
"Model": "command-r", | |
"Mode": "greedy", | |
"Puzzle Acc": "9.90", | |
"Cell Acc": "32.66", | |
"No answer": "1.50", | |
"Easy Puzzle Acc": "32.14", | |
"Hard Puzzle Acc": "1.25", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1005.17" | |
}, | |
{ | |
"Model": "reka-flash-20240226", | |
"Mode": "greedy", | |
"Puzzle Acc": "9.30", | |
"Cell Acc": "25.67", | |
"No answer": "18.70", | |
"Easy Puzzle Acc": "30.71", | |
"Hard Puzzle Acc": "0.97", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1074.80" | |
}, | |
{ | |
"Model": "mathstral-7B-v0.1", | |
"Mode": "greedy", | |
"Puzzle Acc": "9.00", | |
"Cell Acc": "20.42", | |
"No answer": "36.00", | |
"Easy Puzzle Acc": "30.00", | |
"Hard Puzzle Acc": "0.83", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1148.16" | |
}, | |
{ | |
"Model": "Mixtral-8x7B-Instruct-v0.1", | |
"Mode": "greedy", | |
"Puzzle Acc": "8.70", | |
"Cell Acc": "26.47", | |
"No answer": "20.30", | |
"Easy Puzzle Acc": "28.93", | |
"Hard Puzzle Acc": "0.83", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1177.21" | |
}, | |
{ | |
"Model": "Qwen2-7B-Instruct", | |
"Mode": "greedy", | |
"Puzzle Acc": "8.40", | |
"Cell Acc": "22.06", | |
"No answer": "24.40", | |
"Easy Puzzle Acc": "29.29", | |
"Hard Puzzle Acc": "0.28", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1473.23" | |
}, | |
{ | |
"Model": "gemma-2-2b-it", | |
"Mode": "greedy", | |
"Puzzle Acc": "4.20", | |
"Cell Acc": "9.97", | |
"No answer": "57.20", | |
"Easy Puzzle Acc": "14.29", | |
"Hard Puzzle Acc": "0.28", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1032.89" | |
}, | |
{ | |
"Model": "Yi-1.5-9B-Chat", | |
"Mode": "greedy", | |
"Puzzle Acc": "2.30", | |
"Cell Acc": "7.53", | |
"No answer": "11.30", | |
"Easy Puzzle Acc": "8.21", | |
"Hard Puzzle Acc": "0.00", | |
"Total Puzzles": 1000, | |
"Reason Lens": "1592.60" | |
} | |
] |