[
  {
    "model": "GPT-4",
    "Average": 65.94,
    "MMLU": 74.8,
    "WinoGrande": 66.2,
    "PiQA": 61.6,
    "CommonsenseQA": 63.0,
    "Race": 67.0,
    "MedMCQA": 51.8,
    "OpenbookQA": 60.3
  },
  {
    "model": "Claude-3 Opus",
    "Average": 62.64,
    "MMLU": 70.4,
    "WinoGrande": 63.5,
    "PiQA": 59.1,
    "CommonsenseQA": 63.7,
    "Race": 66.2,
    "MedMCQA": 49.1,
    "OpenbookQA": 54.0
  },
  {
    "model": "Mistral Large",
    "Average": 61.45,
    "MMLU": 67.8,
    "WinoGrande": 56.8,
    "PiQA": 61.2,
    "CommonsenseQA": 55.4,
    "Race": 70.1,
    "MedMCQA": 43.4,
    "OpenbookQA": 58.7
  },
  {
    "model": "GPT-3.5",
    "Average": 59.06,
    "MMLU": 65.4,
    "WinoGrande": 54.6,
    "PiQA": 54.9,
    "CommonsenseQA": 67.9,
    "Race": 60.1,
    "MedMCQA": 41.4,
    "OpenbookQA": 49.9
  },
  {
    "model": "Gemini Pro",
    "Average": 54.45,
    "MMLU": 57.7,
    "WinoGrande": 56.4,
    "PiQA": 47.7,
    "CommonsenseQA": 50.6,
    "Race": 61.0,
    "MedMCQA": 37.5,
    "OpenbookQA": 52.5
  },
  {
    "model": "Llama3-70b-instruct",
    "Average": 54.06,
    "MMLU": 64.67,
    "WinoGrande": 57.14,
    "PiQA": 43.1,
    "CommonsenseQA": 55.49,
    "Race": 58.21,
    "MedMCQA": 41.67,
    "OpenbookQA": 41.93
  }
]