OSQ-Leaderboard / small (1).json
[
{
"model": "OPT (1.3B)",
"Average": 7.84,
"MMLU": 7.4,
"WinoGrande": 12.47,
"PiQA": 4.45,
"CommonsenseQA": 7.61,
"Race": 13.61,
"MedMCQA": 1.25,
"OpenkookQA": 4.48
},
{
"model": "SlimPajama",
"Average": 9.54,
"MMLU": 9.22,
"WinoGrande": 14.76,
"PiQA": 5.32,
"CommonsenseQA": 9.01,
"Race": 16.19,
"MedMCQA": 1.68,
"OpenkookQA": 5.7
},
{
"model": "OLMo (1B)",
"Average": 8.8,
"MMLU": 8.54,
"WinoGrande": 6.16,
"PiQA": 8.05,
"CommonsenseQA": 13.1,
"Race": 13.61,
"MedMCQA": 2.1,
"OpenkookQA": 6.11
},
{
"model": "GPT-Neo (1.3B)",
"Average": 7.38,
"MMLU": 6.94,
"WinoGrande": 10.81,
"PiQA": 4.31,
"CommonsenseQA": 6.34,
"Race": 13.75,
"MedMCQA": 2.63,
"OpenkookQA": 4.89
},
{
"model": "Cerebras-GPT (1.3B)",
"Average": 4.84,
"MMLU": 5.37,
"WinoGrande": 9.31,
"PiQA": 2.16,
"CommonsenseQA": 6.2,
"Race": 6.9,
"MedMCQA": 1.04,
"OpenkookQA": 3.46
},
{
"model": "RedPajama (1B)",
"Average": 9.01,
"MMLU": 9.21,
"WinoGrande": 16.97,
"PiQA": 1.39,
"CommonsenseQA": 11.41,
"Race": 14.35,
"MedMCQA": 1.86,
"OpenkookQA": 3.87
},
{
"model": "Pythia (1.4B)",
"Average": 8.73,
"MMLU": 9.66,
"WinoGrande": 11.52,
"PiQA": 4.17,
"CommonsenseQA": 9.01,
"Race": 12.76,
"MedMCQA": 3.19,
"OpenkookQA": 5.3
},
{
"model": "TinyLLama (1.1B)",
"Average": 8.39,
"MMLU": 8.94,
"WinoGrande": 12.23,
"PiQA": 3.59,
"CommonsenseQA": 6.06,
"Race": 16.7,
"MedMCQA": 2.07,
"OpenkookQA": 4.68
},
{
"model": "OELM (1B)",
"Average": 8.99,
"MMLU": 9.03,
"WinoGrande": 10.18,
"PiQA": 9.05,
"CommonsenseQA": 7.75,
"Race": 12.78,
"MedMCQA": 2.5,
"OpenkookQA": 6.31
},
{
"model": "Phi-3-mini-128k-instruct (3.8B)",
"Average": 39.73,
"MMLU": 36.97,
"WinoGrande": 46.88,
"PiQA": 32.04,
"CommonsenseQA": 49.15,
"Race": 37.81,
"MedMCQA": 22.61,
"OpenkookQA": 33.6
},
{
"model": "Gemma (2B)",
"Average": 17.37,
"MMLU": 17.52,
"WinoGrande": 22.68,
"PiQA": 15.09,
"CommonsenseQA": 27.46,
"Race": 14.32,
"MedMCQA": 4.57,
"OpenkookQA": 14.26
},
{
"model": "Qwen (1.8B)",
"Average": 21.61,
"MMLU": 10.0,
"WinoGrande": 40.97,
"PiQA": 15.52,
"CommonsenseQA": 31.13,
"Race": 34.91,
"MedMCQA": 4.7,
"OpenkookQA": 20.37
}
]
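
The file is a flat JSON array of per-model score records, one object per model, with an overall "Average" plus per-benchmark scores. A minimal sketch of how such a file could be loaded and ranked is shown below; the local filename "small (1).json" and the ranking-by-Average step are assumptions for illustration, not part of the leaderboard code itself.

```python
# Sketch: load the leaderboard JSON and print models ranked by "Average".
# The path "small (1).json" is an assumption; adjust to wherever the file lives.
import json

with open("small (1).json", encoding="utf-8") as f:
    entries = json.load(f)  # list of dicts, one per model

# Sort by the reported overall "Average" score, highest first.
ranked = sorted(entries, key=lambda e: e["Average"], reverse=True)

for rank, entry in enumerate(ranked, start=1):
    print(f'{rank:2d}. {entry["model"]:32s} Average: {entry["Average"]:.2f}')
```

Running a sketch like this against the data above would place Phi-3-mini-128k-instruct (3.8B) first, followed by Qwen (1.8B) and Gemma (2B), with the ~1B-parameter models clustered below.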