Spaces:
Running
Running
Upload 5 files
Browse files- app.py +99 -0
- big (1).json +68 -0
- constants.py +26 -0
- requirements (1).txt +2 -0
- small (1).json +134 -0
app.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from constants import INTRODUCTION_TEXT,CITATION_TEXT
|
4 |
+
|
5 |
+
|
6 |
+
# Define the formatter function
|
7 |
+
def formatter(x):
    """Round a numeric cell to two decimal places, leaving other cells as-is.

    Args:
        x: A single cell value (number, string, ``None``, ...).

    Returns:
        ``round(x, 2)`` when ``x`` supports rounding; otherwise ``x`` itself,
        unchanged.
    """
    try:
        return round(x, 2)
    except (TypeError, ValueError):
        # Non-numeric values such as strings or None cannot be rounded;
        # pass them through untouched instead of hiding every possible error.
        return x
|
12 |
+
|
13 |
+
|
14 |
+
# Load the leaderboard tables. pd.read_json already returns a DataFrame, so
# no additional pd.DataFrame(...) wrapping is required.
# NOTE(review): the files uploaded alongside this script are named
# "big (1).json" / "small (1).json" -- confirm they are stored under the names
# opened here.
jsond_data = pd.read_json('big.json')
original_df = jsond_data
print(original_df)

jsond_data2 = pd.read_json('small.json')
Small_original_df = jsond_data2
print(Small_original_df)

# Apply formatter to every cell of both tables. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated since pandas 2.1.
original_df = original_df.apply(lambda column: column.map(formatter))
Small_original_df = Small_original_df.apply(lambda column: column.map(formatter))
|
27 |
+
|
28 |
+
|
29 |
+
# Data types for the Gradio DataFrame component, one entry per column:
# the "model" column is text, followed by eight numeric score columns
# ("Average" plus the seven benchmark columns in the JSON files).
TYPES = ['str'] + ['number'] * 8


LAST_UPDATED = "May 10th 2024"

# CSS for styling
css = """
.markdown-text{font-size: 200pt}
.markdown-text-small{font-size: 13pt}
th {
text-align: center;
}
td {
font-size: 15px; /* Adjust the font size as needed */
text-align: center;
}
#od-benchmark-tab-table-button{
font-size: 15pt;
font-weight: bold;
}

#Intro{
font-size: 100pt;
}
"""
|
55 |
+
|
56 |
+
|
57 |
+
def _leaderboard_table(df, label, types):
    """Create the read-only Dataframe component used by each leaderboard tab."""
    return gr.components.Dataframe(
        value=df,
        datatype=types,
        label=label,
        height=1000,
        wrap=False,
        interactive=False,
        visible=True,
        min_width=60,
    )


def build_demo(original_df,Small_original_df, TYPES):
    """Assemble the leaderboard UI.

    The layout is: the introduction markdown, one tab per leaderboard table,
    a "last updated" line and a collapsible citation box.

    Args:
        original_df: Table shown in the "Leaderboard_Large" tab.
        Small_original_df: Table shown in the "Leaderboard_Small" tab.
        TYPES: Per-column datatype list passed to both Dataframe components.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(css=css) as demo:
        gr.Markdown(INTRODUCTION_TEXT, elem_id="Intro")
        with gr.Tabs():
            # Both tabs share one elem_id; the stylesheet targets
            # "#od-benchmark-tab-table-button".
            with gr.TabItem("🏅Leaderboard_Large",elem_id="od-benchmark-tab-table", id=0):
                _leaderboard_table(original_df, "Leaderboard_Big", TYPES)

            with gr.TabItem("🏅 Leaderboard_Small",elem_id="od-benchmark-tab-table", id=1):
                _leaderboard_table(Small_original_df, "Leaderboard_small", TYPES)

        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                gr.Textbox(
                    value=CITATION_TEXT, lines=18,
                    label="",
                    elem_id="citation-button",
                    show_copy_button=True)

    return demo
|
97 |
+
|
98 |
+
# Build the app from the two formatted leaderboard tables and start it.
demo = build_demo(original_df,Small_original_df, TYPES)
# `share` is a boolean flag, so pass True rather than the string 'True'.
demo.launch(share=True)
|
big (1).json
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"model": "GPT-4",
|
4 |
+
"Average": 65.94,
|
5 |
+
"MMLU": 74.8,
|
6 |
+
"WinoGrande": 66.2,
|
7 |
+
"PiQA": 61.6,
|
8 |
+
"CommonsenseQA": 63.0,
|
9 |
+
"Race": 67.0,
|
10 |
+
"MedMCQA": 51.8,
|
11 |
+
"OpenkookQA": 60.3
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"model": "Claude-3 Opus",
|
15 |
+
"Average": 62.64,
|
16 |
+
"MMLU": 70.4,
|
17 |
+
"WinoGrande": 63.5,
|
18 |
+
"PiQA": 59.1,
|
19 |
+
"CommonsenseQA": 63.7,
|
20 |
+
"Race": 66.2,
|
21 |
+
"MedMCQA": 49.1,
|
22 |
+
"OpenkookQA": 54.0
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"model": "Mistral Large",
|
26 |
+
"Average": 61.45,
|
27 |
+
"MMLU": 67.8,
|
28 |
+
"WinoGrande": 56.8,
|
29 |
+
"PiQA": 61.2,
|
30 |
+
"CommonsenseQA": 55.4,
|
31 |
+
"Race": 70.1,
|
32 |
+
"MedMCQA": 43.4,
|
33 |
+
"OpenkookQA": 58.7
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"model": "GPT-3.5",
|
37 |
+
"Average": 59.06,
|
38 |
+
"MMLU": 65.4,
|
39 |
+
"WinoGrande": 54.6,
|
40 |
+
"PiQA": 54.9,
|
41 |
+
"CommonsenseQA": 67.9,
|
42 |
+
"Race": 60.1,
|
43 |
+
"MedMCQA": 41.4,
|
44 |
+
"OpenkookQA": 49.9
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"model": "Gemini Pro",
|
48 |
+
"Average": 54.45,
|
49 |
+
"MMLU": 57.7,
|
50 |
+
"WinoGrande": 56.4,
|
51 |
+
"PiQA": 47.7,
|
52 |
+
"CommonsenseQA": 50.6,
|
53 |
+
"Race": 61.0,
|
54 |
+
"MedMCQA": 37.5,
|
55 |
+
"OpenkookQA": 52.5
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"model": "Llama3-70b-instruct",
|
59 |
+
"Average": 54.06,
|
60 |
+
"MMLU": 64.67,
|
61 |
+
"WinoGrande": 57.14,
|
62 |
+
"PiQA": 43.1,
|
63 |
+
"CommonsenseQA": 55.49,
|
64 |
+
"Race": 58.21,
|
65 |
+
"MedMCQA": 41.67,
|
66 |
+
"OpenkookQA": 41.93
|
67 |
+
}
|
68 |
+
]
|
constants.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
# Banner image URL and the HTML snippet that embeds it, centered.
banner_url = "https://huggingface.co/spaces/WildEval/WildBench-Leaderboard/resolve/main/%E2%80%8Eleaderboard_logo_v2.png" # the same repo here.
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'

# Markdown introduction: title, link bar and a short project description.
INTRODUCTION_TEXT= """
# OS Benchmark (Evaluating LLMs with OS and MCQ)
🔗 [Website](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 💻 [GitHub](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 📖 [Paper](#) | 🐦 [Tweet 1](#) | 🐦 [Tweet 2](#)

> ### MBZUAI-LLM-Leaderboard, a new framework for evaluating large language models (LLMs) by transitioning from multiple-choice questions (MCQs) to open-style questions.
This approach addresses the inherent biases and limitations of MCQs, such as selection bias and the effect of random guessing. By utilizing open-style questions,
the framework aims to provide a more accurate assessment of LLMs' abilities across various benchmarks and ensure that the evaluation reflects true capabilities,
particularly in terms of language understanding and reasoning.

"""

# BibTeX citation text. The entry type must be the valid BibTeX type
# "article"; the citation key ("..") and author list are still placeholders.
CITATION_TEXT = """@article{..,
title={MBZUAI-LLM-Leaderboard: From Multi-choice to Open-style Questions for LLMs Evaluation, Benchmark, and Arena},
author={},
year={2024},
archivePrefix={arXiv}
}
"""
|
requirements (1).txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
pandas
|
small (1).json
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"model": "OPT (1.3B)",
|
4 |
+
"Average": 7.84,
|
5 |
+
"MMLU": 7.4,
|
6 |
+
"WinoGrande": 12.47,
|
7 |
+
"PiQA": 4.45,
|
8 |
+
"CommonsenseQA": 7.61,
|
9 |
+
"Race": 13.61,
|
10 |
+
"MedMCQA": 1.25,
|
11 |
+
"OpenkookQA": 4.48
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"model": "SlimPajama",
|
15 |
+
"Average": 9.54,
|
16 |
+
"MMLU": 9.22,
|
17 |
+
"WinoGrande": 14.76,
|
18 |
+
"PiQA": 5.32,
|
19 |
+
"CommonsenseQA": 9.01,
|
20 |
+
"Race": 16.19,
|
21 |
+
"MedMCQA": 1.68,
|
22 |
+
"OpenkookQA": 5.7
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"model": "OLMo (1B)",
|
26 |
+
"Average": 8.8,
|
27 |
+
"MMLU": 8.54,
|
28 |
+
"WinoGrande": 6.16,
|
29 |
+
"PiQA": 8.05,
|
30 |
+
"CommonsenseQA": 13.1,
|
31 |
+
"Race": 13.61,
|
32 |
+
"MedMCQA": 2.1,
|
33 |
+
"OpenkookQA": 6.11
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"model": "GPT-Neo (1.3B)",
|
37 |
+
"Average": 7.38,
|
38 |
+
"MMLU": 6.94,
|
39 |
+
"WinoGrande": 10.81,
|
40 |
+
"PiQA": 4.31,
|
41 |
+
"CommonsenseQA": 6.34,
|
42 |
+
"Race": 13.75,
|
43 |
+
"MedMCQA": 2.63,
|
44 |
+
"OpenkookQA": 4.89
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"model": "Cerebras-GPT (1.3B)",
|
48 |
+
"Average": 4.84,
|
49 |
+
"MMLU": 5.37,
|
50 |
+
"WinoGrande": 9.31,
|
51 |
+
"PiQA": 2.16,
|
52 |
+
"CommonsenseQA": 6.2,
|
53 |
+
"Race": 6.9,
|
54 |
+
"MedMCQA": 1.04,
|
55 |
+
"OpenkookQA": 3.46
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"model": "RedPajama (1B)",
|
59 |
+
"Average": 9.01,
|
60 |
+
"MMLU": 9.21,
|
61 |
+
"WinoGrande": 16.97,
|
62 |
+
"PiQA": 1.39,
|
63 |
+
"CommonsenseQA": 11.41,
|
64 |
+
"Race": 14.35,
|
65 |
+
"MedMCQA": 1.86,
|
66 |
+
"OpenkookQA": 3.87
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"model": "Pythia (1.4B)",
|
70 |
+
"Average": 8.73,
|
71 |
+
"MMLU": 9.66,
|
72 |
+
"WinoGrande": 11.52,
|
73 |
+
"PiQA": 4.17,
|
74 |
+
"CommonsenseQA": 9.01,
|
75 |
+
"Race": 12.76,
|
76 |
+
"MedMCQA": 3.19,
|
77 |
+
"OpenkookQA": 5.3
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"model": "TinyLLama (1.1B)",
|
81 |
+
"Average": 8.39,
|
82 |
+
"MMLU": 8.94,
|
83 |
+
"WinoGrande": 12.23,
|
84 |
+
"PiQA": 3.59,
|
85 |
+
"CommonsenseQA": 6.06,
|
86 |
+
"Race": 16.7,
|
87 |
+
"MedMCQA": 2.07,
|
88 |
+
"OpenkookQA": 4.68
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"model": "OELM (1B)",
|
92 |
+
"Average": 8.99,
|
93 |
+
"MMLU": 9.03,
|
94 |
+
"WinoGrande": 10.18,
|
95 |
+
"PiQA": 9.05,
|
96 |
+
"CommonsenseQA": 7.75,
|
97 |
+
"Race": 12.78,
|
98 |
+
"MedMCQA": 2.5,
|
99 |
+
"OpenkookQA": 6.31
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"model": "Phi-3-mini-128k-instruct (3.8B)",
|
103 |
+
"Average": 39.73,
|
104 |
+
"MMLU": 36.97,
|
105 |
+
"WinoGrande": 46.88,
|
106 |
+
"PiQA": 32.04,
|
107 |
+
"CommonsenseQA": 49.15,
|
108 |
+
"Race": 37.81,
|
109 |
+
"MedMCQA": 22.61,
|
110 |
+
"OpenkookQA": 33.6
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"model": "Gemma (2B)",
|
114 |
+
"Average": 17.37,
|
115 |
+
"MMLU": 17.52,
|
116 |
+
"WinoGrande": 22.68,
|
117 |
+
"PiQA": 15.09,
|
118 |
+
"CommonsenseQA": 27.46,
|
119 |
+
"Race": 14.32,
|
120 |
+
"MedMCQA": 4.57,
|
121 |
+
"OpenkookQA": 14.26
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"model": "Qwen (1.8B)",
|
125 |
+
"Average": 21.61,
|
126 |
+
"MMLU": 10.0,
|
127 |
+
"WinoGrande": 40.97,
|
128 |
+
"PiQA": 15.52,
|
129 |
+
"CommonsenseQA": 31.13,
|
130 |
+
"Race": 34.91,
|
131 |
+
"MedMCQA": 4.7,
|
132 |
+
"OpenkookQA": 20.37
|
133 |
+
}
|
134 |
+
]
|