Adam Jirkovsky committed
Commit 2fc1b8f · 1 Parent(s): e3e7110

Add graphical results comparison

Files changed:
- app.py +29 -8
- src/display/about.py +2 -1
- src/display/utils.py +2 -0
- src/populate.py +2 -9
app.py CHANGED

@@ -14,7 +14,7 @@ from src.display.about import (
     TABLE_DESC,
 )
 from src.display.css_html_js import custom_css
-from src.display.formatting import styled_error, styled_message, styled_warning
+from src.display.formatting import styled_error, styled_message, styled_warning, model_hyperlink
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
@@ -35,6 +35,7 @@ from src.submission.submit import add_new_eval
 from captcha.image import ImageCaptcha
 from PIL import Image
 import random, string
+import matplotlib.pyplot as plt
 
 
 original_df = None
@@ -44,6 +45,12 @@ leaderboard_df = None
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+def add_model_hyperlinks(row):
+    if row["Model URL"] is None or row["Model URL"] == "":
+        return row["Model"]
+    else:
+        return model_hyperlink(row["Model URL"], row["Model"])
+
 def download_data():
     global original_df
     global leaderboard_df
@@ -65,7 +72,8 @@ def download_data():
 
     _, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     leaderboard_df = original_df.copy()
-
+    leaderboard_df["Model"] = leaderboard_df.apply(add_model_hyperlinks, axis=1)
+    leaderboard_df.sort_values(by=["Aggregate Score"], ascending=False, inplace=True)
 
 download_data()
 
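The add_model_hyperlinks helper added above moves here from src/populate.py (see that file's diff below), so the link markup is applied only to the display copy leaderboard_df while original_df keeps plain model names for the new plot. A minimal sketch of the intended behaviour on toy data; the model_hyperlink stand-in below is an assumption, the real one lives in src.display.formatting:

```python
import pandas as pd

# Hypothetical stand-in for src.display.formatting.model_hyperlink (assumed to
# return anchor markup that the table's "markdown" column type renders as a link).
def model_hyperlink(link: str, model_name: str) -> str:
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def add_model_hyperlinks(row: pd.Series) -> str:
    # Fall back to the plain name when no URL was submitted with the model.
    if row["Model URL"] is None or row["Model URL"] == "":
        return row["Model"]
    return model_hyperlink(row["Model URL"], row["Model"])

# Toy frame; the URL is invented for illustration.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Model URL": ["https://huggingface.co/org/model-a", ""],
})
# Row-wise apply, as download_data() does for leaderboard_df.
print(df.apply(add_model_hyperlinks, axis=1).tolist())
# ['<a target="_blank" href="https://huggingface.co/org/model-a">model-a</a>', 'model-b']
```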
@@ -88,8 +96,6 @@ def update_table(
     #filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
-    print("TF")
-    print(df)
     return df
 
 
@@ -234,7 +240,6 @@ with demo:
             )
             """
            gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
-            #print(shown_columns.value)
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df[
                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
@@ -244,10 +249,8 @@ with demo:
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
-                wrap=False
+                wrap=False,
            )
-            print(leaderboard_table.value)
-            print(leaderboard_table.headers)
 
            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.Dataframe(
@@ -278,6 +281,24 @@ with demo:
                leaderboard_table,
                queue=True,
            )
+
+            model_num = len(original_df)
+            graph_df = original_df.drop(columns=["_", "Precision", "Model URL"]).set_index("Model").T
+            graph_ax = graph_df.plot(
+                kind="barh",
+                title="Graphical performance comparison",
+                xlabel="Accuracy [%]",
+                ylabel="Model",
+                width=0.9,
+                figsize=(15, 7 + 2*model_num),
+            )
+            graph_ax.invert_yaxis()
+            for container in graph_ax.containers:
+                graph_ax.bar_label(container, fontsize=8, fmt="%.1f")
+            graph_ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.95))
+            plt.tight_layout(rect=[0, 0, 0.95, 1])
+
+            plot = gr.Plot(graph_ax.get_figure(), label="Graphical performance comparison")
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
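The plotting block added at the end of app.py transposes original_df so that each task category becomes a row on the y-axis and each model becomes one bar series (the legend lists models, the y-axis lists categories, despite ylabel="Model"). A self-contained sketch of the same technique on invented scores, assuming an off-screen backend is acceptable:

```python
import matplotlib
matplotlib.use("Agg")  # render off-screen; Gradio only needs the Figure object
import matplotlib.pyplot as plt
import pandas as pd

# Toy stand-in for original_df after dropping helper columns (values are invented).
scores = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Grammar (Avg.)": [71.3, 65.8],
    "Knowledge (Avg.)": [58.2, 61.0],
    "Math (Avg.)": [34.5, 29.1],
})

model_num = len(scores)
graph_df = scores.set_index("Model").T  # rows = task categories, columns = models

ax = graph_df.plot(
    kind="barh",
    title="Graphical performance comparison",
    xlabel="Accuracy [%]",
    ylabel="Model",
    width=0.9,
    figsize=(15, 7 + 2 * model_num),   # grow the figure with the number of models
)
ax.invert_yaxis()                       # keep the first category at the top
for container in ax.containers:         # one container per model (bar series)
    ax.bar_label(container, fontsize=8, fmt="%.1f")
ax.legend(loc="center left", bbox_to_anchor=(1.01, 0.95))
plt.tight_layout(rect=[0, 0, 0.95, 1])  # leave room for the legend on the right

fig = ax.get_figure()                   # this Figure is what gr.Plot() would display
fig.savefig("comparison.png")
```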
src/display/about.py CHANGED

@@ -31,7 +31,7 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
-TABLE_DESC = "The values shown in the table represent the accuracy metric in percentage."
+TABLE_DESC = "The values shown in the leaderboard table represent the accuracy metric in percentage."
 
 # What does your leaderboard evaluate?
 INTRODUCTION_OLD = """
@@ -94,6 +94,7 @@ The leaderboard table also displays aggregated scores across task categories, in
 - **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
 - **Math (Avg.):** GSM8K, Klokanek
 - **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
+- **Aggregate Score:** Average over above categories
 
 ## Evaluation Process
 
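The new bullet documents the Aggregate Score as the plain average of the five category averages (Grammar, Knowledge, Reasoning, Math, Classification); the computation itself lands in src/populate.py below. A quick illustration with invented numbers:

```python
# Invented category averages for one model, in percent.
grammar, knowledge, reasoning, math_avg, classification = 70.0, 60.0, 55.0, 30.0, 65.0

# Unweighted mean over the five categories, matching the populate.py change below.
aggregate_score = (grammar + knowledge + reasoning + math_avg + classification) / 5
print(aggregate_score)  # 56.0
```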
src/display/utils.py CHANGED

@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model URL", "str", False)])
+auto_eval_column_dict.append(["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)])
 auto_eval_column_dict.append(["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)])
 auto_eval_column_dict.append(["knowledge_avg", ColumnContent, ColumnContent("Knowledge (Avg.)", "number", True)])
 auto_eval_column_dict.append(["reasoning_avg", ColumnContent, ColumnContent("Reasoning (Avg.)", "number", True)])
@@ -100,6 +101,7 @@ HEADER_MAP = {
     "subjectivity_cs": "Subjectivity",
     "truthfulqa_cs": "TruthfulQA",
     "dummy": "_",
+    "aggregate_score": "Aggregate Score",
 }
 
 
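A single append to auto_eval_column_dict plus a HEADER_MAP entry is enough to surface the new column, because leaderboard templates of this kind typically turn the dict into a frozen dataclass whose field defaults carry the display metadata. The sketch below reproduces that pattern under the assumption that ColumnContent and the fields() helper look like they do in the stock Hugging Face leaderboard template; this repo's actual definitions may differ:

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # assumed shape, matching the constructor calls in the diff above
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)],
]

# Template-style construction: each entry becomes a dataclass field whose default
# is the ColumnContent instance holding the display metadata.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

def fields(raw_class):
    # Assumed helper: yield the ColumnContent defaults attached to the generated class.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

COLS = [c.name for c in fields(AutoEvalColumn)]
print(COLS)  # ['Model', 'Aggregate Score']
```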
src/populate.py CHANGED

@@ -9,13 +9,6 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def add_model_hyperlinks(row):
-    if row["Model URL"] is None or row["Model URL"] == "":
-        return row["Model"]
-    else:
-        return model_hyperlink(row["Model URL"], row["Model"])
-
-
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     #all_data_json = [v.to_dict() for v in raw_data]
@@ -28,14 +21,14 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df["Reasoning (Avg.)"] = df[["ANLI", "Belebele", "CTKFacts", "SQAD"]].mean(axis=1)
     df["Math (Avg.)"] = df[["GSM8K", "Klokanek"]].mean(axis=1)
     df["Classification (Avg.)"] = df[["Czech News", "Facebook Comments", "Mall Reviews", "Subjectivity"]].mean(axis=1)
+    df["Aggregate Score"] = df[["Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)", "Math (Avg.)", "Classification (Avg.)"]].mean(axis=1)
     df["_"] = "" # The dataframe does not display the last column - BUG in gradio?
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    df['Model'] = df.apply(add_model_hyperlinks, axis=1)
 
-    return raw_data, df
+    return raw_data, df,
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
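Taken together with the app.py change, the new column is a row-wise mean over the category averages and then serves as the default sort key. A condensed sketch of just that aggregation step on an invented results frame (the real get_leaderboard_df also rounds the full table and drops rows with missing benchmark values):

```python
import pandas as pd

# Invented category averages; the column names follow the diff above.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Grammar (Avg.)": [71.34, 65.81],
    "Knowledge (Avg.)": [58.20, 61.02],
    "Reasoning (Avg.)": [55.11, 57.46],
    "Math (Avg.)": [34.50, 29.13],
    "Classification (Avg.)": [64.98, 60.42],
})

category_cols = [
    "Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)",
    "Math (Avg.)", "Classification (Avg.)",
]
# Unweighted row-wise mean over the five category averages, as in get_leaderboard_df().
df["Aggregate Score"] = df[category_cols].mean(axis=1)
df = df.round(decimals=2)

# app.py then sorts the display copy by this column, highest first.
df.sort_values(by=["Aggregate Score"], ascending=False, inplace=True)
print(df[["Model", "Aggregate Score"]])  # model-a ≈ 56.83, model-b ≈ 54.77
```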