Adam Jirkovsky committed · Commit e3e7110 · Parent(s): c9612ab

Add aggregate scores and fix last column visibility

Browse files
- app.py +16 -12
- src/display/about.py +7 -0
- src/display/utils.py +9 -0
- src/populate.py +6 -0
app.py  CHANGED

@@ -22,6 +22,7 @@ from src.display.utils import (
     EVAL_TYPES,
     NUMERIC_INTERVALS,
     TYPES,
+    TYPES_LITE,
     AutoEvalColumn,
     ModelType,
     fields,
@@ -83,9 +84,12 @@ def update_table(
     columns: list,
     query: str,
 ):
+    columns += "_"  # The dataframe does not display the last column - BUG in gradio?
     #filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
+    print("TF")
+    print(df)
     return df


@@ -230,23 +234,23 @@ with demo:
     )
     """
     gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
-
+    #print(shown_columns.value)
+    leaderboard_table = gr.Dataframe(
         value=leaderboard_df[
-            [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-            + shown_columns.value
-
+            [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
         ],
         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-        datatype=
+        datatype=TYPES_LITE,
         elem_id="leaderboard-table",
         interactive=False,
         visible=True,
-        wrap=False
-        #column_widths=["2%", "2%"],
+        wrap=False
     )
+    print(leaderboard_table.value)
+    print(leaderboard_table.headers)

     # Dummy leaderboard for handling the case when the user uses backspace key
-    hidden_leaderboard_table_for_search = gr.
+    hidden_leaderboard_table_for_search = gr.Dataframe(
         value=original_df[COLS],
         headers=COLS,
         datatype=TYPES,
@@ -387,14 +391,14 @@ with demo:
         elem_id="citation-button",
         show_copy_button=True,
     )
-
+
     demo.load(
         fn=generate_captcha,
         outputs=[captcha_img, text]
     )

-
-
-
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=86400)
+scheduler.start()
 demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0")
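For context, the new scheduler lines at the bottom of app.py use APScheduler's BackgroundScheduler (presumably already imported in app.py, since the diff adds no import) to restart the Space once every 24 hours (86400 seconds). A minimal sketch of how that wiring typically looks; the `restart_space` body and the `REPO_ID` value are illustrative assumptions, not part of this commit:

```python
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi()
REPO_ID = "owner/space-name"  # assumption: the Space's repo id, configured elsewhere

def restart_space():
    # Assumption: the Space restarts itself through the Hub API, using a token
    # available to the runtime (e.g. an HF_TOKEN secret).
    API.restart_space(repo_id=REPO_ID)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)  # run once per day
scheduler.start()
```

BackgroundScheduler runs the job on a background thread, so `scheduler.start()` returns immediately and does not block the `demo.queue(...).launch(...)` call that follows it.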
src/display/about.py  CHANGED

@@ -88,6 +88,13 @@ All currently supported benchmarks are listed in the table below:
 | [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
 | [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
+The leaderboard table also displays aggregated scores across task categories, including:
+- **Grammar (Avg.):** AGREE
+- **Knowledge (Avg.):** ARC-Challenge, ARC-Easy, MMLU, TruthfulQA
+- **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
+- **Math (Avg.):** GSM8K, Klokanek
+- **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
+
 ## Evaluation Process
 
 ### 1. Install CzechBench:
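These aggregates are plain unweighted means of the listed task scores (see the populate.py hunk below). With made-up numbers, a model scoring 20.0 on GSM8K and 30.0 on Klokanek gets Math (Avg.) = (20.0 + 30.0) / 2 = 25.0; Grammar (Avg.) currently covers a single task, so it simply equals the AGREE score.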
src/display/utils.py  CHANGED

@@ -51,6 +51,11 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model URL", "str", False)])
+auto_eval_column_dict.append(["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)])
+auto_eval_column_dict.append(["knowledge_avg", ColumnContent, ColumnContent("Knowledge (Avg.)", "number", True)])
+auto_eval_column_dict.append(["reasoning_avg", ColumnContent, ColumnContent("Reasoning (Avg.)", "number", True)])
+auto_eval_column_dict.append(["math_avg", ColumnContent, ColumnContent("Math (Avg.)", "number", True)])
+auto_eval_column_dict.append(["classification_avg", ColumnContent, ColumnContent("Classification (Avg.)", "number", True)])
 auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
 auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
 auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
@@ -66,6 +71,8 @@ auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "n
 auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
 auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
 auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("_", "str", True, dummy=True)])  # The dataframe does not display the last column - BUG in gradio?
+
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -76,6 +83,7 @@ HEADER_MAP = {
     "eval_name": "Model",
     "precision": "Precision",
     "hf_model_id": "Model URL",
+    "knowledge_avg": "Knowledge (Avg.)",
     "agree_cs": "AGREE",
     "anli_cs": "ANLI",
     "arc_challenge_cs": "ARC-Challenge",
@@ -91,6 +99,7 @@ HEADER_MAP = {
     "sqad_cs": "SQAD",
     "subjectivity_cs": "Subjectivity",
     "truthfulqa_cs": "TruthfulQA",
+    "dummy": "_",
 }
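For orientation, these appends feed the leaderboard template's column registry: each `[attribute_name, type, default]` triple becomes a field of an `AutoEvalColumn` dataclass built with `make_dataclass`, and the `fields` helper that app.py imports from this module returns the `ColumnContent` defaults stored on that generated class. A condensed sketch of the pattern, assuming the usual template-style `ColumnContent` definition (only a few columns are repeated, and `frozen=True` is added here so the instances can serve as dataclass defaults):

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Assumption: standard leaderboard-template shape; only the attributes
    # used in this diff are modelled.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

auto_eval_column_dict = [
    ["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)],
    # ... remaining per-task and aggregate columns ...
    # "_" exists only to absorb the gradio issue where the last column is not rendered.
    ["dummy", ColumnContent, ColumnContent("_", "str", True, dummy=True)],
]

# Each triple becomes a field whose default is the ColumnContent instance.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

def fields(raw_class):
    # Assumption: the template's helper - collect the ColumnContent defaults
    # stored as class attributes, skipping dunder entries.
    return [v for k, v in raw_class.__dict__.items()
            if not (k.startswith("__") and k.endswith("__"))]

print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])  # -> ['Model']
```

The extra "_" column registered at the end pairs with the `df["_"] = ""` assignment in populate.py and the `columns += "_"` line in update_table: it keeps a throwaway column in last position so the last real score column stays visible in the gradio Dataframe.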
src/populate.py  CHANGED

@@ -23,6 +23,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df.rename(columns=HEADER_MAP)
     df[df.select_dtypes(include=['number']).columns] *= 100  # convert to percentage
+    df["Grammar (Avg.)"] = df[["AGREE"]].mean(axis=1)
+    df["Knowledge (Avg.)"] = df[["ARC-Challenge", "ARC-Easy", "MMLU", "TruthfulQA"]].mean(axis=1)
+    df["Reasoning (Avg.)"] = df[["ANLI", "Belebele", "CTKFacts", "SQAD"]].mean(axis=1)
+    df["Math (Avg.)"] = df[["GSM8K", "Klokanek"]].mean(axis=1)
+    df["Classification (Avg.)"] = df[["Czech News", "Facebook Comments", "Mall Reviews", "Subjectivity"]].mean(axis=1)
+    df["_"] = ""  # The dataframe does not display the last column - BUG in gradio?
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
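A small usage illustration of what the new aggregate columns compute, with made-up scores. `DataFrame.mean(axis=1)` averages the listed task columns row-wise and skips NaN by default, so a model missing one benchmark still gets an average over the tasks it does have:

```python
import pandas as pd

# Hypothetical per-task scores for two models (already scaled to percentages).
df = pd.DataFrame(
    {
        "ARC-Challenge": [40.0, 55.0],
        "ARC-Easy": [60.0, 70.0],
        "MMLU": [45.0, None],  # second model is missing MMLU
        "TruthfulQA": [35.0, 41.0],
    },
    index=["model-a", "model-b"],
)

df["Knowledge (Avg.)"] = df[["ARC-Challenge", "ARC-Easy", "MMLU", "TruthfulQA"]].mean(axis=1)
print(df["Knowledge (Avg.)"])
# model-a    45.000000   -> (40 + 60 + 45 + 35) / 4
# model-b    55.333333   -> (55 + 70 + 41) / 3, NaN skipped
```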