Adam Jirkovsky committed
Commit 2fc1b8f · 1 Parent(s): e3e7110

Add graphical results comparison

Files changed (4)
  1. app.py +29 -8
  2. src/display/about.py +2 -1
  3. src/display/utils.py +2 -0
  4. src/populate.py +2 -9
app.py CHANGED
@@ -14,7 +14,7 @@ from src.display.about import (
     TABLE_DESC,
 )
 from src.display.css_html_js import custom_css
-from src.display.formatting import styled_error, styled_message, styled_warning
+from src.display.formatting import styled_error, styled_message, styled_warning, model_hyperlink
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
@@ -35,6 +35,7 @@ from src.submission.submit import add_new_eval
 from captcha.image import ImageCaptcha
 from PIL import Image
 import random, string
+import matplotlib.pyplot as plt
 
 
 original_df = None
@@ -44,6 +45,12 @@ leaderboard_df = None
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
+def add_model_hyperlinks(row):
+    if row["Model URL"] is None or row["Model URL"] == "":
+        return row["Model"]
+    else:
+        return model_hyperlink(row["Model URL"], row["Model"])
+
 def download_data():
     global original_df
     global leaderboard_df
@@ -65,7 +72,8 @@ def download_data():
 
     _, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     leaderboard_df = original_df.copy()
-
+    leaderboard_df["Model"] = leaderboard_df.apply(add_model_hyperlinks, axis=1)
+    leaderboard_df.sort_values(by=["Aggregate Score"], ascending=False, inplace=True)
 
 download_data()
 
@@ -88,8 +96,6 @@ def update_table(
     #filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
-    print("TF")
-    print(df)
     return df
 
 
@@ -234,7 +240,6 @@ with demo:
             )
             """
             gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
-            #print(shown_columns.value)
             leaderboard_table = gr.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
@@ -244,10 +249,8 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                wrap=False
+                wrap=False,
             )
-            print(leaderboard_table.value)
-            print(leaderboard_table.headers)
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.Dataframe(
@@ -278,6 +281,24 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+
+            model_num = len(original_df)
+            graph_df = original_df.drop(columns=["_", "Precision", "Model URL"]).set_index("Model").T
+            graph_ax = graph_df.plot(
+                kind="barh",
+                title="Graphical performance comparison",
+                xlabel="Accuracy [%]",
+                ylabel="Model",
+                width=0.9,
+                figsize=(15, 7 + 2*model_num),
+            )
+            graph_ax.invert_yaxis()
+            for container in graph_ax.containers:
+                graph_ax.bar_label(container, fontsize=8, fmt="%.1f")
+            graph_ax.legend(loc='center left', bbox_to_anchor=(1.01, 0.95))
+            plt.tight_layout(rect=[0, 0, 0.95, 1])
+
+            plot = gr.Plot(graph_ax.get_figure(), label="Graphical performance comparison")
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
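For reference, the comparison chart added above can be reproduced outside the Space. The snippet below is a minimal standalone sketch with made-up scores standing in for original_df; writing the figure to a PNG stands in for handing it to gr.Plot, and all data values and file names here are illustrative only.

```python
# Minimal sketch of the horizontal-bar comparison added above (toy data, not real results).
import matplotlib
matplotlib.use("Agg")  # headless backend, as a hosted Space would use
import matplotlib.pyplot as plt
import pandas as pd

# Hypothetical stand-in for original_df after dropping "_", "Precision" and "Model URL".
toy_df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Grammar (Avg.)": [71.2, 64.8],
    "Knowledge (Avg.)": [58.4, 61.0],
    "Aggregate Score": [64.8, 62.9],
})

model_num = len(toy_df)
graph_df = toy_df.set_index("Model").T  # rows become benchmarks, one bar per model
ax = graph_df.plot(
    kind="barh",
    title="Graphical performance comparison",
    xlabel="Accuracy [%]",
    ylabel="Model",
    width=0.9,
    figsize=(15, 7 + 2 * model_num),  # grow the figure with the number of models
)
ax.invert_yaxis()  # keep the first benchmark group at the top
for container in ax.containers:
    ax.bar_label(container, fontsize=8, fmt="%.1f")  # annotate each bar with its accuracy
ax.legend(loc="center left", bbox_to_anchor=(1.01, 0.95))  # place the legend outside the axes
plt.tight_layout(rect=[0, 0, 0.95, 1])  # leave room for the legend
plt.savefig("comparison.png")  # in the Space, the figure is passed to gr.Plot instead
```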
src/display/about.py CHANGED
@@ -31,7 +31,7 @@ class Tasks(Enum):
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🇨🇿 CzechBench Leaderboard</h1>"""
 
-TABLE_DESC = "The values shown in the table represent the accuracy metric in percentage."
+TABLE_DESC = "The values shown in the leaderboard table represent the accuracy metric in percentage."
 
 # What does your leaderboard evaluate?
 INTRODUCTION_OLD = """
@@ -94,6 +94,7 @@ The leaderboard table also displays aggregated scores across task categories, in
 - **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
 - **Math (Avg.):** GSM8K, Klokanek
 - **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
+- **Aggregate Score:** Average over above categories
 
 ## Evaluation Process
 
src/display/utils.py CHANGED
@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model URL", "str", False)])
+auto_eval_column_dict.append(["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)])
 auto_eval_column_dict.append(["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)])
 auto_eval_column_dict.append(["knowledge_avg", ColumnContent, ColumnContent("Knowledge (Avg.)", "number", True)])
 auto_eval_column_dict.append(["reasoning_avg", ColumnContent, ColumnContent("Reasoning (Avg.)", "number", True)])
@@ -100,6 +101,7 @@ HEADER_MAP = {
     "subjectivity_cs": "Subjectivity",
     "truthfulqa_cs": "TruthfulQA",
     "dummy": "_",
+    "aggregate_score": "Aggregate Score",
 }
 
 
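The column registration above follows the common leaderboard-template pattern in which auto_eval_column_dict is turned into the AutoEvalColumn class that app.py iterates with fields(AutoEvalColumn). The sketch below is an assumption-laden reconstruction of that mechanism (the repo's actual ColumnContent and fields definitions may differ), showing how the new "Aggregate Score" entry becomes a selectable column.

```python
# Hedged sketch of the auto_eval_column_dict -> AutoEvalColumn mechanism (assumed, template-style).
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # assumed field layout: display name, cell type, shown-by-default flag
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Template-style helper: return the ColumnContent defaults attached to the generated class.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

auto_eval_column_dict = []
auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["aggregate_score", ColumnContent, ColumnContent("Aggregate Score", "number", True)])
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# Mirrors the column selections app.py makes when building the leaderboard table.
print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])          # ['Model']
print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])  # ['Model', 'Aggregate Score']
```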
src/populate.py CHANGED
@@ -9,13 +9,6 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def add_model_hyperlinks(row):
-    if row["Model URL"] is None or row["Model URL"] == "":
-        return row["Model"]
-    else:
-        return model_hyperlink(row["Model URL"], row["Model"])
-
-
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     #all_data_json = [v.to_dict() for v in raw_data]
@@ -28,14 +21,14 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df["Reasoning (Avg.)"] = df[["ANLI", "Belebele", "CTKFacts", "SQAD"]].mean(axis=1)
     df["Math (Avg.)"] = df[["GSM8K", "Klokanek"]].mean(axis=1)
     df["Classification (Avg.)"] = df[["Czech News", "Facebook Comments", "Mall Reviews", "Subjectivity"]].mean(axis=1)
+    df["Aggregate Score"] = df[["Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)", "Math (Avg.)", "Classification (Avg.)"]].mean(axis=1)
     df["_"] = "" # The dataframe does not display the last column - BUG in gradio?
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    df['Model'] = df.apply(add_model_hyperlinks, axis=1)
 
-    return raw_data, df
+    return raw_data, df,
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
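The Aggregate Score introduced here is simply the row-wise mean of the five category averages computed a few lines earlier. A small self-contained illustration with invented numbers (not real leaderboard results):

```python
# Toy illustration of the Aggregate Score computed in get_leaderboard_df (numbers are made up).
import pandas as pd

df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Grammar (Avg.)": [70.0, 55.0],
    "Knowledge (Avg.)": [60.0, 65.0],
    "Reasoning (Avg.)": [50.0, 45.0],
    "Math (Avg.)": [30.0, 20.0],
    "Classification (Avg.)": [80.0, 75.0],
})

# Row-wise mean over the five category averages, matching the line added above.
df["Aggregate Score"] = df[
    ["Grammar (Avg.)", "Knowledge (Avg.)", "Reasoning (Avg.)", "Math (Avg.)", "Classification (Avg.)"]
].mean(axis=1)

print(df[["Model", "Aggregate Score"]])
#      Model  Aggregate Score
# 0  model-a             58.0
# 1  model-b             52.0
```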