Adam Jirkovsky committed
Commit e3e7110 · 1 Parent(s): c9612ab

Add aggregate scores and fix last column visibility

Files changed (4)
  1. app.py +16 -12
  2. src/display/about.py +7 -0
  3. src/display/utils.py +9 -0
  4. src/populate.py +6 -0
app.py CHANGED
@@ -22,6 +22,7 @@ from src.display.utils import (
     EVAL_TYPES,
     NUMERIC_INTERVALS,
     TYPES,
+    TYPES_LITE,
     AutoEvalColumn,
     ModelType,
     fields,
@@ -83,9 +84,12 @@ def update_table(
     columns: list,
     query: str,
 ):
+    columns += "_" # The dataframe does not display the last column - BUG in gradio?
     #filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
+    print("TF")
+    print(df)
     return df
 
 
@@ -230,23 +234,23 @@ with demo:
         )
         """
         gr.Markdown(TABLE_DESC, elem_classes="markdown-text")
-        leaderboard_table = gr.components.Dataframe(
+        #print(shown_columns.value)
+        leaderboard_table = gr.Dataframe(
             value=leaderboard_df[
-                [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                + shown_columns.value
-
+                [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value
             ],
             headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-            datatype=TYPES,
+            datatype=TYPES_LITE,
             elem_id="leaderboard-table",
             interactive=False,
             visible=True,
-            wrap=False,
-            #column_widths=["2%", "2%"],
+            wrap=False
         )
+        print(leaderboard_table.value)
+        print(leaderboard_table.headers)
 
         # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = gr.components.Dataframe(
+        hidden_leaderboard_table_for_search = gr.Dataframe(
             value=original_df[COLS],
             headers=COLS,
             datatype=TYPES,
@@ -387,14 +391,14 @@ with demo:
             elem_id="citation-button",
             show_copy_button=True,
         )
-
+
     demo.load(
         fn=generate_captcha,
        outputs=[captcha_img, text]
     )
 
-#scheduler = BackgroundScheduler()
-#scheduler.add_job(restart_space, "interval", seconds=3600)
-#scheduler.start()
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=86400)
+scheduler.start()
 demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0")
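Not part of the commit: the last hunk above re-enables a periodic Space restart via APScheduler, now at a 24-hour interval instead of the previously commented-out hourly one. Below is a minimal, self-contained sketch of that pattern; the real `restart_space` helper lives elsewhere in app.py and is not shown in this diff, so the placeholder here is hypothetical.

```python
# Minimal sketch of the daily-restart pattern re-enabled in app.py (not the app's actual code).
from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    # Hypothetical placeholder; the real helper in app.py (not shown in this diff)
    # restarts the Hugging Face Space, e.g. via the Hub API.
    print("Restarting Space...")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)  # run once every 24 hours
scheduler.start()
```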
src/display/about.py CHANGED
@@ -88,6 +88,13 @@ All currently supported benchmarks are listed in the table below:
 | [Subjectivity](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/subjectivity_cs) | CS (Original) | Subjectivity Analysis | Acc, Macro F1 | 2000 | subjectivity_cs |
 | [TruthfulQA](https://github.com/jirkoada/czechbench_eval_harness/tree/main/lm_eval/tasks/czechbench/truthfulqa_cs) | CS (Translated) | Knowledge-Based QA | Acc | 813 | truthfulqa_cs |
 
+The leaderboard table also displays aggregated scores across task categories, including:
+- **Grammar (Avg.):** AGREE
+- **Knowledge (Avg.):** ARC-Challenge, ARC-Easy, MMLU, TruthfulQA
+- **Reasoning (Avg.):** ANLI, Belebele, CTKFacts, SQAD
+- **Math (Avg.):** GSM8K, Klokanek
+- **Classification (Avg.):** Czech News, Facebook Comments, Mall Reviews, Subjectivity
+
 ## Evaluation Process
 
 ### 1. Install CzechBench:
src/display/utils.py CHANGED
@@ -51,6 +51,11 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model URL", "str", False)])
+auto_eval_column_dict.append(["grammar_avg", ColumnContent, ColumnContent("Grammar (Avg.)", "number", True)])
+auto_eval_column_dict.append(["knowledge_avg", ColumnContent, ColumnContent("Knowledge (Avg.)", "number", True)])
+auto_eval_column_dict.append(["reasoning_avg", ColumnContent, ColumnContent("Reasoning (Avg.)", "number", True)])
+auto_eval_column_dict.append(["math_avg", ColumnContent, ColumnContent("Math (Avg.)", "number", True)])
+auto_eval_column_dict.append(["classification_avg", ColumnContent, ColumnContent("Classification (Avg.)", "number", True)])
 auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
 auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
 auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
@@ -66,6 +71,8 @@ auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "n
 auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
 auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
 auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("_", "str", True, dummy=True)]) # The dataframe does not display the last column - BUG in gradio?
+
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -76,6 +83,7 @@ HEADER_MAP = {
     "eval_name": "Model",
     "precision": "Precision",
     "hf_model_id": "Model URL",
+    "knowledge_avg": "Knowledge (Avg.)",
     "agree_cs": "AGREE",
     "anli_cs": "ANLI",
     "arc_challenge_cs": "ARC-Challenge",
@@ -91,6 +99,7 @@ HEADER_MAP = {
     "sqad_cs": "SQAD",
     "subjectivity_cs": "Subjectivity",
     "truthfulqa_cs": "TruthfulQA",
+    "dummy": "_",
 }
 
 
src/populate.py CHANGED
@@ -23,6 +23,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df.rename(columns=HEADER_MAP)
     df[df.select_dtypes(include=['number']).columns] *= 100 # convert to percentage
+    df["Grammar (Avg.)"] = df[["AGREE"]].mean(axis=1)
+    df["Knowledge (Avg.)"] = df[["ARC-Challenge", "ARC-Easy", "MMLU", "TruthfulQA"]].mean(axis=1)
+    df["Reasoning (Avg.)"] = df[["ANLI", "Belebele", "CTKFacts", "SQAD"]].mean(axis=1)
+    df["Math (Avg.)"] = df[["GSM8K", "Klokanek"]].mean(axis=1)
+    df["Classification (Avg.)"] = df[["Czech News", "Facebook Comments", "Mall Reviews", "Subjectivity"]].mean(axis=1)
+    df["_"] = "" # The dataframe does not display the last column - BUG in gradio?
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
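For reference (not part of the commit): the aggregate columns added above are plain row-wise means over the listed benchmark columns, and pandas' `DataFrame.mean(axis=1)` skips NaN by default, so a missing benchmark reduces the number of values averaged rather than zeroing the category score. The tiny standalone example below illustrates this with made-up numbers; only the column names are taken from the diff.

```python
# Standalone illustration of the category averaging used in src/populate.py (made-up scores).
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "ARC-Challenge": [61.0, 55.0],
    "ARC-Easy":      [78.0, np.nan],   # second model is missing this benchmark
    "MMLU":          [52.0, 49.0],
    "TruthfulQA":    [40.0, 37.0],
})
# mean(axis=1) averages across the listed columns for each row, ignoring NaN by default
df["Knowledge (Avg.)"] = df[["ARC-Challenge", "ARC-Easy", "MMLU", "TruthfulQA"]].mean(axis=1)
print(df["Knowledge (Avg.)"].round(2))  # row 0 averages four scores, row 1 only the three present
```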