yzabc007 committed
Commit bac050f · 1 Parent(s): 46e675d

Update space

Files changed (2):
  1. app.py +79 -32
  2. src/populate.py +24 -7
app.py CHANGED
@@ -183,29 +183,6 @@ with demo:
                     )
                 )
 
-        with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
-            DESCRIPTION_TEXT = """
-            Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
-            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
-            coving a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
-            """
-            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
-
-            with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
-                leaderboard = overall_leaderboard(
-                    get_model_leaderboard_df(
-                        model_result_path,
-                        benchmark_cols=[
-                            AutoEvalColumn.rank_overall.name,
-                            AutoEvalColumn.model.name,
-                            AutoEvalColumn.score_overall.name,
-                            AutoEvalColumn.sd_overall.name,
-                            AutoEvalColumn.license.name,
-                            AutoEvalColumn.organization.name,
-                            AutoEvalColumn.knowledge_cutoff.name,
-                        ],
-                        rank_col=[AutoEvalColumn.rank_overall.name],
-                    ))
 
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
@@ -232,11 +209,18 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_math_algebra.name,
-                            AutoEvalColumn.rank_math_geometry.name,
-                            AutoEvalColumn.rank_math_probability.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_math_algebra.name,
+                            AutoEvalColumn.score_math_geometry.name,
+                            AutoEvalColumn.score_math_probability.name,
+                            # AutoEvalColumn.rank_math_algebra.name,
+                            # AutoEvalColumn.rank_math_geometry.name,
+                            # AutoEvalColumn.rank_math_probability.name,
                         ],
-                        rank_col=[],
+                        rank_col=['sort_by_score'],
                     )
                 )
 
@@ -292,6 +276,21 @@ with demo:
                     )
                 )
 
+
+            # with gr.TabItem("Sort_by_rank", elem_id="math_sort_by_rank_subtab", id=4, elem_classes="subtab"):
+            #     leaderboard = overall_leaderboard(
+            #         get_model_leaderboard_df(
+            #             model_result_path,
+            #             benchmark_cols=[
+            #                 AutoEvalColumn.model.name,
+            #                 AutoEvalColumn.rank_math_algebra.name,
+            #                 AutoEvalColumn.rank_math_geometry.name,
+            #                 AutoEvalColumn.rank_math_probability.name,
+            #             ],
+            #             rank_col=[],
+            #         )
+            #     )
+
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
             DESCRIPTION_TEXT = """
             Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
@@ -323,10 +322,16 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_reason_logical.name,
-                            AutoEvalColumn.rank_reason_social.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_reason_logical.name,
+                            AutoEvalColumn.score_reason_social.name,
+                            # AutoEvalColumn.rank_reason_logical.name,
+                            # AutoEvalColumn.rank_reason_social.name,
                         ],
-                        rank_col=[],
+                        rank_col=['sort_by_score'],
                     )
                 )
 
@@ -364,6 +369,19 @@ with demo:
                     )
                 )
 
+            # with gr.TabItem("Sort_by_rank", elem_id="reasoning_sort_by_rank_subtab", id=3, elem_classes="subtab"):
+            #     leaderboard = overall_leaderboard(
+            #         get_model_leaderboard_df(
+            #             model_result_path,
+            #             benchmark_cols=[
+            #                 AutoEvalColumn.model.name,
+            #                 AutoEvalColumn.rank_reason_logical.name,
+            #                 AutoEvalColumn.rank_reason_social.name,
+            #             ],
+            #             rank_col=[],
+            #         )
+            #     )
+
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
             CURRENT_TEXT = """
             Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
@@ -385,9 +403,14 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_chemistry.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_chemistry.name,
+                            # AutoEvalColumn.rank_chemistry.name,
                         ],
-                        rank_col=[],
+                        rank_col=['sort_by_score'],
                     )
                 )
 
@@ -468,6 +491,30 @@ with demo:
 
 
 
+        with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
+            DESCRIPTION_TEXT = """
+            Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
+            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
+            coving a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+
+            with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.rank_overall.name,
+                            AutoEvalColumn.model.name,
+                            AutoEvalColumn.score_overall.name,
+                            AutoEvalColumn.sd_overall.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                        ],
+                        rank_col=[AutoEvalColumn.rank_overall.name],
+                    ))
+
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
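
Note on the app.py call sites above (my reading of the diff, not something documented in the repo): every tab that switches to rank_col=['sort_by_score'] now lists four metadata columns (model, license, organization, knowledge cutoff) ahead of its score columns, because the new branch in src/populate.py hard-codes offset_idx = 4 and averages everything from that position onward. A minimal sketch of that column-order contract, with plain strings standing in for the AutoEvalColumn names:

```python
# Illustrative sketch only; plain strings stand in for AutoEvalColumn.<field>.name.
OFFSET_IDX = 4  # mirrors offset_idx in src/populate.py

benchmark_cols = [
    "Model", "License", "Organization", "Knowledge cutoff",   # metadata, positions 0-3
    "Algebra Score", "Geometry Score", "Probability Score",   # averaged by 'sort_by_score'
]

# Everything from OFFSET_IDX onward is treated as a score to average,
# so exactly four metadata columns must come first.
meta_cols, score_cols = benchmark_cols[:OFFSET_IDX], benchmark_cols[OFFSET_IDX:]
assert len(meta_cols) == 4 and all("Score" in c for c in score_cols)
```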
src/populate.py CHANGED
@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
 
     # if there is one col in rank_col, this is an isolated dimension to rank by
     # sort by that selected column and remove NaN values
-    if rank_col:
+    if rank_col and rank_col[0] != "sort_by_score":
         # df = df.dropna(subset=benchmark_cols)
         df = df.dropna(subset=rank_col)
         df = df.fillna(0.00)
@@ -32,8 +32,29 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         df = df.sort_values(by=[rank_col[0]], ascending=True)
         # print(rank_col, benchmark_cols)
         # print(df.head())
-    else:
-        # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
+
+        for col in benchmark_cols:
+            if 'Std dev' in col or 'Score' in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+                df[col] = df[col].round(decimals=2)
+
+    elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
+        offset_idx = 4
+        avg_scores = df.iloc[:, offset_idx:].mean(axis=1)
+        df.insert(1, "Average Score", avg_scores)
+
+        df["Average Score"] = avg_scores.round(decimals=4)
+        df = df.sort_values(by=["Average Score"], ascending=False)
+        df["Average Score"] = df["Average Score"].map('{:.2f}'.format)
+
+        df = df.drop(columns=benchmark_cols[offset_idx:])
+        # print(benchmark_cols)
+        # print(df.head())
+        # insert a rank column
+        rank = np.arange(1, len(df)+1)
+        df.insert(0, 'Rank', rank)
+
+    else: # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1)
         df["Average Rank"] = avg_rank.round(decimals=4)
         df = df.sort_values(by=["Average Rank"], ascending=True)
@@ -46,10 +67,6 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     df.insert(0, 'Rank', rank)
 
 
-    for col in benchmark_cols:
-        if 'Std dev' in col or 'Score' in col:
-            df[col] = (df[col]).map('{:.2f}'.format)
-            df[col] = df[col].round(decimals=2)
 
 
     # for col in benchmark_cols:
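
For context, a small self-contained sketch of what the new sort_by_score branch computes, using a toy DataFrame in place of the Space's result files (the helper name sort_by_average_score and the toy column values are illustrative, not part of this commit). It assumes, as the diff's offset_idx = 4 does, that the first four columns are metadata and everything after is a per-task score:

```python
# Minimal sketch of the new rank_col=['sort_by_score'] behaviour (illustrative, not the Space's code).
import numpy as np
import pandas as pd

def sort_by_average_score(df: pd.DataFrame, offset_idx: int = 4) -> pd.DataFrame:
    # Average every column from offset_idx onward, sort descending by that average,
    # then keep only the metadata columns plus 'Average Score' and a 1-based 'Rank'.
    score_cols = df.columns[offset_idx:]
    avg_scores = df[score_cols].mean(axis=1)

    out = df.copy()
    out.insert(1, "Average Score", avg_scores.round(4))
    out = out.sort_values(by="Average Score", ascending=False)
    out["Average Score"] = out["Average Score"].map("{:.2f}".format)

    out = out.drop(columns=score_cols)              # per-task scores are folded into the average
    out.insert(0, "Rank", np.arange(1, len(out) + 1))
    return out

toy = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "License": ["MIT", "Apache-2.0"],
    "Organization": ["Org A", "Org B"],
    "Knowledge cutoff": ["2023-10", "2024-01"],
    "Algebra": [7.1, 8.3],
    "Geometry": [6.4, 7.9],
    "Probability": [6.9, 8.0],
})
print(sort_by_average_score(toy))   # model-b ranks first on the higher average score
```

One design detail visible in the diff: the score-based branch sorts descending (highest average score first), while the rank-based branches keep sorting ascending, since lower rank numbers are better.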