Update space
Browse files- app.py +79 -32
- src/populate.py +24 -7
app.py
CHANGED
@@ -183,29 +183,6 @@ with demo:
|
|
183 |
)
|
184 |
)
|
185 |
|
186 |
-
with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
|
187 |
-
DESCRIPTION_TEXT = """
|
188 |
-
Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
|
189 |
-
We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
|
190 |
-
covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
|
191 |
-
"""
|
192 |
-
gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
|
193 |
-
|
194 |
-
with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
|
195 |
-
leaderboard = overall_leaderboard(
|
196 |
-
get_model_leaderboard_df(
|
197 |
-
model_result_path,
|
198 |
-
benchmark_cols=[
|
199 |
-
AutoEvalColumn.rank_overall.name,
|
200 |
-
AutoEvalColumn.model.name,
|
201 |
-
AutoEvalColumn.score_overall.name,
|
202 |
-
AutoEvalColumn.sd_overall.name,
|
203 |
-
AutoEvalColumn.license.name,
|
204 |
-
AutoEvalColumn.organization.name,
|
205 |
-
AutoEvalColumn.knowledge_cutoff.name,
|
206 |
-
],
|
207 |
-
rank_col=[AutoEvalColumn.rank_overall.name],
|
208 |
-
))
|
209 |
|
210 |
|
211 |
with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
|
@@ -232,11 +209,18 @@ with demo:
|
|
232 |
model_result_path,
|
233 |
benchmark_cols=[
|
234 |
AutoEvalColumn.model.name,
|
235 |
-
AutoEvalColumn.
|
236 |
-
AutoEvalColumn.
|
237 |
-
AutoEvalColumn.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
],
|
239 |
-
rank_col=[],
|
240 |
)
|
241 |
)
|
242 |
|
@@ -292,6 +276,21 @@ with demo:
|
|
292 |
)
|
293 |
)
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
|
296 |
DESCRIPTION_TEXT = """
|
297 |
Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
|
@@ -323,10 +322,16 @@ with demo:
|
|
323 |
model_result_path,
|
324 |
benchmark_cols=[
|
325 |
AutoEvalColumn.model.name,
|
326 |
-
AutoEvalColumn.
|
327 |
-
AutoEvalColumn.
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
],
|
329 |
-
rank_col=[],
|
330 |
)
|
331 |
)
|
332 |
|
@@ -364,6 +369,19 @@ with demo:
|
|
364 |
)
|
365 |
)
|
366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
368 |
CURRENT_TEXT = """
|
369 |
Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
|
@@ -385,9 +403,14 @@ with demo:
|
|
385 |
model_result_path,
|
386 |
benchmark_cols=[
|
387 |
AutoEvalColumn.model.name,
|
388 |
-
AutoEvalColumn.
|
|
|
|
|
|
|
|
|
|
|
389 |
],
|
390 |
-
rank_col=[],
|
391 |
)
|
392 |
)
|
393 |
|
@@ -468,6 +491,30 @@ with demo:
|
|
468 |
|
469 |
|
470 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
471 |
|
472 |
|
473 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
|
|
|
183 |
)
|
184 |
)
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
|
188 |
with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
|
|
|
209 |
model_result_path,
|
210 |
benchmark_cols=[
|
211 |
AutoEvalColumn.model.name,
|
212 |
+
AutoEvalColumn.license.name,
|
213 |
+
AutoEvalColumn.organization.name,
|
214 |
+
AutoEvalColumn.knowledge_cutoff.name,
|
215 |
+
|
216 |
+
AutoEvalColumn.score_math_algebra.name,
|
217 |
+
AutoEvalColumn.score_math_geometry.name,
|
218 |
+
AutoEvalColumn.score_math_probability.name,
|
219 |
+
# AutoEvalColumn.rank_math_algebra.name,
|
220 |
+
# AutoEvalColumn.rank_math_geometry.name,
|
221 |
+
# AutoEvalColumn.rank_math_probability.name,
|
222 |
],
|
223 |
+
rank_col=['sort_by_score'],
|
224 |
)
|
225 |
)
|
226 |
|
|
|
276 |
)
|
277 |
)
|
278 |
|
279 |
+
|
280 |
+
# with gr.TabItem("Sort_by_rank", elem_id="math_sort_by_rank_subtab", id=4, elem_classes="subtab"):
|
281 |
+
# leaderboard = overall_leaderboard(
|
282 |
+
# get_model_leaderboard_df(
|
283 |
+
# model_result_path,
|
284 |
+
# benchmark_cols=[
|
285 |
+
# AutoEvalColumn.model.name,
|
286 |
+
# AutoEvalColumn.rank_math_algebra.name,
|
287 |
+
# AutoEvalColumn.rank_math_geometry.name,
|
288 |
+
# AutoEvalColumn.rank_math_probability.name,
|
289 |
+
# ],
|
290 |
+
# rank_col=[],
|
291 |
+
# )
|
292 |
+
# )
|
293 |
+
|
294 |
with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
|
295 |
DESCRIPTION_TEXT = """
|
296 |
Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
|
|
|
322 |
model_result_path,
|
323 |
benchmark_cols=[
|
324 |
AutoEvalColumn.model.name,
|
325 |
+
AutoEvalColumn.license.name,
|
326 |
+
AutoEvalColumn.organization.name,
|
327 |
+
AutoEvalColumn.knowledge_cutoff.name,
|
328 |
+
|
329 |
+
AutoEvalColumn.score_reason_logical.name,
|
330 |
+
AutoEvalColumn.score_reason_social.name,
|
331 |
+
# AutoEvalColumn.rank_reason_logical.name,
|
332 |
+
# AutoEvalColumn.rank_reason_social.name,
|
333 |
],
|
334 |
+
rank_col=['sort_by_score'],
|
335 |
)
|
336 |
)
|
337 |
|
|
|
369 |
)
|
370 |
)
|
371 |
|
372 |
+
# with gr.TabItem("Sort_by_rank", elem_id="reasoning_sort_by_rank_subtab", id=3, elem_classes="subtab"):
|
373 |
+
# leaderboard = overall_leaderboard(
|
374 |
+
# get_model_leaderboard_df(
|
375 |
+
# model_result_path,
|
376 |
+
# benchmark_cols=[
|
377 |
+
# AutoEvalColumn.model.name,
|
378 |
+
# AutoEvalColumn.rank_reason_logical.name,
|
379 |
+
# AutoEvalColumn.rank_reason_social.name,
|
380 |
+
# ],
|
381 |
+
# rank_col=[],
|
382 |
+
# )
|
383 |
+
# )
|
384 |
+
|
385 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
386 |
CURRENT_TEXT = """
|
387 |
Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
|
|
|
403 |
model_result_path,
|
404 |
benchmark_cols=[
|
405 |
AutoEvalColumn.model.name,
|
406 |
+
AutoEvalColumn.license.name,
|
407 |
+
AutoEvalColumn.organization.name,
|
408 |
+
AutoEvalColumn.knowledge_cutoff.name,
|
409 |
+
|
410 |
+
AutoEvalColumn.score_chemistry.name,
|
411 |
+
# AutoEvalColumn.rank_chemistry.name,
|
412 |
],
|
413 |
+
rank_col=['sort_by_score'],
|
414 |
)
|
415 |
)
|
416 |
|
|
|
491 |
|
492 |
|
493 |
|
494 |
+
with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
|
495 |
+
DESCRIPTION_TEXT = """
|
496 |
+
Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
|
497 |
+
We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
|
498 |
+
covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
|
499 |
+
"""
|
500 |
+
gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
|
501 |
+
|
502 |
+
with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
|
503 |
+
leaderboard = overall_leaderboard(
|
504 |
+
get_model_leaderboard_df(
|
505 |
+
model_result_path,
|
506 |
+
benchmark_cols=[
|
507 |
+
AutoEvalColumn.rank_overall.name,
|
508 |
+
AutoEvalColumn.model.name,
|
509 |
+
AutoEvalColumn.score_overall.name,
|
510 |
+
AutoEvalColumn.sd_overall.name,
|
511 |
+
AutoEvalColumn.license.name,
|
512 |
+
AutoEvalColumn.organization.name,
|
513 |
+
AutoEvalColumn.knowledge_cutoff.name,
|
514 |
+
],
|
515 |
+
rank_col=[AutoEvalColumn.rank_overall.name],
|
516 |
+
))
|
517 |
+
|
518 |
|
519 |
|
520 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
|
src/populate.py
CHANGED
@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
24 |
|
25 |
# if there is one col in rank_col, this is an isolated dimension to rank by
|
26 |
# sort by that selected column and remove NaN values
|
27 |
-
if rank_col:
|
28 |
# df = df.dropna(subset=benchmark_cols)
|
29 |
df = df.dropna(subset=rank_col)
|
30 |
df = df.fillna(0.00)
|
@@ -32,8 +32,29 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
32 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
33 |
# print(rank_col, benchmark_cols)
|
34 |
# print(df.head())
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
avg_rank = df.iloc[:, 1:].mean(axis=1)
|
38 |
df["Average Rank"] = avg_rank.round(decimals=4)
|
39 |
df = df.sort_values(by=["Average Rank"], ascending=True)
|
@@ -46,10 +67,6 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
46 |
df.insert(0, 'Rank', rank)
|
47 |
|
48 |
|
49 |
-
for col in benchmark_cols:
|
50 |
-
if 'Std dev' in col or 'Score' in col:
|
51 |
-
df[col] = (df[col]).map('{:.2f}'.format)
|
52 |
-
df[col] = df[col].round(decimals=2)
|
53 |
|
54 |
|
55 |
# for col in benchmark_cols:
|
|
|
24 |
|
25 |
# if there is one col in rank_col, this is an isolated dimension to rank by
|
26 |
# sort by that selected column and remove NaN values
|
27 |
+
if rank_col and rank_col[0] != "sort_by_score":
|
28 |
# df = df.dropna(subset=benchmark_cols)
|
29 |
df = df.dropna(subset=rank_col)
|
30 |
df = df.fillna(0.00)
|
|
|
32 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
33 |
# print(rank_col, benchmark_cols)
|
34 |
# print(df.head())
|
35 |
+
|
36 |
+
for col in benchmark_cols:
|
37 |
+
if 'Std dev' in col or 'Score' in col:
|
38 |
+
df[col] = (df[col]).map('{:.2f}'.format)
|
39 |
+
df[col] = df[col].round(decimals=2)
|
40 |
+
|
41 |
+
elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
|
42 |
+
offset_idx = 4
|
43 |
+
avg_scores = df.iloc[:, offset_idx:].mean(axis=1)
|
44 |
+
df.insert(1, "Average Score", avg_scores)
|
45 |
+
|
46 |
+
df["Average Score"] = avg_scores.round(decimals=4)
|
47 |
+
df = df.sort_values(by=["Average Score"], ascending=False)
|
48 |
+
df["Average Score"] = df["Average Score"].map('{:.2f}'.format)
|
49 |
+
|
50 |
+
df = df.drop(columns=benchmark_cols[offset_idx:])
|
51 |
+
# print(benchmark_cols)
|
52 |
+
# print(df.head())
|
53 |
+
# insert a rank column
|
54 |
+
rank = np.arange(1, len(df)+1)
|
55 |
+
df.insert(0, 'Rank', rank)
|
56 |
+
|
57 |
+
else: # when rank_col is empty, sort by the average of all columns except the first one
|
58 |
avg_rank = df.iloc[:, 1:].mean(axis=1)
|
59 |
df["Average Rank"] = avg_rank.round(decimals=4)
|
60 |
df = df.sort_values(by=["Average Rank"], ascending=True)
|
|
|
67 |
df.insert(0, 'Rank', rank)
|
68 |
|
69 |
|
|
|
|
|
|
|
|
|
70 |
|
71 |
|
72 |
# for col in benchmark_cols:
|