Spaces:

LLM360
/

de-arena

Running

App Files Files Community

yzabc007 committed on Oct 21, 2024

Commit

bac050f

1 Parent(s): 46e675d

Update space

Browse files

Files changed (2) hide show

app.py +79 -32
src/populate.py +24 -7

app.py CHANGED Viewed

@@ -183,29 +183,6 @@ with demo:
                 )
             )
-        with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
-            DESCRIPTION_TEXT = """
-            Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
-            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
-            coving a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
-            """
-            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
-            with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
-                leaderboard = overall_leaderboard(
-                    get_model_leaderboard_df(
-                        model_result_path,
-                        benchmark_cols=[
-                            AutoEvalColumn.rank_overall.name,
-                            AutoEvalColumn.model.name,
-                            AutoEvalColumn.score_overall.name,
-                            AutoEvalColumn.sd_overall.name,
-                            AutoEvalColumn.license.name,
-                            AutoEvalColumn.organization.name,
-                            AutoEvalColumn.knowledge_cutoff.name,
-                            ],
-                        rank_col=[AutoEvalColumn.rank_overall.name],
-                    ))
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
@@ -232,11 +209,18 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_math_algebra.name,
-                            AutoEvalColumn.rank_math_geometry.name,
-                            AutoEvalColumn.rank_math_probability.name,
                             ],
-                        rank_col=[],
                     )
                 )
@@ -292,6 +276,21 @@ with demo:
                     )
                 )
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
             DESCRIPTION_TEXT = """
             Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
@@ -323,10 +322,16 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_reason_logical.name,
-                            AutoEvalColumn.rank_reason_social.name,
                             ],
-                        rank_col=[],
                     )
                 )
@@ -364,6 +369,19 @@ with demo:
                     )
                 )
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
             CURRENT_TEXT = """
             Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
@@ -385,9 +403,14 @@ with demo:
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
-                            AutoEvalColumn.rank_chemistry.name,
                             ],
-                        rank_col=[],
                     )
                 )
@@ -468,6 +491,30 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):

                 )
             )
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                            AutoEvalColumn.score_math_algebra.name,
+                            AutoEvalColumn.score_math_geometry.name,
+                            AutoEvalColumn.score_math_probability.name,
+                            # AutoEvalColumn.rank_math_algebra.name,
+                            # AutoEvalColumn.rank_math_geometry.name,
+                            # AutoEvalColumn.rank_math_probability.name,
                             ],
+                        rank_col=['sort_by_score'],
                     )
                 )
                     )
                 )
+            # with gr.TabItem("Sort_by_rank", elem_id="math_sort_by_rank_subtab", id=4, elem_classes="subtab"):
+            #     leaderboard = overall_leaderboard(
+            #         get_model_leaderboard_df(
+            #             model_result_path,
+            #             benchmark_cols=[
+            #                 AutoEvalColumn.model.name,
+            #                 AutoEvalColumn.rank_math_algebra.name,
+            #                 AutoEvalColumn.rank_math_geometry.name,
+            #                 AutoEvalColumn.rank_math_probability.name,
+            #                 ],
+            #             rank_col=[],
+            #         )
+            #     )
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
             DESCRIPTION_TEXT = """
             Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                            AutoEvalColumn.score_reason_logical.name,
+                            AutoEvalColumn.score_reason_social.name,
+                            # AutoEvalColumn.rank_reason_logical.name,
+                            # AutoEvalColumn.rank_reason_social.name,
                             ],
+                        rank_col=['sort_by_score'],
                     )
                 )
                     )
                 )
+            # with gr.TabItem("Sort_by_rank", elem_id="reasoning_sort_by_rank_subtab", id=3, elem_classes="subtab"):
+            #     leaderboard = overall_leaderboard(
+            #         get_model_leaderboard_df(
+            #             model_result_path,
+            #             benchmark_cols=[
+            #                 AutoEvalColumn.model.name,
+            #                 AutoEvalColumn.rank_reason_logical.name,
+            #                 AutoEvalColumn.rank_reason_social.name,
+            #                 ],
+            #             rank_col=[],
+            #         )
+            #     )
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
             CURRENT_TEXT = """
             Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
                         model_result_path,
                         benchmark_cols=[
                             AutoEvalColumn.model.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                            AutoEvalColumn.score_chemistry.name,
+                            # AutoEvalColumn.rank_chemistry.name,
                             ],
+                        rank_col=['sort_by_score'],
                     )
                 )
+        with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
+            DESCRIPTION_TEXT = """
+            Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
+            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
+            coving a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+            with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.rank_overall.name,
+                            AutoEvalColumn.model.name,
+                            AutoEvalColumn.score_overall.name,
+                            AutoEvalColumn.sd_overall.name,
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+                            ],
+                        rank_col=[AutoEvalColumn.rank_overall.name],
+                    ))
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):

src/populate.py CHANGED Viewed

@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     # if there is one col in rank_col, this is an isolated dimension to rank by
     # sort by that selected column and remove NaN values
-    if rank_col:
         # df = df.dropna(subset=benchmark_cols)
         df = df.dropna(subset=rank_col)
         df = df.fillna(0.00)
@@ -32,8 +32,29 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         df = df.sort_values(by=[rank_col[0]], ascending=True)
         # print(rank_col, benchmark_cols)
         # print(df.head())
-    else:
-        # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1)
         df["Average Rank"] = avg_rank.round(decimals=4)
         df = df.sort_values(by=["Average Rank"], ascending=True)
@@ -46,10 +67,6 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         df.insert(0, 'Rank', rank)
-    for col in benchmark_cols:
-        if 'Std dev' in col or 'Score' in col:
-            df[col] = (df[col]).map('{:.2f}'.format)
-            df[col] = df[col].round(decimals=2)
     # for col in benchmark_cols:

     # if there is one col in rank_col, this is an isolated dimension to rank by
     # sort by that selected column and remove NaN values
+    if rank_col and rank_col[0] != "sort_by_score":
         # df = df.dropna(subset=benchmark_cols)
         df = df.dropna(subset=rank_col)
         df = df.fillna(0.00)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
         # print(rank_col, benchmark_cols)
         # print(df.head())
+        for col in benchmark_cols:
+            if 'Std dev' in col or 'Score' in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+                df[col] = df[col].round(decimals=2)
+    elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
+        offset_idx = 4
+        avg_scores = df.iloc[:, offset_idx:].mean(axis=1)
+        df.insert(1, "Average Score", avg_scores)
+        df["Average Score"] = avg_scores.round(decimals=4)
+        df = df.sort_values(by=["Average Score"], ascending=False)
+        df["Average Score"] = df["Average Score"].map('{:.2f}'.format)
+        df = df.drop(columns=benchmark_cols[offset_idx:])
+        # print(benchmark_cols)
+        # print(df.head())
+        # insert a rank column
+        rank = np.arange(1, len(df)+1)
+        df.insert(0, 'Rank', rank)
+    else:  # when rank_col is empty, sort by the average of all benchmark columns except the first one
         avg_rank = df.iloc[:, 1:].mean(axis=1)
         df["Average Rank"] = avg_rank.round(decimals=4)
         df = df.sort_values(by=["Average Rank"], ascending=True)
         df.insert(0, 'Rank', rank)
     # for col in benchmark_cols: