Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
167137b
1
Parent(s):
311dc3a
Add filters
Browse files- app.py +157 -101
- release_date_mapping.json +1 -1
- requirements.txt +2 -1
- utils.py +60 -0
app.py
CHANGED
@@ -2,113 +2,169 @@ import pickle
|
|
2 |
|
3 |
import pandas as pd
|
4 |
import gradio as gr
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
)
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
if k not in elo_results:
|
57 |
-
continue
|
58 |
-
arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
|
59 |
-
|
60 |
-
# gather open llm leaderboard data
|
61 |
-
LEADERBOARD_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.csv"
|
62 |
-
leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)
|
63 |
-
latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[
|
64 |
-
0
|
65 |
-
]
|
66 |
-
|
67 |
-
latest_leaderboard_file_local = hf_hub_download(
|
68 |
-
repo_id="lmsys/chatbot-arena-leaderboard",
|
69 |
-
filename=latest_leaderboard_file.split("/")[-1],
|
70 |
-
repo_type="space",
|
71 |
)
|
72 |
-
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
|
73 |
-
|
74 |
-
###################
|
75 |
-
### Prepare Data
|
76 |
-
###################
|
77 |
-
|
78 |
-
# merge leaderboard data with ELO data
|
79 |
-
merged_dfs = {}
|
80 |
-
for k, v in arena_dfs.items():
|
81 |
-
merged_dfs[k] = (
|
82 |
-
pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
|
83 |
-
.sort_values("rating", ascending=False)
|
84 |
-
.reset_index(drop=True)
|
85 |
-
)
|
86 |
|
87 |
-
# add release dates into the merged data
|
88 |
-
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
|
89 |
-
for k, v in merged_dfs.items():
|
90 |
-
merged_dfs[k] = pd.merge(
|
91 |
-
merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
|
92 |
-
)
|
93 |
df = merged_dfs["Overall"]
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
with gr.Row():
|
101 |
-
gr.
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
111 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
-
|
114 |
-
|
|
|
2 |
|
3 |
import pandas as pd
|
4 |
import gradio as gr
|
5 |
+
import plotly.express as px
|
6 |
+
|
7 |
+
from utils import (
|
8 |
+
KEY_TO_CATEGORY_NAME,
|
9 |
+
PROPRIETARY_LICENSES,
|
10 |
+
download_latest_data_from_space,
|
11 |
+
)
|
12 |
+
|
13 |
+
# with gr.NO_RELOAD:
|
14 |
+
###################
|
15 |
+
### Load Data
|
16 |
+
###################
|
17 |
+
|
18 |
+
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)

# SECURITY NOTE: unpickling can execute arbitrary code. This file is downloaded
# from a remote Space, so it is only as trustworthy as whoever can write there.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# One leaderboard table per known category, keyed by the category's display
# name. Categories that are missing from the pickle are skipped.
arena_dfs = {
    category_name: elo_results[key]["leaderboard_table_df"]
    for key, category_name in KEY_TO_CATEGORY_NAME.items()
    if key in elo_results
}

# gather open llm leaderboard data
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

###################
### Prepare Data
###################

# merge leaderboard data with ELO data: the ELO tables are joined on their
# index against the leaderboard's "key" column, then ordered best-first.
merged_dfs = {
    category_name: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category_name, arena_df in arena_dfs.items()
}
|
50 |
|
51 |
+
# add release dates into the merged data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
release_dates = release_date_mapping[["key", "Release Date"]]
# Iterate over a snapshot of the keys because the values are replaced in place.
for category_name in list(merged_dfs):
    merged_dfs[category_name] = merged_dfs[category_name].merge(
        release_dates, on="key"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
df = merged_dfs["Overall"]
# Collapse the license column into two buckets: anything listed in
# PROPRIETARY_LICENSES is proprietary, everything else counts as open.
df["License"] = (
    df["License"]
    .isin(PROPRIETARY_LICENSES)
    .map({True: "Proprietary LLM", False: "Open LLM"})
)
df["Release Date"] = pd.to_datetime(df["Release Date"])
# Monthly period used to group models released in the same calendar month.
df["Month-Year"] = df["Release Date"].dt.to_period("M")
df["rating"] = df["rating"].round()


###################
### Plot Data
###################

# Keep only the date part (text before the first space) of the timestamp.
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
# Ratings were rounded just above, so the extremes need no further rounding.
min_elo_score = df["rating"].min()
max_elo_score = df["rating"].max()
# Size of the largest (month, license) group; upper bound for the slider.
upper_models_per_month = int(
    df.groupby(["Month-Year", "License"])["rating"].count().max()
)
|
77 |
+
|
78 |
+
|
79 |
+
def build_plot(min_score, max_models_per_month, toggle_annotations):
    """Build the release-date vs. Arena ELO scatter plot for the given filters.

    Args:
        min_score: Lowest rating a model needs in order to be plotted.
        max_models_per_month: How many of the top-rated models to keep in
            each (month, license) group.
        toggle_annotations: When truthy, label the best-rated model of each
            (month, license) group with its name.

    Returns:
        The Plotly figure, colored by license and with OLS trendlines.
    """
    # Slider values may arrive as floats; nlargest expects an integer count.
    max_models_per_month = int(max_models_per_month)

    filtered_df = df[df["rating"] >= min_score]
    filtered_df = (
        filtered_df.groupby(["Month-Year", "License"])
        .apply(lambda group: group.nlargest(max_models_per_month, "rating"))
        .reset_index(drop=True)
    )

    fig = px.scatter(
        filtered_df,
        x="Release Date",
        y="rating",
        color="License",
        hover_name="Model",
        hover_data=["Organization", "License"],
        trendline="ols",
        title=f"Proprietary vs Open LLMs (LMSYS Arena ELO as of {date_updated})",
        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
        height=700,
        template="seaborn",
    )

    fig.update_traces(marker=dict(size=10, opacity=0.6))

    if toggle_annotations:
        # get the points to annotate (only the highest rated model per month per license)
        idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
            "rating"
        ].idxmax()
        points_to_annotate_df = filtered_df.loc[idx_to_annotate]

        for _, row in points_to_annotate_df.iterrows():
            fig.add_annotation(
                x=row["Release Date"],
                y=row["rating"],
                text=row["Model"],
                showarrow=True,
                arrowhead=0,
            )

    return fig
|
121 |
+
|
122 |
+
|
123 |
+
demo = gr.Blocks()

with demo:
    gr.Markdown("# Proprietary vs Open LLMs (LMSYS Arena ELO)")
    with gr.Row():
        min_score = gr.Slider(
            label="Minimum ELO Score",
            minimum=min_elo_score,
            maximum=max_elo_score,
            value=800,
            step=50,
        )
        max_models_per_month = gr.Slider(
            label="Max Models per Month (per License)",
            minimum=1,
            maximum=upper_models_per_month,
            value=upper_models_per_month,
            step=1,
        )
        toggle_annotations = gr.Radio(
            label="Overlay Best Model Name", choices=[True, False], value=False
        )

    # Show plot
    plot = gr.Plot()

    # The initial page load and a change to any control all redraw the plot
    # from the current state of the three controls.
    plot_inputs = [min_score, max_models_per_month, toggle_annotations]
    for register in (
        demo.load,
        min_score.change,
        max_models_per_month.change,
        toggle_annotations.change,
    ):
        register(fn=build_plot, inputs=plot_inputs, outputs=plot)

demo.launch()
|
170 |
+
# if __name__ == "__main__":
|
release_date_mapping.json
CHANGED
@@ -7,7 +7,7 @@
|
|
7 |
{
|
8 |
"key": "gpt-4-1106-preview",
|
9 |
"Model": "GPT-4-1106-preview",
|
10 |
-
"Release Date": "
|
11 |
},
|
12 |
{
|
13 |
"key": "claude-3-opus-20240229",
|
|
|
7 |
{
|
8 |
"key": "gpt-4-1106-preview",
|
9 |
"Model": "GPT-4-1106-preview",
|
10 |
+
"Release Date": "2023-11-06"
|
11 |
},
|
12 |
{
|
13 |
"key": "claude-3-opus-20240229",
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
huggingface_hub
|
2 |
pandas
|
3 |
plotly
|
4 |
-
gradio
|
|
|
|
1 |
huggingface_hub
|
2 |
pandas
|
3 |
plotly
|
4 |
+
gradio
|
5 |
+
statsmodels
|
utils.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal
|
2 |
+
|
3 |
+
from huggingface_hub import HfFileSystem, hf_hub_download
|
4 |
+
|
5 |
+
# Maps category keys, as used in the ELO results, to display names.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}

# Maps each category display name to a longer description of that category.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# License values that are treated as proprietary.
PROPRIETARY_LICENSES = [
    "Proprietary",
]
|
31 |
+
|
32 |
+
|
33 |
+
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given repository space.

    "Latest" is decided by the last underscore-separated token of each file
    name (taken before the first dot), compared as a plain string.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If the space contains no file of the requested type.
    """

    def extract_date(filename):
        # "dir/name_20240101.pkl" -> "20240101"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    if not files:
        # Fail with a descriptive error instead of an IndexError on an empty list.
        raise FileNotFoundError(
            f"No '.{file_type}' files found in space '{repo_id}'"
        )

    # max() returns the first entry with the greatest key, which is the same
    # element a stable descending sort would place first.
    latest_file = max(files, key=extract_date)

    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
|