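# Spaces: Runtime error
# (The line above appears to be hosting-page status text captured together
# with the source; it is kept as a comment because it is not Python code.)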
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import streamlit as st
# Use Streamlit's "wide" layout so the chart and the rankings table, which are
# rendered side by side, get the full width of the browser window.
st.set_page_config(layout="wide")
def prep_rankings_table(df, y_column):
    """Build a ranked leaderboard table for one win-rate metric.

    Args:
        df: Source dataframe. Must contain the columns ``model_name``,
            ``num_words_mean`` and ``y_column``. It is not modified.
        y_column: Name of the metric column to rank by (highest first).

    Returns:
        A new dataframe with the columns ``rank``, ``model_name``,
        ``y_column`` and ``num_words_mean`` (in that order), sorted by
        ``y_column`` in descending order. ``rank`` starts at 1 and
        ``y_column`` is rounded to two decimals.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    table = (
        # Selecting with a list of labels returns a new frame, so the
        # caller's dataframe is never mutated below.
        df[["model_name", y_column, "num_words_mean"]]
        # A stable sort keeps tied models in their input order, which makes
        # the ranking deterministic between reruns.
        .sort_values(y_column, ascending=False, kind="stable")
        # drop=True: the old index is not needed, and re-inserting it as a
        # column raises when the index is named like an existing column
        # (for example an index called "model_name").
        .reset_index(drop=True)
    )

    table[y_column] = table[y_column].round(2)

    # After the reset the index is 0..n-1 in sorted order, so index + 1 is
    # the 1-based rank. Inserting at position 0 yields the final column order.
    table.insert(0, "rank", table.index + 1)
    return table
# Case-insensitive regex used to pick the models of each preset radio option.
_PRESET_PATTERNS = {
    "gpt": "openai|gpt",
    "claude": "claude",
    "moa": "moa",
    "llama": "llama",
}

# One entry per tab, in display order: (metric column, chart title, tab label).
_WIN_RATE_VIEWS = (
    ("length_controlled_winrate", "Length-Controlled Win Rate", "LC Win Rate"),
    ("win_rate", "Win Rate", "Win Rate"),
    ("discrete_win_rate", "Discrete Win Rate", "Discrete Win Rate"),
)

# Y-axis title per metric column; any other column falls back to
# _DEFAULT_Y_AXIS_TITLE.
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}
_DEFAULT_Y_AXIS_TITLE = "Discrete Win Rate"


def _marker_trace(data, x_column, y_column, name, size, color, visible=True):
    """Return a marker-only scatter trace of ``y_column`` against ``x_column``."""
    return go.Scatter(
        x=data[x_column],
        y=data[y_column],
        mode="markers",
        name=name,
        text=data["model_name"],  # hover label
        marker=dict(size=size, color=color),
        showlegend=True,
        visible=visible,
    )


def _add_trendline(fig, df, x, y, name, color, visibility="legendonly"):
    """Fit ``y ~ const + x`` by OLS, add the fitted line to ``fig``, return R².

    Args:
        fig: Plotly figure that receives the line trace (mutated in place).
        df: Dataframe holding the ``x`` and ``y`` columns.
        x: Name of the explanatory column.
        y: Name of the response column.
        name: Legend prefix; the trace is called ``"<name> trendline"``.
        color: Line color.
        visibility: Plotly ``visible`` value (``True`` or ``"legendonly"``).

    Returns:
        The R² of the fitted model.
    """
    # Fit only on rows where both values are present: OLS is not asked to
    # handle missing values here, so a NaN metric would spoil the fit.
    # Sorting by x draws the line left to right.
    points = df[[x, y]].dropna().sort_values(x)
    design = sm.add_constant(points[x])
    fit = sm.OLS(points[y], design).fit()
    fig.add_trace(
        go.Scatter(
            x=points[x],
            y=fit.predict(design),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,
        )
    )
    return fit.rsquared


def _create_scatter_plot(df, y_column, selected_models, title):
    """Build the win-rate vs. mean-length figure for one metric.

    The figure plots ``y_column`` against both ``num_words_mean`` and
    ``num_tokens_mean``. The word-based traces start hidden (legend only);
    the token-based traces start visible.

    Args:
        df: Dataframe with ``model_name``, ``num_words_mean``,
            ``num_tokens_mean`` and ``y_column`` columns.
        y_column: Metric column plotted on the y axis.
        selected_models: Model names to highlight with larger markers; an
            empty selection adds no highlight traces.
        title: Figure title.

    Returns:
        A tuple ``(figure, r_squared_words, r_squared_tokens)``.
    """
    fig = go.Figure()

    # Every model, once per length measure.
    fig.add_trace(
        _marker_trace(
            df, "num_words_mean", y_column, "words", 5, "skyblue", "legendonly"
        )
    )
    fig.add_trace(
        _marker_trace(df, "num_tokens_mean", y_column, "tokens", 5, "orange")
    )

    # Highlighted models, drawn on top with larger markers.
    if len(selected_models) > 0:
        selected = df[df["model_name"].isin(selected_models)]
        fig.add_trace(
            _marker_trace(
                selected,
                "num_words_mean",
                y_column,
                "selected words",
                10,
                "blue",
                "legendonly",
            )
        )
        fig.add_trace(
            _marker_trace(
                selected,
                "num_tokens_mean",
                y_column,
                "selected tokens",
                10,
                "orangered",
            )
        )

    r_squared_words = _add_trendline(
        fig, df, "num_words_mean", y_column, "words", "blue"
    )
    r_squared_tokens = _add_trendline(
        fig,
        df,
        "num_tokens_mean",
        y_column,
        "tokens",
        "orangered",
        visibility=True,
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, _DEFAULT_Y_AXIS_TITLE),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def _render_view(df, y_column, title, selected_models):
    """Render chart, rankings table and R² summary for one metric."""
    fig, r_squared_words, r_squared_tokens = _create_scatter_plot(
        df, y_column, selected_models, title
    )

    chart_col, table_col = st.columns([3, 2])
    chart_col.plotly_chart(fig)
    table_col.markdown("#### Rankings")
    table_col.dataframe(prep_rankings_table(df, y_column), hide_index=True)

    with st.expander("Trendline R²"):
        st.markdown(
            f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n"
            f"- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
        )


def app():
    """Render the AlpacaEval win-rate vs. response-length dashboard.

    Reads ``data/model_win_rates.json``, lets the user highlight a preset or
    hand-picked group of models, and shows one tab per win-rate metric with
    a scatter plot, a rankings table and the trendline R² values, followed
    by the raw data.
    """
    st.title("AlpacaEval Visualizations")
    st.markdown("## Win rate vs. overall mean length")

    df = pd.read_json("data/model_win_rates.json")
    # The dataframe index is used as the model identifier; expose it as a
    # column for hover labels, filtering and the rankings table.
    df["model_name"] = df.index.astype(str)

    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", *_PRESET_PATTERNS],
    )
    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        matches = df["model_name"].str.contains(
            _PRESET_PATTERNS[preset_selection], case=False
        )
        selected_models = df.loc[matches, "model_name"].tolist()

    tabs = st.tabs([label for _, _, label in _WIN_RATE_VIEWS])
    for tab, (y_column, title, _label) in zip(tabs, _WIN_RATE_VIEWS):
        with tab:
            _render_view(df, y_column, title, selected_models)

    # NOTE(review): the original indentation was lost, so it is not certain
    # whether this expander sat below the tabs or inside the last tab;
    # it is placed below the tabs here. Confirm against the intended layout.
    with st.expander("Raw data"):
        st.dataframe(df)
# Build the page when this file is executed as a script.
if __name__ == "__main__":
    app()