# Scraped page header (not part of the program):
#   author: justinxzhao
#   commit: ca1e7f4 - "Initial version of AlpacaEval Visualizations"
#   file size: 7.81 kB
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
# Wide page layout: app() places each chart and its rankings table in
# side-by-side columns.
# NOTE(review): this runs at import time, ahead of every Streamlit call made
# inside app(). Streamlit's docs ask for set_page_config to be issued first;
# confirm against the installed version before moving it.
st.set_page_config(layout="wide")
def prep_rankings_table(df: pd.DataFrame, y_column: str) -> pd.DataFrame:
    """Build the rankings table for one metric.

    Args:
        df: Frame containing at least the columns ``model_name``,
            ``num_words_mean`` and *y_column*. It is not modified.
        y_column: Name of the metric column to rank by.

    Returns:
        A new frame with the columns ``rank``, ``model_name``, *y_column* and
        ``num_words_mean`` (in that order), sorted by *y_column* descending.
        ``rank`` starts at 1 and *y_column* is rounded to two decimals.

    Raises:
        KeyError: If any of the required columns is missing from *df*.
    """
    # Selecting a list of columns already yields a new frame, so the caller's
    # data is never touched and no explicit copy of the whole frame is needed.
    table = (
        df[["model_name", y_column, "num_words_mean"]]
        .sort_values(y_column, ascending=False)
        # drop=True discards the old index instead of re-inserting it as a
        # column; re-inserting fails when the index is named like one of the
        # selected columns (e.g. "model_name").
        .reset_index(drop=True)
    )
    # The fresh 0..n-1 index is the sort position, so rank is simply index + 1.
    table["rank"] = table.index + 1
    table[y_column] = table[y_column].round(2)
    return table[["rank", "model_name", y_column, "num_words_mean"]]
# Case-insensitive regex used to pick each preset group's models by name.
# Insertion order is the order of the radio options after "custom".
_PRESET_PATTERNS = {
    "gpt": "openai|gpt",
    "claude": "claude",
    "moa": "moa",
    "llama": "llama",
}

# One entry per tab: (tab label, metric column, chart title).
_CHARTS = (
    ("LC Win Rate", "length_controlled_winrate", "Length-Controlled Win Rate"),
    ("Win Rate", "win_rate", "Win Rate"),
    ("Discrete Win Rate", "discrete_win_rate", "Discrete Win Rate"),
)

# Y-axis titles per metric column; any other column falls back to
# _DEFAULT_Y_AXIS_TITLE.
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}
_DEFAULT_Y_AXIS_TITLE = "Discrete Win Rate"


def _marker_trace(
    df: pd.DataFrame,
    x_column: str,
    y_column: str,
    *,
    name: str,
    size: int,
    color: str,
    visible=True,
) -> "go.Scatter":
    """Return a marker-only scatter trace of *y_column* against *x_column*.

    Each point carries the row's ``model_name`` as its text label.
    *visible* is passed straight to plotly (``True`` or ``"legendonly"``).
    """
    return go.Scatter(
        x=df[x_column],
        y=df[y_column],
        mode="markers",
        name=name,
        text=df["model_name"],
        marker=dict(size=size, color=color),
        showlegend=True,
        visible=visible,
    )


def _add_trendline(
    fig, df: pd.DataFrame, x: str, y: str, name: str, color: str,
    visibility="legendonly",
) -> float:
    """Fit ``y ~ const + x`` by OLS, add the fitted line to *fig*, return R².

    Args:
        fig: Plotly figure the line trace is appended to.
        df: Frame holding both columns.
        x: Name of the predictor column.
        y: Name of the response column.
        name: Legend prefix; the trace is named ``"<name> trendline"``.
        color: Line colour.
        visibility: Initial plotly visibility (``True`` or ``"legendonly"``).

    Returns:
        The R² of the fitted model.
    """
    design = sm.add_constant(df[x])
    # missing="drop" fits on the rows where both columns are present, so one
    # NaN does not abort the whole page; with complete data the fit is
    # unchanged.
    model = sm.OLS(df[y], design, missing="drop").fit()
    fig.add_trace(
        go.Scatter(
            x=df[x],
            y=model.predict(design),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,
        )
    )
    return model.rsquared


def _create_scatter_plot(df: pd.DataFrame, y_column: str, selected_models, title: str):
    """Build the length-vs-metric figure for one metric column.

    Traces are added in this order: all models by words, all models by tokens,
    (if any models are selected) the selected models by words and by tokens,
    then the words and tokens trendlines. Word-based traces start hidden
    ("legendonly"); token-based traces start visible.

    Args:
        df: Frame with ``model_name``, ``num_words_mean``, ``num_tokens_mean``
            and *y_column*.
        y_column: Metric column plotted on the y axis.
        selected_models: Model names to highlight with larger markers.
        title: Figure title.

    Returns:
        ``(fig, r_squared_words, r_squared_tokens)``.
    """
    fig = go.Figure()

    # Every model, once per length measure.
    fig.add_trace(
        _marker_trace(
            df, "num_words_mean", y_column,
            name="words", size=5, color="skyblue", visible="legendonly",
        )
    )
    fig.add_trace(
        _marker_trace(
            df, "num_tokens_mean", y_column,
            name="tokens", size=5, color="orange",
        )
    )

    # Larger markers on top for the models the user picked.
    if selected_models:
        highlighted = df[df["model_name"].isin(selected_models)]
        fig.add_trace(
            _marker_trace(
                highlighted, "num_words_mean", y_column,
                name="selected words", size=10, color="blue",
                visible="legendonly",
            )
        )
        fig.add_trace(
            _marker_trace(
                highlighted, "num_tokens_mean", y_column,
                name="selected tokens", size=10, color="orangered",
            )
        )

    # Trendlines are always fitted on the full frame, not on the selection.
    r_squared_words = _add_trendline(
        fig, df, "num_words_mean", y_column, "words", "blue"
    )
    r_squared_tokens = _add_trendline(
        fig, df, "num_tokens_mean", y_column, "tokens", "orangered",
        visibility=True,
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, _DEFAULT_Y_AXIS_TITLE),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def app():
    """Render the AlpacaEval visualisation page.

    Loads ``data/model_win_rates.json`` (path relative to the working
    directory), lets the user highlight a preset group or a custom set of
    models, and shows one tab per metric in ``_CHARTS``. Each tab holds the
    scatter plot, the rankings table and the trendline R² values. The raw
    frame is shown in an expander below the tabs.
    """
    st.title("AlpacaEval Visualizations")
    st.markdown("## Win rate vs. overall mean length")

    df = pd.read_json("data/model_win_rates.json")
    # The frame's index is exposed as a column so it can be used for hover
    # labels, name matching and the rankings table.
    df["model_name"] = df.index.astype(str)

    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", *_PRESET_PATTERNS],
    )
    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        # Only the chosen preset is resolved; the others are never needed.
        matches = df["model_name"].str.contains(
            _PRESET_PATTERNS[preset_selection], case=False
        )
        selected_models = df.loc[matches, "model_name"].tolist()

    # Build every figure before rendering so a missing column fails up front
    # rather than after some tabs have already been drawn.
    plots = [
        _create_scatter_plot(df, y_column, selected_models, title)
        for _, y_column, title in _CHARTS
    ]

    tabs = st.tabs([tab_label for tab_label, _, _ in _CHARTS])
    for tab, (_, y_column, _), (fig, r2_words, r2_tokens) in zip(
        tabs, _CHARTS, plots
    ):
        with tab:
            chart_col, table_col = st.columns([3, 2])
            chart_col.plotly_chart(fig)
            table_col.markdown("#### Rankings")
            table_col.dataframe(
                prep_rankings_table(df, y_column),
                hide_index=True,
            )
            with st.expander("Trendline R²"):
                st.markdown(
                    f"- R² (Words vs {y_column}): {r2_words:.2f}\n"
                    f"- R² (Tokens vs {y_column}): {r2_tokens:.2f}"
                )

    with st.expander("Raw data"):
        st.dataframe(df)
# Script entry point: render the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    app()