Spaces:
Runtime error
Runtime error
justinxzhao
committed on
Commit
·
707a231
1
Parent(s):
ed3afcd
Track large files with Git LFS, and expand app to include a data explorer and more length-based visualizations.
Browse files- .gitattributes +2 -0
- .gitignore +2 -1
- app.py +577 -167
- data/df_response_judging.jsonl +3 -0
- data/df_responses.jsonl +3 -0
- data/model_win_rates.json +0 -1
- data/model_win_rates.jsonl +53 -0
- prep_data.py +8 -2
- prep_data_annotations.py +62 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/df_response_judging.jsonl filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/df_responses.jsonl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
env/
|
2 |
-
submodules/
|
|
|
|
1 |
env/
|
2 |
+
submodules/
|
3 |
+
.DS_Store
|
app.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
|
|
3 |
import plotly.express as px
|
4 |
import plotly.graph_objects as go
|
5 |
import statsmodels.api as sm
|
|
|
6 |
|
7 |
# Set the layout to wide
|
8 |
st.set_page_config(layout="wide")
|
@@ -36,206 +37,615 @@ def prep_rankings_table(df, y_column):
|
|
36 |
return df_copy
|
37 |
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
def app():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
st.title("AlpacaEval Visualizations")
|
41 |
|
42 |
-
st.
|
43 |
|
44 |
# Load the data
|
45 |
-
df = pd.read_json("data/model_win_rates.
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
# Define the preset groups
|
51 |
-
presets = {
|
52 |
-
"gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
|
53 |
-
"model_name"
|
54 |
-
].tolist(),
|
55 |
-
"claude": df[df["model_name"].str.contains("claude", case=False)][
|
56 |
-
"model_name"
|
57 |
-
].tolist(),
|
58 |
-
"moa": df[df["model_name"].str.contains("moa", case=False)][
|
59 |
-
"model_name"
|
60 |
-
].tolist(),
|
61 |
-
"llama": df[df["model_name"].str.contains("llama", case=False)][
|
62 |
-
"model_name"
|
63 |
-
].tolist(),
|
64 |
-
"custom": [],
|
65 |
-
}
|
66 |
-
|
67 |
-
# Add radio button for preset groups
|
68 |
-
preset_selection = st.radio(
|
69 |
-
"Select a preset group of models or choose 'custom' to select manually",
|
70 |
-
options=["custom", "gpt", "claude", "moa", "llama"],
|
71 |
)
|
72 |
|
73 |
-
#
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
)
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
112 |
fig.add_trace(
|
113 |
go.Scatter(
|
114 |
-
x=
|
115 |
-
y=
|
116 |
mode="markers",
|
117 |
-
name="
|
118 |
-
text=
|
119 |
-
marker=dict(size=
|
120 |
showlegend=True,
|
121 |
)
|
122 |
)
|
123 |
fig.add_trace(
|
124 |
go.Scatter(
|
125 |
-
x=
|
126 |
-
y=
|
127 |
mode="markers",
|
128 |
-
name="
|
129 |
-
text=
|
130 |
-
marker=dict(size=
|
131 |
showlegend=True,
|
132 |
-
visible="legendonly", # Make '
|
133 |
)
|
134 |
)
|
135 |
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
)
|
151 |
-
return model.rsquared
|
152 |
|
153 |
-
|
154 |
-
fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
)
|
156 |
-
|
157 |
-
|
158 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
|
|
170 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
),
|
172 |
-
title=title,
|
173 |
-
legend_title="Legend",
|
174 |
)
|
175 |
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
-
|
179 |
-
y_column2 = "win_rate"
|
180 |
-
y_column3 = "discrete_win_rate"
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
col1.plotly_chart(fig1)
|
198 |
-
col2.markdown("#### Rankings")
|
199 |
-
prepped_df = prep_rankings_table(df, "length_controlled_winrate")
|
200 |
-
col2.dataframe(
|
201 |
-
prepped_df,
|
202 |
-
hide_index=True,
|
203 |
-
)
|
204 |
-
with st.expander("Trendline R²"):
|
205 |
-
st.markdown(
|
206 |
-
f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}"
|
207 |
-
)
|
208 |
-
|
209 |
-
with tab2:
|
210 |
-
col1, col2 = st.columns([3, 2])
|
211 |
-
col1.plotly_chart(fig2)
|
212 |
-
col2.markdown("#### Rankings")
|
213 |
-
prepped_df = prep_rankings_table(df, "win_rate")
|
214 |
-
col2.dataframe(
|
215 |
-
prepped_df,
|
216 |
-
hide_index=True,
|
217 |
-
)
|
218 |
-
with st.expander("Trendline R²"):
|
219 |
-
st.markdown(
|
220 |
-
f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}"
|
221 |
-
)
|
222 |
-
|
223 |
-
with tab3:
|
224 |
-
col1, col2 = st.columns([3, 2])
|
225 |
-
col1.plotly_chart(fig3)
|
226 |
-
col2.markdown("#### Rankings")
|
227 |
-
prepped_df = prep_rankings_table(df, "discrete_win_rate")
|
228 |
-
col2.dataframe(
|
229 |
-
prepped_df,
|
230 |
-
hide_index=True,
|
231 |
-
)
|
232 |
-
with st.expander("Trendline R²"):
|
233 |
-
st.markdown(
|
234 |
-
f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
|
235 |
-
)
|
236 |
-
|
237 |
-
with st.expander("Raw data"):
|
238 |
-
st.dataframe(df)
|
239 |
|
240 |
|
241 |
if __name__ == "__main__":
|
|
|
3 |
import plotly.express as px
|
4 |
import plotly.graph_objects as go
|
5 |
import statsmodels.api as sm
|
6 |
+
import random
|
7 |
|
8 |
# Set the layout to wide
|
9 |
st.set_page_config(layout="wide")
|
|
|
37 |
return df_copy
|
38 |
|
39 |
|
40 |
+
def get_preference(preference_score):
|
41 |
+
rounded_preference_score = int(preference_score.round(0).iloc[0])
|
42 |
+
return get_preference_from_rounded_score(rounded_preference_score)
|
43 |
+
# if rounded_preference_score == 2:
|
44 |
+
# return "[2>1]"
|
45 |
+
# elif rounded_preference_score == 1:
|
46 |
+
# return "[1>2]"
|
47 |
+
|
48 |
+
|
49 |
+
def get_preference_from_rounded_score(score):
|
50 |
+
if score == 2:
|
51 |
+
return "[2>1]"
|
52 |
+
elif score == 1:
|
53 |
+
return "[1>2]"
|
54 |
+
return "[1=2]"
|
55 |
+
# raise ValueError(f"Invalid score: {score}")
|
56 |
+
|
57 |
+
|
58 |
def app():
|
59 |
+
fixed_model = "gpt4_1106_preview"
|
60 |
+
|
61 |
+
# Ensure to initialize session state variables if they do not exist
|
62 |
+
if "selected_instruction" not in st.session_state:
|
63 |
+
st.session_state.selected_instruction = None
|
64 |
+
|
65 |
+
if "selected_model" not in st.session_state:
|
66 |
+
st.session_state.selected_model = "gpt4"
|
67 |
+
|
68 |
+
if "selected_judge" not in st.session_state:
|
69 |
+
st.session_state.selected_judge = None
|
70 |
+
|
71 |
+
if "selected_dataset" not in st.session_state:
|
72 |
+
st.session_state.selected_dataset = "NEW"
|
73 |
+
|
74 |
+
if "instruction_options" not in st.session_state:
|
75 |
+
st.session_state.instruction_options = []
|
76 |
+
|
77 |
+
# Function to update the instruction options based on selected dataset
|
78 |
+
def update_instruction_options():
|
79 |
+
selected_dataset = st.session_state.dataset_selector
|
80 |
+
if selected_dataset == "all" or selected_dataset == "NEW":
|
81 |
+
instruction_options = df_response_judging["instruction"].unique().tolist()
|
82 |
+
elif (
|
83 |
+
selected_dataset == "None"
|
84 |
+
or selected_dataset is None
|
85 |
+
or str(selected_dataset) == ""
|
86 |
+
):
|
87 |
+
instruction_options = (
|
88 |
+
df_response_judging[pd.isna(df_response_judging["dataset"])][
|
89 |
+
"instruction"
|
90 |
+
]
|
91 |
+
.unique()
|
92 |
+
.tolist()
|
93 |
+
)
|
94 |
+
else:
|
95 |
+
instruction_options = (
|
96 |
+
df_response_judging[df_response_judging["dataset"] == selected_dataset][
|
97 |
+
"instruction"
|
98 |
+
]
|
99 |
+
.unique()
|
100 |
+
.tolist()
|
101 |
+
)
|
102 |
+
|
103 |
+
st.session_state.instruction_options = instruction_options
|
104 |
+
|
105 |
+
def update_instruction():
|
106 |
+
st.session_state.selected_instruction = st.session_state.instruction_selector
|
107 |
+
|
108 |
+
def update_model():
|
109 |
+
st.session_state.selected_model = st.session_state.model_selector
|
110 |
+
|
111 |
+
def update_judge():
|
112 |
+
st.session_state.selected_judge = st.session_state.judge_selector
|
113 |
+
|
114 |
+
def randomize_selection():
|
115 |
+
st.session_state.dataset_selector = random.choice(
|
116 |
+
["all"] + df_response_judging["dataset"].dropna().unique().tolist()
|
117 |
+
)
|
118 |
+
st.session_state.selected_model = random.choice(model_options)
|
119 |
+
update_instruction_options()
|
120 |
+
st.session_state.selected_instruction = random.choice(
|
121 |
+
st.session_state.instruction_options
|
122 |
+
)
|
123 |
+
|
124 |
st.title("AlpacaEval Visualizations")
|
125 |
|
126 |
+
outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])
|
127 |
|
128 |
# Load the data
|
129 |
+
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
|
130 |
+
# df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
|
131 |
+
df_response_judging = pd.read_json(
|
132 |
+
"data/df_response_judging.jsonl", lines=True, orient="records"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
)
|
134 |
|
135 |
+
# Prepare the model selector options
|
136 |
+
model_options = df_response_judging["generator_2"].unique().tolist()
|
137 |
+
|
138 |
+
with outer_tabs[0]:
|
139 |
+
# Define the preset groups
|
140 |
+
presets = {
|
141 |
+
"gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
|
142 |
+
"model_name"
|
143 |
+
].tolist(),
|
144 |
+
"claude": df[df["model_name"].str.contains("claude", case=False)][
|
145 |
+
"model_name"
|
146 |
+
].tolist(),
|
147 |
+
"moa": df[df["model_name"].str.contains("moa", case=False)][
|
148 |
+
"model_name"
|
149 |
+
].tolist(),
|
150 |
+
"llama": df[df["model_name"].str.contains("llama", case=False)][
|
151 |
+
"model_name"
|
152 |
+
].tolist(),
|
153 |
+
"custom": [],
|
154 |
+
}
|
155 |
+
|
156 |
+
# Add radio button for preset groups
|
157 |
+
preset_selection = st.radio(
|
158 |
+
"Select a preset group of models or choose 'custom' to select manually.",
|
159 |
+
options=["custom", "gpt", "claude", "moa", "llama"],
|
160 |
+
)
|
161 |
+
|
162 |
+
st.divider()
|
163 |
+
|
164 |
+
# Add multiselect for custom model selection
|
165 |
+
if preset_selection == "custom":
|
166 |
+
selected_models = st.multiselect(
|
167 |
+
"Select models to highlight", options=df["model_name"].unique()
|
168 |
+
)
|
169 |
+
else:
|
170 |
+
selected_models = presets[preset_selection]
|
171 |
+
|
172 |
+
def create_scatter_plot(df, y_column, selected_models, title):
|
173 |
+
fig = go.Figure()
|
174 |
+
|
175 |
+
# Add scatter plots for num_words_mean and num_tokens_mean
|
176 |
fig.add_trace(
|
177 |
go.Scatter(
|
178 |
+
x=df["num_words_mean"],
|
179 |
+
y=df[y_column],
|
180 |
mode="markers",
|
181 |
+
name="words",
|
182 |
+
text=df["model_name"],
|
183 |
+
marker=dict(size=5, color="skyblue"),
|
184 |
showlegend=True,
|
185 |
)
|
186 |
)
|
187 |
fig.add_trace(
|
188 |
go.Scatter(
|
189 |
+
x=df["num_tokens_mean"],
|
190 |
+
y=df[y_column],
|
191 |
mode="markers",
|
192 |
+
name="tokens",
|
193 |
+
text=df["model_name"],
|
194 |
+
marker=dict(size=5, color="orange"),
|
195 |
showlegend=True,
|
196 |
+
visible="legendonly", # Make 'tokens' trace initially visible only in legend
|
197 |
)
|
198 |
)
|
199 |
|
200 |
+
# Highlight selected models
|
201 |
+
if selected_models:
|
202 |
+
selected_data = df[df["model_name"].isin(selected_models)]
|
203 |
+
fig.add_trace(
|
204 |
+
go.Scatter(
|
205 |
+
x=selected_data["num_words_mean"],
|
206 |
+
y=selected_data[y_column],
|
207 |
+
mode="markers",
|
208 |
+
name="selected words",
|
209 |
+
text=selected_data["model_name"],
|
210 |
+
marker=dict(size=10, color="blue"),
|
211 |
+
showlegend=True,
|
212 |
+
)
|
213 |
)
|
214 |
+
fig.add_trace(
|
215 |
+
go.Scatter(
|
216 |
+
x=selected_data["num_tokens_mean"],
|
217 |
+
y=selected_data[y_column],
|
218 |
+
mode="markers",
|
219 |
+
name="selected tokens",
|
220 |
+
text=selected_data["model_name"],
|
221 |
+
marker=dict(size=10, color="orangered"),
|
222 |
+
showlegend=True,
|
223 |
+
visible="legendonly", # Make 'selected tokens' trace initially visible only in legend
|
224 |
+
)
|
225 |
+
)
|
226 |
+
|
227 |
+
# Add trendlines
|
228 |
+
def add_trendline(fig, x, y, name, color, visibility="legendonly"):
|
229 |
+
X = sm.add_constant(df[x])
|
230 |
+
model = sm.OLS(df[y], X).fit()
|
231 |
+
trendline = model.predict(X)
|
232 |
+
fig.add_trace(
|
233 |
+
go.Scatter(
|
234 |
+
x=df[x],
|
235 |
+
y=trendline,
|
236 |
+
mode="lines",
|
237 |
+
name=f"{name} trendline",
|
238 |
+
line=dict(color=color, width=2),
|
239 |
+
visible=visibility, # Control the initial visibility
|
240 |
+
)
|
241 |
+
)
|
242 |
+
return model.rsquared
|
243 |
+
|
244 |
+
r_squared_words = add_trendline(
|
245 |
+
fig, "num_words_mean", y_column, "words", "blue", visibility=True
|
246 |
+
)
|
247 |
+
r_squared_tokens = add_trendline(
|
248 |
+
fig, "num_tokens_mean", y_column, "tokens", "orangered"
|
249 |
)
|
|
|
250 |
|
251 |
+
# Update layout with titles and labels
|
252 |
+
fig.update_layout(
|
253 |
+
xaxis_title="Mean length",
|
254 |
+
yaxis_title=(
|
255 |
+
"Win rate"
|
256 |
+
if y_column == "win_rate"
|
257 |
+
else (
|
258 |
+
"LC Win Rate"
|
259 |
+
if y_column == "length_controlled_winrate"
|
260 |
+
else "Discrete Win Rate"
|
261 |
+
)
|
262 |
+
),
|
263 |
+
title=title,
|
264 |
+
legend_title="Legend",
|
265 |
+
)
|
266 |
+
|
267 |
+
return fig, r_squared_words, r_squared_tokens
|
268 |
+
|
269 |
+
st.markdown("## Overall win rate")
|
270 |
+
y_column1 = "length_controlled_winrate"
|
271 |
+
y_column2 = "win_rate"
|
272 |
+
y_column3 = "discrete_win_rate"
|
273 |
+
|
274 |
+
fig1, r_squared_words_1, r_squared_tokens_1 = create_scatter_plot(
|
275 |
+
df, y_column1, selected_models, "Length-Controlled Win Rate"
|
276 |
)
|
277 |
+
fig2, r_squared_words_2, r_squared_tokens_2 = create_scatter_plot(
|
278 |
+
df, y_column2, selected_models, "Win Rate"
|
279 |
)
|
280 |
+
fig3, r_squared_words_3, r_squared_tokens_3 = create_scatter_plot(
|
281 |
+
df, y_column3, selected_models, "Discrete Win Rate"
|
282 |
+
)
|
283 |
+
|
284 |
+
# Create tabs for each chart
|
285 |
+
tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])
|
286 |
|
287 |
+
with tab1:
|
288 |
+
col1, col2 = st.columns([3, 2])
|
289 |
+
col1.plotly_chart(fig1)
|
290 |
+
col2.markdown("#### Rankings")
|
291 |
+
prepped_df = prep_rankings_table(df, "length_controlled_winrate")
|
292 |
+
col2.dataframe(
|
293 |
+
prepped_df,
|
294 |
+
hide_index=True,
|
295 |
+
)
|
296 |
+
with st.expander("Trendline R²"):
|
297 |
+
st.markdown(
|
298 |
+
f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}"
|
299 |
)
|
300 |
+
|
301 |
+
with tab2:
|
302 |
+
col1, col2 = st.columns([3, 2])
|
303 |
+
col1.plotly_chart(fig2)
|
304 |
+
col2.markdown("#### Rankings")
|
305 |
+
prepped_df = prep_rankings_table(df, "win_rate")
|
306 |
+
col2.dataframe(
|
307 |
+
prepped_df,
|
308 |
+
hide_index=True,
|
309 |
+
)
|
310 |
+
with st.expander("Trendline R²"):
|
311 |
+
st.markdown(
|
312 |
+
f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}"
|
313 |
+
)
|
314 |
+
|
315 |
+
with tab3:
|
316 |
+
col1, col2 = st.columns([3, 2])
|
317 |
+
col1.plotly_chart(fig3)
|
318 |
+
col2.markdown("#### Rankings")
|
319 |
+
prepped_df = prep_rankings_table(df, "discrete_win_rate")
|
320 |
+
col2.dataframe(
|
321 |
+
prepped_df,
|
322 |
+
hide_index=True,
|
323 |
+
)
|
324 |
+
with st.expander("Trendline R²"):
|
325 |
+
st.markdown(
|
326 |
+
f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
|
327 |
+
)
|
328 |
+
|
329 |
+
st.markdown("## Length bias in battles")
|
330 |
+
|
331 |
+
df_response_judging_copy = df_response_judging.copy()
|
332 |
+
if not selected_models:
|
333 |
+
df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[
|
334 |
+
"output_1"
|
335 |
+
].apply(lambda x: len(x.split()))
|
336 |
+
df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[
|
337 |
+
"output_2"
|
338 |
+
].apply(lambda x: len(x.split()))
|
339 |
+
df_response_judging_copy["output_num_words_diff"] = (
|
340 |
+
df_response_judging_copy["output_1_num_words"]
|
341 |
+
- df_response_judging_copy["output_2_num_words"]
|
342 |
+
)
|
343 |
+
df_response_judging_copy["assigned_preference"] = (
|
344 |
+
df_response_judging_copy["preference"]
|
345 |
+
.round(0)
|
346 |
+
.apply(get_preference_from_rounded_score)
|
347 |
+
)
|
348 |
+
else:
|
349 |
+
df_response_judging_copy = df_response_judging_copy[
|
350 |
+
df_response_judging_copy["generator_2"].isin(selected_models)
|
351 |
+
]
|
352 |
+
df_response_judging_copy["output_1_num_words"] = df_response_judging_copy[
|
353 |
+
"output_1"
|
354 |
+
].apply(lambda x: len(x.split()))
|
355 |
+
df_response_judging_copy["output_2_num_words"] = df_response_judging_copy[
|
356 |
+
"output_2"
|
357 |
+
].apply(lambda x: len(x.split()))
|
358 |
+
df_response_judging_copy["output_num_words_diff"] = (
|
359 |
+
df_response_judging_copy["output_1_num_words"]
|
360 |
+
- df_response_judging_copy["output_2_num_words"]
|
361 |
+
)
|
362 |
+
df_response_judging_copy["assigned_preference"] = (
|
363 |
+
df_response_judging_copy["preference"]
|
364 |
+
.round(0)
|
365 |
+
.apply(get_preference_from_rounded_score)
|
366 |
+
)
|
367 |
+
|
368 |
+
col1, col2 = st.columns(2)
|
369 |
+
fig = px.scatter(
|
370 |
+
df_response_judging_copy,
|
371 |
+
x="output_1_num_words",
|
372 |
+
y="output_2_num_words",
|
373 |
+
color="assigned_preference",
|
374 |
+
title=f"Pairwise preference based on response length",
|
375 |
+
labels={
|
376 |
+
"output_1_num_words": f"{fixed_model} (1) number of words",
|
377 |
+
"output_2_num_words": "Target model (2) number of words",
|
378 |
+
},
|
379 |
+
color_discrete_map={
|
380 |
+
"[1>2]": "blue",
|
381 |
+
"[2>1]": "orangered",
|
382 |
+
"[1=2]": "green",
|
383 |
+
},
|
384 |
+
)
|
385 |
+
col1.plotly_chart(fig)
|
386 |
+
|
387 |
+
# Plot of output_num_words_diff histogram, colored by assigned_preference.
|
388 |
+
fig = px.histogram(
|
389 |
+
df_response_judging_copy,
|
390 |
+
x="output_num_words_diff",
|
391 |
+
color="assigned_preference",
|
392 |
+
title=f"Pairwise preference counts based on difference in response length",
|
393 |
+
color_discrete_map={
|
394 |
+
"[1>2]": "blue",
|
395 |
+
"[2>1]": "orangered",
|
396 |
+
"[1=2]": "green",
|
397 |
+
},
|
398 |
+
range_x=[-500, 500],
|
399 |
+
labels={
|
400 |
+
"output_num_words_diff": "Length difference in words between gpt4_1106_preview and target model"
|
401 |
+
},
|
402 |
+
)
|
403 |
+
col2.plotly_chart(fig)
|
404 |
+
|
405 |
+
with st.expander("Raw data"):
|
406 |
+
st.dataframe(df)
|
407 |
+
|
408 |
+
# Data explorer
|
409 |
+
with outer_tabs[1]:
|
410 |
+
# Add randomize button at the top of the app
|
411 |
+
st.markdown("## Choose example")
|
412 |
+
st.button(
|
413 |
+
":game_die: Randomize!",
|
414 |
+
on_click=randomize_selection,
|
415 |
+
type="primary",
|
416 |
+
)
|
417 |
+
|
418 |
+
left_col, right_col = st.columns([1, 3])
|
419 |
+
|
420 |
+
st.session_state.selected_dataset = left_col.selectbox(
|
421 |
+
"Select Dataset",
|
422 |
+
["all"] + df_response_judging["dataset"].dropna().unique().tolist(),
|
423 |
+
key="dataset_selector",
|
424 |
+
on_change=update_instruction_options,
|
425 |
+
)
|
426 |
+
update_instruction_options()
|
427 |
+
st.session_state.selected_instruction = right_col.selectbox(
|
428 |
+
f"Select Instruction ({len(st.session_state.instruction_options)} unique instructions)",
|
429 |
+
st.session_state.instruction_options,
|
430 |
+
key="instruction_selector",
|
431 |
+
on_change=update_instruction,
|
432 |
+
index=(
|
433 |
+
st.session_state.instruction_options.index(
|
434 |
+
st.session_state.selected_instruction
|
435 |
+
)
|
436 |
+
if st.session_state.selected_instruction
|
437 |
+
in st.session_state.instruction_options
|
438 |
+
else 0
|
439 |
),
|
|
|
|
|
440 |
)
|
441 |
|
442 |
+
# All the models.
|
443 |
+
all_models_judgings_details = df_response_judging[
|
444 |
+
(df_response_judging["generator_1"] == fixed_model)
|
445 |
+
& (
|
446 |
+
df_response_judging["instruction"]
|
447 |
+
== st.session_state.selected_instruction
|
448 |
+
)
|
449 |
+
]
|
450 |
|
451 |
+
st.divider()
|
|
|
|
|
452 |
|
453 |
+
st.markdown(f"## Selected instruction")
|
454 |
+
st.info(st.session_state.selected_instruction)
|
455 |
+
|
456 |
+
st.divider()
|
457 |
+
|
458 |
+
st.markdown(f"## Overall Battles")
|
459 |
+
all_models_judgings_details["output_1_num_words"] = all_models_judgings_details[
|
460 |
+
"output_1"
|
461 |
+
].apply(lambda x: len(x.split()))
|
462 |
+
all_models_judgings_details["output_2_num_words"] = all_models_judgings_details[
|
463 |
+
"output_2"
|
464 |
+
].apply(lambda x: len(x.split()))
|
465 |
+
all_models_judgings_details["output_num_words_diff"] = (
|
466 |
+
all_models_judgings_details["output_1_num_words"]
|
467 |
+
- all_models_judgings_details["output_2_num_words"]
|
468 |
+
)
|
469 |
+
all_models_judgings_details["assigned_preference"] = (
|
470 |
+
all_models_judgings_details["preference"]
|
471 |
+
.round(0)
|
472 |
+
.apply(get_preference_from_rounded_score)
|
473 |
+
)
|
474 |
+
|
475 |
+
# st.write(all_models_judgings_details)
|
476 |
+
|
477 |
+
col1, col2, col3 = st.columns(3)
|
478 |
+
|
479 |
+
fig = px.histogram(
|
480 |
+
all_models_judgings_details,
|
481 |
+
x="output_num_words_diff",
|
482 |
+
color="assigned_preference",
|
483 |
+
title=f"Pairwise preference counts based on difference in response length",
|
484 |
+
color_discrete_map={
|
485 |
+
"[1>2]": "blue",
|
486 |
+
"[2>1]": "orangered",
|
487 |
+
"[1=2]": "green",
|
488 |
+
},
|
489 |
+
range_x=[-500, 500],
|
490 |
+
labels={
|
491 |
+
"output_num_words_diff": "Difference in number of words between response 1 and 2.",
|
492 |
+
"assigned_preference": "Assigned Preference",
|
493 |
+
},
|
494 |
+
)
|
495 |
+
col1.plotly_chart(fig)
|
496 |
+
|
497 |
+
# Plot of assigned preference counts.
|
498 |
+
fig = px.histogram(
|
499 |
+
all_models_judgings_details,
|
500 |
+
x="assigned_preference",
|
501 |
+
title=f"Assigned preferences for {fixed_model} vs. all models",
|
502 |
+
)
|
503 |
+
col2.plotly_chart(fig)
|
504 |
+
|
505 |
+
# Models that are better than the fixed model.
|
506 |
+
num_words_for_fixed_model = len(
|
507 |
+
all_models_judgings_details.iloc[0]["output_1"].split()
|
508 |
+
)
|
509 |
+
better_models = all_models_judgings_details[
|
510 |
+
all_models_judgings_details["assigned_preference"] == "[2>1]"
|
511 |
+
]
|
512 |
+
|
513 |
+
shorter_models = better_models[
|
514 |
+
better_models["output_2_num_words"] <= num_words_for_fixed_model
|
515 |
+
]
|
516 |
+
longer_models = better_models[
|
517 |
+
better_models["output_2_num_words"] > num_words_for_fixed_model
|
518 |
+
]
|
519 |
+
col3.markdown(
|
520 |
+
f"### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
|
521 |
+
)
|
522 |
+
if shorter_models.size != 0:
|
523 |
+
shorter_models_string = ""
|
524 |
+
for _, shorter_model in shorter_models.iterrows():
|
525 |
+
if shorter_model["generator_2"] != fixed_model:
|
526 |
+
shorter_models_string += f"- {shorter_model['generator_2']} ({shorter_model['output_2_num_words']})\n"
|
527 |
+
col3.markdown("**With shorter or equal length responses:**")
|
528 |
+
col3.markdown(shorter_models_string)
|
529 |
+
else:
|
530 |
+
col3.write("None")
|
531 |
+
if longer_models.size != 0:
|
532 |
+
longer_models_string = ""
|
533 |
+
for _, longer_model in longer_models.iterrows():
|
534 |
+
if longer_model["generator_2"] != fixed_model:
|
535 |
+
longer_models_string += f"- {longer_model['generator_2']} ({longer_model['output_2_num_words']})\n"
|
536 |
+
col3.markdown("**With longer responses:**")
|
537 |
+
col3.markdown(longer_models_string)
|
538 |
+
else:
|
539 |
+
col3.write("None")
|
540 |
+
|
541 |
+
# Judging details.
|
542 |
+
st.markdown(f"## Individual Battle Details")
|
543 |
+
judging_details = df_response_judging[
|
544 |
+
(df_response_judging["generator_1"] == fixed_model)
|
545 |
+
& (df_response_judging["generator_2"] == st.session_state.selected_model)
|
546 |
+
& (
|
547 |
+
df_response_judging["instruction"]
|
548 |
+
== st.session_state.selected_instruction
|
549 |
+
)
|
550 |
+
]
|
551 |
+
|
552 |
+
# if not judging_details.empty:
|
553 |
+
if not judging_details["preference"].empty:
|
554 |
+
preference = get_preference(judging_details["preference"])
|
555 |
+
if preference == "[1>2]":
|
556 |
+
st.write(
|
557 |
+
f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
|
558 |
+
)
|
559 |
+
else:
|
560 |
+
st.write(
|
561 |
+
f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
|
562 |
+
)
|
563 |
+
st.write(
|
564 |
+
f"- **Score:** {judging_details['preference'].round(2).item()}\n- **Assigned preference:** {preference}"
|
565 |
+
)
|
566 |
+
|
567 |
+
with st.expander("Additional information"):
|
568 |
+
st.write(
|
569 |
+
judging_details[
|
570 |
+
[
|
571 |
+
"instruction",
|
572 |
+
"time_per_example",
|
573 |
+
"price_per_example",
|
574 |
+
"raw_completion",
|
575 |
+
]
|
576 |
+
]
|
577 |
+
)
|
578 |
+
|
579 |
+
# Create two columns for model selectors
|
580 |
+
st.markdown("## Responses")
|
581 |
+
col1, col2 = st.columns(2)
|
582 |
+
|
583 |
+
with col1:
|
584 |
+
st.selectbox(
|
585 |
+
"Reference model",
|
586 |
+
[fixed_model],
|
587 |
+
key="fixed_model",
|
588 |
+
)
|
589 |
+
|
590 |
+
# Get the response string for the fixed model
|
591 |
+
if st.session_state.selected_instruction:
|
592 |
+
preference = get_preference(judging_details["preference"])
|
593 |
+
response_details_fixed = df_response_judging[
|
594 |
+
(
|
595 |
+
df_response_judging["instruction"]
|
596 |
+
== st.session_state.selected_instruction
|
597 |
+
)
|
598 |
+
& (df_response_judging["generator_1"] == fixed_model)
|
599 |
+
].iloc[0]
|
600 |
+
|
601 |
+
st.write(
|
602 |
+
f'Number of words: {len(response_details_fixed["output_1"].split())}'
|
603 |
+
)
|
604 |
+
|
605 |
+
# Display the response string
|
606 |
+
if preference == "[1>2]":
|
607 |
+
st.success(response_details_fixed["output_1"])
|
608 |
+
else:
|
609 |
+
st.error(response_details_fixed["output_1"])
|
610 |
+
|
611 |
+
with col2:
|
612 |
+
st.session_state.selected_model = st.selectbox(
|
613 |
+
"Select Model",
|
614 |
+
model_options,
|
615 |
+
key="model_selector",
|
616 |
+
on_change=update_model,
|
617 |
+
index=(
|
618 |
+
model_options.index(st.session_state.selected_model)
|
619 |
+
if st.session_state.selected_model
|
620 |
+
else 0
|
621 |
+
),
|
622 |
+
)
|
623 |
+
|
624 |
+
# Get the response string for the selected model
|
625 |
+
if (
|
626 |
+
st.session_state.selected_model
|
627 |
+
and st.session_state.selected_instruction
|
628 |
+
):
|
629 |
+
response_details_dynamic = df_response_judging[
|
630 |
+
(
|
631 |
+
df_response_judging["instruction"]
|
632 |
+
== st.session_state.selected_instruction
|
633 |
+
)
|
634 |
+
& (
|
635 |
+
df_response_judging["generator_2"]
|
636 |
+
== st.session_state.selected_model
|
637 |
+
)
|
638 |
+
].iloc[0]
|
639 |
+
|
640 |
+
st.write(
|
641 |
+
f'Number of words: {len(response_details_dynamic["output_2"].split())}'
|
642 |
+
)
|
643 |
|
644 |
+
# Display the response string
|
645 |
+
if preference == "[2>1]":
|
646 |
+
st.success(response_details_dynamic["output_2"])
|
647 |
+
else:
|
648 |
+
st.error(response_details_dynamic["output_2"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
649 |
|
650 |
|
651 |
if __name__ == "__main__":
|
data/df_response_judging.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5514fedae0d0375f5711fad6fdbf37c8e5e09178d16c29052db0256d09cf2240
|
3 |
+
size 214865144
|
data/df_responses.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbdf08ff250eb5104ee69244672a4baa184b8ae2f928b5c56344102d20926c0d
|
3 |
+
size 89555570
|
data/model_win_rates.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"num_words_mean":{"gpt-3.5-turbo-0301":136,"gpt-3.5-turbo-1106_verbose":166,"vicuna-13b-v1.5-togetherai":177,"Qwen1.5-1.8B-Chat":426,"recycled-wizardlm-7b-v1.0":235,"aligner-2b_claude-3-opus-20240229":257,"Qwen1.5-110B-Chat":253,"claude-3-opus-20240229":216,"llama-2-7b-chat-hf":241,"mistral-medium":241,"vicuna-33b-v1.3":237,"cohere":315,"claude-2":174,"guanaco-65b":203,"Mixtral-8x7B-Instruct-v0.1":238,"openchat-v2-w-13b":249,"falcon-7b-instruct":74,"wizardlm-13b-v1.1":244,"Meta-Llama-3-8B-Instruct":301,"FsfairX-Zephyr-Chat-v0.1":342,"Infinity-Instruct-3M-0613-Mistral-7B":79,"Qwen1.5-72B-Chat":243,"xwinlm-7b-v0.1":304,"Mixtral-8x22B-Instruct-v0.1":229,"vicuna-13b-v1.5":177,"dbrx-instruct":235,"zephyr-7b-alpha":208,"tulu-2-dpo-13b":257,"Qwen1.5-7B-Chat":254,"Together-MoA-Lite":297,"cut-13b":269,"Meta-Llama-3-70B-Instruct":299,"vicuna-13b-v1.3":189,"claude-instant-1.2":179,"airoboros-65b":236,"openbuddy-llama2-13b-v11.1":177,"phi-2":102,"Together-MoA":272,"mistral-large-2402":218,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":296,"pairrm-tulu-2-13b":236,"recycled-wizardlm-7b-v2.0":251,"Storm-7B-best-of-64":336,"vicuna-7b":175,"claude-3-sonnet-20240229":221,"Mistral-7B-Instruct-v0.2":261,"Samba-CoE-v0.1":190,"claude":176,"Nanbeige2-8B-Chat":415,"REBEL-Llama-3-8B-Instruct":341,"chatglm2-6b":175,"gpt-4o-2024-05-13":286,"gpt4_1106_preview_verbose":378,"TempNet-LLaMA2-Chat-13B-v0.1":253,"text_davinci_001":50,"Mixtral-8x7B-Instruct-v0.1_verbose":336,"baize-v2-7b":189,"phi-2-dpo":270,"alpaca-farm-ppo-human":135,"Nanbeige2-16B-Chat":284,"gpt4_0613":183,"pythia-12b-mix-sft":147,"alpaca-7b-neft":170,"Qwen1.5-14B-Chat":252,"gpt-4-0125-preview":313,"guanaco-33b":220,"oasst-sft-llama-33b":125,"gpt4_0613_verbose":237,"llama-2-chat-7b-evol70k-neft":260,"gpt35_turbo_instruct":166,"platolm-7b":209,"llama-2-13b-chat-hf":249,"Nanbeige-Plus-Chat-v0.1":391,"openchat-v2-13b":249,"mistral-orpo-beta":263,"Snorkel-Mistral-PairRM-DPO-best-of-16":393,"tulu-2-dpo-7b":278,"alpaca-7
b_verbose":90,"OpenHermes-2.5-Mistral-7B":176,"claude-2.1_verbose":228,"ultralm-13b-v2.0":231,"deita-7b-v1.0":228,"minichat-1.5-3b":242,"Qwen-14B-Chat":167,"airoboros-33b":238,"alpaca-farm-ppo-sim-gpt4-20k":82,"ultralm-13b":181,"openbuddy-falcon-40b-v9":182,"openchat8192-13b":268,"wizardlm-13b":163,"vicuna-13b":175,"merlinite-7B-AOT":281,"gpt4_0314":215,"gpt4_0613_concise":99,"jina-chat":106,"Contextual-KTO-Mistral-PairRM":381,"xwinlm-13b-v0.1":300,"LMCocktail-10.7B-v1":182,"SPPO-Mistral7B-PairRM-ExPO":344,"Mixtral-8x7B-Instruct-v0.1_concise":144,"gpt4_1106_preview_concise":177,"Mistral-7B-ReMax-v0.1":234,"Llama-3-Instruct-8B-SimPO-ExPO":261,"dolphin-2.2.1-mistral-7b":182,"humpback-llama2-70b":178,"openpipe-moa-gpt-4-turbo-v1":272,"vicuna-7b-v1.5":181,"Starling-LM-7B-alpha":301,"falcon-40b-instruct":109,"Samba-CoE-v0.2-best-of-16":225,"opencoderplus-15b":262,"xwinlm-70b-v0.1":282,"wizardlm-13b-v1.2":264,"aligner-2b_qwen1.5-72b-chat":280,"internlm2-chat-7b-ExPO":364,"claude-2.1":177,"vicuna-7b-v1.3":184,"oasst-rlhf-llama-33b":181,"zephyr-7b-alpha-ExPO":201,"openchat-v3.1-13b":235,"SPPO-Llama-3-Instruct-8B-PairRM":317,"minotaur-13b":138,"tulu-2-dpo-13b-ExPO":264,"zephyr-7b-beta-ExPO":224,"tulu-2-dpo-7b-ExPO":277,"Llama-3-Instruct-8B-SimPO":272,"baize-v2-13b":155,"guanaco-7b":233,"ultralm-13b-v2.0-best-of-16":273,"claude-2.1_concise":91,"openchat-13b":260,"tulu-2-dpo-70b":231,"deepseek-llm-67b-chat":189,"humpback-llama-65b":196,"tulu-2-dpo-70b-ExPO":276,"TempNet-LLaMA2-Chat-7B-v0.1":246,"nous-hermes-13b":139,"gpt-3.5-turbo-0613":207,"alpaca-7b_concise":58,"baichuan-13b-chat":225,"claude-3-5-sonnet-20240620":228,"gpt-3.5-turbo-1106":126,"minichat-3b":145,"Storm-7B":300,"oasst-sft-pythia-12b":118,"Conifer-7B-DPO":205,"Snorkel-Mistral-PairRM-DPO":414,"internlm2-chat-20b-ExPO":507,"Samba-CoE-v0.2":210,"gemini-pro":228,"pairrm-tulu-2-70b":264,"text_davinci_003":52,"gpt4":215,"Yi-34B-Chat":339,"Starling-LM-7B-beta-ExPO":336,"pairrm-Yi-34B-Chat":349,"gpt4_1106_preview":323,"e
vo-7b":280,"zephyr-7b-beta":229,"guanaco-13b":308,"alpaca-7b":66,"internlm2-chat-20b-ppo":371,"gemma-2b-it":165,"pairrm-zephyr-7b-beta":236,"evo-v2-7b":274,"causallm-14b":228,"SPPO-Mistral7B-PairRM":322,"gpt-3.5-turbo-1106_concise":68,"openbuddy-llama-65b-v8":194,"claude2-alpaca-13b":186,"Starling-LM-7B-alpha-ExPO":288,"openbuddy-falcon-7b-v6":192,"gemma-7b-it":176,"phi-2-sft":175,"gpt4_gamed":11,"llama-2-70b-chat-hf":292,"openbuddy-llama2-70b-v10.1":178,"wizardlm-70b":249,"ultralm-13b-best-of-16":311},"num_words_std":{"gpt-3.5-turbo-0301":109,"gpt-3.5-turbo-1106_verbose":110,"vicuna-13b-v1.5-togetherai":119,"Qwen1.5-1.8B-Chat":358,"recycled-wizardlm-7b-v1.0":129,"aligner-2b_claude-3-opus-20240229":145,"Qwen1.5-110B-Chat":153,"claude-3-opus-20240229":113,"llama-2-7b-chat-hf":134,"mistral-medium":166,"vicuna-33b-v1.3":149,"cohere":179,"claude-2":90,"guanaco-65b":157,"Mixtral-8x7B-Instruct-v0.1":163,"openchat-v2-w-13b":151,"falcon-7b-instruct":92,"wizardlm-13b-v1.1":156,"Meta-Llama-3-8B-Instruct":167,"FsfairX-Zephyr-Chat-v0.1":187,"Infinity-Instruct-3M-0613-Mistral-7B":78,"Qwen1.5-72B-Chat":144,"xwinlm-7b-v0.1":207,"Mixtral-8x22B-Instruct-v0.1":168,"vicuna-13b-v1.5":122,"dbrx-instruct":143,"zephyr-7b-alpha":189,"tulu-2-dpo-13b":309,"Qwen1.5-7B-Chat":143,"Together-MoA-Lite":161,"cut-13b":159,"Meta-Llama-3-70B-Instruct":171,"vicuna-13b-v1.3":119,"claude-instant-1.2":104,"airoboros-65b":288,"openbuddy-llama2-13b-v11.1":122,"phi-2":209,"Together-MoA":151,"mistral-large-2402":148,"openbuddy-llama-30b-v7.1":115,"TempNet-LLaMA2-Chat-70B-v0.1":196,"pairrm-tulu-2-13b":135,"recycled-wizardlm-7b-v2.0":134,"Storm-7B-best-of-64":195,"vicuna-7b":124,"claude-3-sonnet-20240229":129,"Mistral-7B-Instruct-v0.2":201,"Samba-CoE-v0.1":138,"claude":97,"Nanbeige2-8B-Chat":173,"REBEL-Llama-3-8B-Instruct":273,"chatglm2-6b":225,"gpt-4o-2024-05-13":190,"gpt4_1106_preview_verbose":190,"TempNet-LLaMA2-Chat-13B-v0.1":136,"text_davinci_001":51,"Mixtral-8x7B-Instruct-v0.1_verbose":169,"baize-v2-7b":1
33,"phi-2-dpo":137,"alpaca-farm-ppo-human":125,"Nanbeige2-16B-Chat":151,"gpt4_0613":124,"pythia-12b-mix-sft":122,"alpaca-7b-neft":82,"Qwen1.5-14B-Chat":145,"gpt-4-0125-preview":180,"guanaco-33b":170,"oasst-sft-llama-33b":115,"gpt4_0613_verbose":127,"llama-2-chat-7b-evol70k-neft":113,"gpt35_turbo_instruct":157,"platolm-7b":145,"llama-2-13b-chat-hf":135,"Nanbeige-Plus-Chat-v0.1":159,"openchat-v2-13b":151,"mistral-orpo-beta":161,"Snorkel-Mistral-PairRM-DPO-best-of-16":185,"tulu-2-dpo-7b":415,"alpaca-7b_verbose":68,"OpenHermes-2.5-Mistral-7B":154,"claude-2.1_verbose":94,"ultralm-13b-v2.0":137,"deita-7b-v1.0":160,"minichat-1.5-3b":157,"Qwen-14B-Chat":124,"airoboros-33b":246,"alpaca-farm-ppo-sim-gpt4-20k":53,"ultralm-13b":108,"openbuddy-falcon-40b-v9":124,"openchat8192-13b":182,"wizardlm-13b":117,"vicuna-13b":115,"merlinite-7B-AOT":128,"gpt4_0314":160,"gpt4_0613_concise":79,"jina-chat":76,"Contextual-KTO-Mistral-PairRM":205,"xwinlm-13b-v0.1":188,"LMCocktail-10.7B-v1":118,"SPPO-Mistral7B-PairRM-ExPO":144,"Mixtral-8x7B-Instruct-v0.1_concise":131,"gpt4_1106_preview_concise":130,"Mistral-7B-ReMax-v0.1":132,"Llama-3-Instruct-8B-SimPO-ExPO":134,"dolphin-2.2.1-mistral-7b":167,"humpback-llama2-70b":133,"openpipe-moa-gpt-4-turbo-v1":137,"vicuna-7b-v1.5":117,"Starling-LM-7B-alpha":194,"falcon-40b-instruct":111,"Samba-CoE-v0.2-best-of-16":140,"opencoderplus-15b":194,"xwinlm-70b-v0.1":169,"wizardlm-13b-v1.2":188,"aligner-2b_qwen1.5-72b-chat":151,"internlm2-chat-7b-ExPO":172,"claude-2.1":92,"vicuna-7b-v1.3":117,"oasst-rlhf-llama-33b":174,"zephyr-7b-alpha-ExPO":149,"openchat-v3.1-13b":156,"SPPO-Llama-3-Instruct-8B-PairRM":148,"minotaur-13b":109,"tulu-2-dpo-13b-ExPO":139,"zephyr-7b-beta-ExPO":157,"tulu-2-dpo-7b-ExPO":144,"Llama-3-Instruct-8B-SimPO":141,"baize-v2-13b":110,"guanaco-7b":200,"ultralm-13b-v2.0-best-of-16":131,"claude-2.1_concise":68,"openchat-13b":168,"tulu-2-dpo-70b":151,"deepseek-llm-67b-chat":122,"humpback-llama-65b":122,"tulu-2-dpo-70b-ExPO":140,"TempNet-LLaMA2-Chat-7B-v
0.1":140,"nous-hermes-13b":130,"gpt-3.5-turbo-0613":149,"alpaca-7b_concise":45,"baichuan-13b-chat":301,"claude-3-5-sonnet-20240620":142,"gpt-3.5-turbo-1106":103,"minichat-3b":119,"Storm-7B":135,"oasst-sft-pythia-12b":120,"Conifer-7B-DPO":126,"Snorkel-Mistral-PairRM-DPO":241,"internlm2-chat-20b-ExPO":206,"Samba-CoE-v0.2":138,"gemini-pro":164,"pairrm-tulu-2-70b":212,"text_davinci_003":66,"gpt4":156,"Yi-34B-Chat":178,"Starling-LM-7B-beta-ExPO":106,"pairrm-Yi-34B-Chat":189,"gpt4_1106_preview":181,"evo-7b":176,"zephyr-7b-beta":160,"guanaco-13b":288,"alpaca-7b":52,"internlm2-chat-20b-ppo":268,"gemma-2b-it":128,"pairrm-zephyr-7b-beta":178,"evo-v2-7b":170,"causallm-14b":166,"SPPO-Mistral7B-PairRM":148,"gpt-3.5-turbo-1106_concise":57,"openbuddy-llama-65b-v8":113,"claude2-alpaca-13b":127,"Starling-LM-7B-alpha-ExPO":174,"openbuddy-falcon-7b-v6":126,"gemma-7b-it":118,"phi-2-sft":123,"gpt4_gamed":60,"llama-2-70b-chat-hf":174,"openbuddy-llama2-70b-v10.1":118,"wizardlm-70b":144,"ultralm-13b-best-of-16":132},"win_rate":{"gpt-3.5-turbo-0301":9.6224532951,"gpt-3.5-turbo-1106_verbose":12.7631698103,"vicuna-13b-v1.5-togetherai":6.9582753694,"Qwen1.5-1.8B-Chat":3.7055568157,"recycled-wizardlm-7b-v1.0":6.6327499605,"aligner-2b_claude-3-opus-20240229":34.4633736232,"Qwen1.5-110B-Chat":33.7770952757,"claude-3-opus-20240229":29.1052695333,"llama-2-7b-chat-hf":4.9613395472,"mistral-medium":21.8557725437,"vicuna-33b-v1.3":12.7059479215,"cohere":12.9014552097,"claude-2":17.1882403567,"guanaco-65b":6.8584945134,"Mixtral-8x7B-Instruct-v0.1":18.2553176264,"openchat-v2-w-13b":9.6153441584,"falcon-7b-instruct":2.1466175532,"wizardlm-13b-v1.1":11.2339095729,"Meta-Llama-3-8B-Instruct":22.5699026093,"FsfairX-Zephyr-Chat-v0.1":35.9464864409,"Infinity-Instruct-3M-0613-Mistral-7B":15.7478281307,"Qwen1.5-72B-Chat":26.4982833956,"xwinlm-7b-v0.1":11.2456517378,"Mixtral-8x22B-Instruct-v0.1":22.2101705475,"vicuna-13b-v1.5":6.7221220149,"dbrx-instruct":19.7553327319,"zephyr-7b-alpha":8.3526639682,"tulu-2-dpo-1
3b":10.1197883883,"Qwen1.5-7B-Chat":11.7709270696,"Together-MoA-Lite":56.5930456223,"cut-13b":10.7790892025,"Meta-Llama-3-70B-Instruct":33.1778569588,"vicuna-13b-v1.3":7.1372403865,"claude-instant-1.2":16.1273996216,"airoboros-65b":9.3889501497,"openbuddy-llama2-13b-v11.1":6.1747164895,"phi-2":2.350209543,"Together-MoA":59.8688062333,"mistral-large-2402":21.4387759814,"openbuddy-llama-30b-v7.1":6.130014614,"TempNet-LLaMA2-Chat-70B-v0.1":15.0518944202,"pairrm-tulu-2-13b":13.8319010168,"recycled-wizardlm-7b-v2.0":7.3371293705,"Storm-7B-best-of-64":63.0409907519,"vicuna-7b":4.1626111623,"claude-3-sonnet-20240229":25.5563252923,"Mistral-7B-Instruct-v0.2":14.7227726577,"Samba-CoE-v0.1":16.8355018701,"claude":16.9853436124,"Nanbeige2-8B-Chat":39.354502072,"REBEL-Llama-3-8B-Instruct":34.3064238313,"chatglm2-6b":2.7621847965,"gpt-4o-2024-05-13":51.3275757825,"gpt4_1106_preview_verbose":64.303601471,"TempNet-LLaMA2-Chat-13B-v0.1":7.7284050659,"text_davinci_001":2.7640052311,"Mixtral-8x7B-Instruct-v0.1_verbose":24.6140630502,"baize-v2-7b":3.4048149775,"phi-2-dpo":7.7570957018,"alpaca-farm-ppo-human":4.100426815,"Nanbeige2-16B-Chat":37.0360860499,"gpt4_0613":15.7550380876,"pythia-12b-mix-sft":2.578090281,"alpaca-7b-neft":3.1321786695,"Qwen1.5-14B-Chat":18.6458143619,"gpt-4-0125-preview":54.9665397329,"guanaco-33b":5.002493725,"oasst-sft-llama-33b":4.7703909916,"gpt4_0613_verbose":23.2373600435,"llama-2-chat-7b-evol70k-neft":7.6023835122,"gpt35_turbo_instruct":8.4624465044,"platolm-7b":6.3208280585,"llama-2-13b-chat-hf":7.7023099579,"Nanbeige-Plus-Chat-v0.1":56.7030097302,"openchat-v2-13b":8.4350756447,"mistral-orpo-beta":12.5654087946,"Snorkel-Mistral-PairRM-DPO-best-of-16":34.8601328913,"tulu-2-dpo-7b":8.1975153845,"alpaca-7b_verbose":2.9331016025,"OpenHermes-2.5-Mistral-7B":10.3404157058,"claude-2.1_verbose":24.3540710901,"ultralm-13b-v2.0":7.5046229557,"deita-7b-v1.0":12.6466394724,"minichat-1.5-3b":6.5534430528,"Qwen-14B-Chat":7.5023334847,"airoboros-33b":9.0531603961,"alp
aca-farm-ppo-sim-gpt4-20k":3.4503419871,"ultralm-13b":5.0745903805,"openbuddy-falcon-40b-v9":5.9557428463,"openchat8192-13b":7.472766808,"wizardlm-13b":5.8781525894,"vicuna-13b":5.8311031845,"merlinite-7B-AOT":29.8963508407,"gpt4_0314":22.0732589287,"gpt4_0613_concise":9.4003205746,"jina-chat":7.7861303934,"Contextual-KTO-Mistral-PairRM":33.2273552,"xwinlm-13b-v0.1":17.4279347502,"LMCocktail-10.7B-v1":13.1534309174,"SPPO-Mistral7B-PairRM-ExPO":35.4431306717,"Mixtral-8x7B-Instruct-v0.1_concise":13.7440401548,"gpt4_1106_preview_concise":22.9201944405,"Mistral-7B-ReMax-v0.1":15.999331369,"Llama-3-Instruct-8B-SimPO-ExPO":40.6328540086,"dolphin-2.2.1-mistral-7b":9.0397997282,"humpback-llama2-70b":10.1217715026,"openpipe-moa-gpt-4-turbo-v1":63.1549345123,"vicuna-7b-v1.5":4.7974939392,"Starling-LM-7B-alpha":14.2459235216,"falcon-40b-instruct":3.3429188225,"Samba-CoE-v0.2-best-of-16":26.9882543183,"opencoderplus-15b":7.406222451,"xwinlm-70b-v0.1":21.8129570739,"wizardlm-13b-v1.2":12.0274803428,"aligner-2b_qwen1.5-72b-chat":31.773037737,"internlm2-chat-7b-ExPO":28.0678174371,"claude-2.1":15.7335067364,"vicuna-7b-v1.3":4.6425118575,"oasst-rlhf-llama-33b":6.2964347858,"zephyr-7b-alpha-ExPO":10.5593543457,"openchat-v3.1-13b":11.0822304894,"SPPO-Llama-3-Instruct-8B-PairRM":39.6728609061,"minotaur-13b":5.7389636691,"tulu-2-dpo-13b-ExPO":15.5514054294,"zephyr-7b-beta-ExPO":11.0611168323,"tulu-2-dpo-7b-ExPO":11.529221039,"Llama-3-Instruct-8B-SimPO":40.5297749846,"baize-v2-13b":4.5905453306,"guanaco-7b":2.8800022662,"ultralm-13b-v2.0-best-of-16":13.8533734712,"claude-2.1_concise":9.2271252406,"openchat-13b":8.0223860109,"tulu-2-dpo-70b":15.9828543741,"deepseek-llm-67b-chat":12.0934222649,"humpback-llama-65b":9.4251390478,"tulu-2-dpo-70b-ExPO":22.9806197059,"TempNet-LLaMA2-Chat-7B-v0.1":5.4301432647,"nous-hermes-13b":5.4118789332,"gpt-3.5-turbo-0613":14.0957985739,"alpaca-7b_concise":1.9911763835,"baichuan-13b-chat":1.9921455615,"claude-3-5-sonnet-20240620":40.5602140968,"gpt-3.5-tur
bo-1106":9.177964562,"minichat-3b":3.0071507064,"Storm-7B":50.2688690553,"oasst-sft-pythia-12b":1.7901140832,"Conifer-7B-DPO":11.3135856492,"Snorkel-Mistral-PairRM-DPO":30.2200527007,"internlm2-chat-20b-ExPO":46.1853674689,"Samba-CoE-v0.2":21.8473786693,"gemini-pro":18.1776445406,"pairrm-tulu-2-70b":18.6389629674,"text_davinci_003":1.9621476654,"gpt4":23.5767893148,"Yi-34B-Chat":29.6599467188,"Starling-LM-7B-beta-ExPO":29.6008518479,"pairrm-Yi-34B-Chat":31.2412829468,"gpt4_1106_preview":50.0,"evo-7b":15.5774373995,"zephyr-7b-beta":10.9928857554,"guanaco-13b":3.4695968597,"alpaca-7b":2.5914505402,"internlm2-chat-20b-ppo":21.7491545005,"gemma-2b-it":3.4019714381,"pairrm-zephyr-7b-beta":12.8412782556,"evo-v2-7b":20.8341130226,"causallm-14b":11.14616087,"SPPO-Mistral7B-PairRM":32.2453123638,"gpt-3.5-turbo-1106_concise":7.4158649776,"openbuddy-llama-65b-v8":8.7706501509,"claude2-alpaca-13b":7.4373513248,"Starling-LM-7B-alpha-ExPO":18.1797559203,"openbuddy-falcon-7b-v6":3.521174372,"gemma-7b-it":6.9372943797,"phi-2-sft":3.9775677752,"gpt4_gamed":3.7383373714,"llama-2-70b-chat-hf":13.8882583437,"openbuddy-llama2-70b-v10.1":8.0964220963,"wizardlm-70b":14.3838960868,"ultralm-13b-best-of-16":11.3073149478},"standard_error":{"gpt-3.5-turbo-0301":0.9129656687,"gpt-3.5-turbo-1106_verbose":1.0442468192,"vicuna-13b-v1.5-togetherai":0.7825381738,"Qwen1.5-1.8B-Chat":0.5811750995,"recycled-wizardlm-7b-v1.0":0.7713329914,"aligner-2b_claude-3-opus-20240229":1.3146665263,"Qwen1.5-110B-Chat":1.3776163154,"claude-3-opus-20240229":1.3941539442,"llama-2-7b-chat-hf":0.6691754517,"mistral-medium":1.2682402187,"vicuna-33b-v1.3":0.9992557843,"cohere":1.0141034031,"claude-2":1.1748282562,"guanaco-65b":0.8048449272,"Mixtral-8x7B-Instruct-v0.1":1.1885585969,"openchat-v2-w-13b":0.8908241711,"falcon-7b-instruct":0.4542257929,"wizardlm-13b-v1.1":0.9502711246,"Meta-Llama-3-8B-Instruct":1.2575802331,"FsfairX-Zephyr-Chat-v0.1":1.4410058098,"Infinity-Instruct-3M-0613-Mistral-7B":1.1194852006,"Qwen1.5-72B
-Chat":1.3042361649,"xwinlm-7b-v0.1":0.9455447881,"Mixtral-8x22B-Instruct-v0.1":1.2780740057,"vicuna-13b-v1.5":0.7674173991,"dbrx-instruct":1.2063251121,"zephyr-7b-alpha":0.8664491645,"tulu-2-dpo-13b":0.929813366,"Qwen1.5-7B-Chat":0.9544463489,"Together-MoA-Lite":1.4464848562,"cut-13b":0.9428953579,"Meta-Llama-3-70B-Instruct":1.3886514096,"vicuna-13b-v1.3":0.7846846272,"claude-instant-1.2":1.1341036838,"airoboros-65b":0.8816208133,"openbuddy-llama2-13b-v11.1":0.753544387,"phi-2":0.4496590406,"Together-MoA":1.4343056045,"mistral-large-2402":1.2485232545,"openbuddy-llama-30b-v7.1":0.7645283386,"TempNet-LLaMA2-Chat-70B-v0.1":1.0801507581,"pairrm-tulu-2-13b":1.0835284665,"recycled-wizardlm-7b-v2.0":0.8012012288,"Storm-7B-best-of-64":1.4253258915,"vicuna-7b":0.6135107768,"claude-3-sonnet-20240229":1.3419811052,"Mistral-7B-Instruct-v0.2":1.0785266447,"Samba-CoE-v0.1":1.1180386125,"claude":1.1687959793,"Nanbeige2-8B-Chat":1.4524224246,"REBEL-Llama-3-8B-Instruct":1.3914900256,"chatglm2-6b":0.5020758951,"gpt-4o-2024-05-13":1.470009459,"gpt4_1106_preview_verbose":1.3348590089,"TempNet-LLaMA2-Chat-13B-v0.1":0.8268032188,"text_davinci_001":0.5177668864,"Mixtral-8x7B-Instruct-v0.1_verbose":1.2975757386,"baize-v2-7b":0.5826293992,"phi-2-dpo":0.8357079426,"alpaca-farm-ppo-human":0.6304721407,"Nanbeige2-16B-Chat":1.4340261273,"gpt4_0613":1.0754642482,"pythia-12b-mix-sft":0.5127326717,"alpaca-7b-neft":0.5522241753,"Qwen1.5-14B-Chat":1.1351340211,"gpt-4-0125-preview":1.4286740089,"guanaco-33b":0.6697115752,"oasst-sft-llama-33b":0.6385940189,"gpt4_0613_verbose":1.2835395056,"llama-2-chat-7b-evol70k-neft":0.8110538776,"gpt35_turbo_instruct":0.8724086934,"platolm-7b":0.7405704765,"llama-2-13b-chat-hf":0.8286143394,"Nanbeige-Plus-Chat-v0.1":1.482841875,"openchat-v2-13b":0.8235980231,"mistral-orpo-beta":0.9929774686,"Snorkel-Mistral-PairRM-DPO-best-of-16":1.3599450437,"tulu-2-dpo-7b":0.8749615125,"alpaca-7b_verbose":0.5302092824,"OpenHermes-2.5-Mistral-7B":0.9356553899,"claude-2.1_verbose
":1.29358621,"ultralm-13b-v2.0":0.8150376948,"deita-7b-v1.0":1.0352555321,"minichat-1.5-3b":0.7674159339,"Qwen-14B-Chat":0.8147265702,"airoboros-33b":0.8607792116,"alpaca-farm-ppo-sim-gpt4-20k":0.5834901038,"ultralm-13b":0.6707048924,"openbuddy-falcon-40b-v9":0.7388621614,"openchat8192-13b":0.8038094305,"wizardlm-13b":0.704420227,"vicuna-13b":0.7422829864,"merlinite-7B-AOT":1.3666520485,"gpt4_0314":1.2466725495,"gpt4_0613_concise":0.9010212759,"jina-chat":0.8398450576,"Contextual-KTO-Mistral-PairRM":1.3779687478,"xwinlm-13b-v0.1":1.1450161467,"LMCocktail-10.7B-v1":1.0457195357,"SPPO-Mistral7B-PairRM-ExPO":1.3981308966,"Mixtral-8x7B-Instruct-v0.1_concise":1.0718682992,"gpt4_1106_preview_concise":1.2325177143,"Mistral-7B-ReMax-v0.1":1.1288683901,"Llama-3-Instruct-8B-SimPO-ExPO":1.4439449942,"dolphin-2.2.1-mistral-7b":0.8892901247,"humpback-llama2-70b":0.9401806122,"openpipe-moa-gpt-4-turbo-v1":1.4229800988,"vicuna-7b-v1.5":0.6655960677,"Starling-LM-7B-alpha":1.0685460609,"falcon-40b-instruct":0.5541127159,"Samba-CoE-v0.2-best-of-16":1.318903,"opencoderplus-15b":0.8024858021,"xwinlm-70b-v0.1":1.2303274476,"wizardlm-13b-v1.2":0.9717618177,"aligner-2b_qwen1.5-72b-chat":1.2392772646,"internlm2-chat-7b-ExPO":1.3159792318,"claude-2.1":1.1203158654,"vicuna-7b-v1.3":0.6420919828,"oasst-rlhf-llama-33b":0.7417944201,"zephyr-7b-alpha-ExPO":0.9774634449,"openchat-v3.1-13b":0.9501308701,"SPPO-Llama-3-Instruct-8B-PairRM":1.4247223562,"minotaur-13b":0.7271241247,"tulu-2-dpo-13b-ExPO":1.1714853384,"zephyr-7b-beta-ExPO":1.0204784889,"tulu-2-dpo-7b-ExPO":1.0497814893,"Llama-3-Instruct-8B-SimPO":1.4225744647,"baize-v2-13b":0.6497033227,"guanaco-7b":0.5202924149,"ultralm-13b-v2.0-best-of-16":1.049344706,"claude-2.1_concise":0.8921752289,"openchat-13b":0.8368334957,"tulu-2-dpo-70b":1.1457861368,"deepseek-llm-67b-chat":1.0173843633,"humpback-llama-65b":0.9300866723,"tulu-2-dpo-70b-ExPO":1.3591734083,"TempNet-LLaMA2-Chat-7B-v0.1":0.7210775889,"nous-hermes-13b":0.7081240036,"gpt-3.5-turbo-06
13":1.0371186215,"alpaca-7b_concise":0.4437510224,"baichuan-13b-chat":0.4176985079,"claude-3-5-sonnet-20240620":1.4679655404,"gpt-3.5-turbo-1106":0.8904117512,"minichat-3b":0.5041245962,"Storm-7B":1.4728176781,"oasst-sft-pythia-12b":0.3985580883,"Conifer-7B-DPO":0.9870897936,"Snorkel-Mistral-PairRM-DPO":1.3328273013,"internlm2-chat-20b-ExPO":1.4638315246,"Samba-CoE-v0.2":1.2171089783,"gemini-pro":1.1588503791,"pairrm-tulu-2-70b":1.19249667,"text_davinci_003":0.4346747594,"gpt4":1.2757042012,"Yi-34B-Chat":1.3225712598,"Starling-LM-7B-beta-ExPO":1.3252049543,"pairrm-Yi-34B-Chat":1.3482437399,"gpt4_1106_preview":0.0,"evo-7b":1.0835570389,"zephyr-7b-beta":0.9617876718,"guanaco-13b":0.5518606726,"alpaca-7b":0.4870855383,"internlm2-chat-20b-ppo":1.244366241,"gemma-2b-it":0.538998125,"pairrm-zephyr-7b-beta":1.0535874942,"evo-v2-7b":1.2159901798,"causallm-14b":0.9544127301,"SPPO-Mistral7B-PairRM":1.390800011,"gpt-3.5-turbo-1106_concise":0.8374438114,"openbuddy-llama-65b-v8":0.8871992619,"claude2-alpaca-13b":0.8249428868,"Starling-LM-7B-alpha-ExPO":1.2498324796,"openbuddy-falcon-7b-v6":0.5655836443,"gemma-7b-it":0.7869665732,"phi-2-sft":0.6098271417,"gpt4_gamed":0.6278799634,"llama-2-70b-chat-hf":1.0799847727,"openbuddy-llama2-70b-v10.1":0.8498371494,"wizardlm-70b":1.0395048913,"ultralm-13b-best-of-16":0.9418434059},"n_wins":{"gpt-3.5-turbo-0301":71.0,"gpt-3.5-turbo-1106_verbose":94.0,"vicuna-13b-v1.5-togetherai":53.0,"Qwen1.5-1.8B-Chat":27.0,"recycled-wizardlm-7b-v1.0":53.0,"aligner-2b_claude-3-opus-20240229":225.0,"Qwen1.5-110B-Chat":255.0,"claude-3-opus-20240229":223.0,"llama-2-7b-chat-hf":38.0,"mistral-medium":164.0,"vicuna-33b-v1.3":90.0,"cohere":96.0,"claude-2":131.0,"guanaco-65b":54.0,"Mixtral-8x7B-Instruct-v0.1":135.0,"openchat-v2-w-13b":67.0,"falcon-7b-instruct":16.0,"wizardlm-13b-v1.1":79.0,"Meta-Llama-3-8B-Instruct":176.0,"FsfairX-Zephyr-Chat-v0.1":285.0,"Infinity-Instruct-3M-0613-Mistral-7B":118.0,"Qwen1.5-72B-Chat":201.0,"xwinlm-7b-v0.1":77.0,"Mixtral-8x22B-Inst
ruct-v0.1":174.0,"vicuna-13b-v1.5":48.0,"dbrx-instruct":147.0,"zephyr-7b-alpha":59.0,"tulu-2-dpo-13b":75.0,"Qwen1.5-7B-Chat":80.0,"Together-MoA-Lite":456.0,"cut-13b":83.0,"Meta-Llama-3-70B-Instruct":266.0,"vicuna-13b-v1.3":50.0,"claude-instant-1.2":120.0,"airoboros-65b":67.0,"openbuddy-llama2-13b-v11.1":42.0,"phi-2":15.0,"Together-MoA":490.0,"mistral-large-2402":166.0,"openbuddy-llama-30b-v7.1":47.0,"TempNet-LLaMA2-Chat-70B-v0.1":111.0,"pairrm-tulu-2-13b":110.0,"recycled-wizardlm-7b-v2.0":50.0,"Storm-7B-best-of-64":519.0,"vicuna-7b":28.0,"claude-3-sonnet-20240229":193.0,"Mistral-7B-Instruct-v0.2":113.0,"Samba-CoE-v0.1":124.0,"claude":129.0,"Nanbeige2-8B-Chat":323.0,"REBEL-Llama-3-8B-Instruct":268.0,"chatglm2-6b":19.0,"gpt-4o-2024-05-13":429.0,"gpt4_1106_preview_verbose":525.0,"TempNet-LLaMA2-Chat-13B-v0.1":56.0,"text_davinci_001":23.0,"Mixtral-8x7B-Instruct-v0.1_verbose":194.0,"baize-v2-7b":26.0,"phi-2-dpo":57.0,"alpaca-farm-ppo-human":32.0,"Nanbeige2-16B-Chat":288.0,"gpt4_0613":117.0,"pythia-12b-mix-sft":19.0,"alpaca-7b-neft":22.0,"Qwen1.5-14B-Chat":137.0,"gpt-4-0125-preview":446.0,"guanaco-33b":37.0,"oasst-sft-llama-33b":36.0,"gpt4_0613_verbose":171.0,"llama-2-chat-7b-evol70k-neft":57.0,"gpt35_turbo_instruct":66.0,"platolm-7b":42.0,"llama-2-13b-chat-hf":60.0,"Nanbeige-Plus-Chat-v0.1":456.0,"openchat-v2-13b":56.0,"mistral-orpo-beta":95.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":270.0,"tulu-2-dpo-7b":64.0,"alpaca-7b_verbose":22.0,"OpenHermes-2.5-Mistral-7B":75.0,"claude-2.1_verbose":191.0,"ultralm-13b-v2.0":51.0,"deita-7b-v1.0":96.0,"minichat-1.5-3b":48.0,"Qwen-14B-Chat":57.0,"airoboros-33b":64.0,"alpaca-farm-ppo-sim-gpt4-20k":26.0,"ultralm-13b":38.0,"openbuddy-falcon-40b-v9":45.0,"openchat8192-13b":51.0,"wizardlm-13b":42.0,"vicuna-13b":44.0,"merlinite-7B-AOT":234.0,"gpt4_0314":172.0,"gpt4_0613_concise":71.0,"jina-chat":59.0,"Contextual-KTO-Mistral-PairRM":260.0,"xwinlm-13b-v0.1":129.0,"LMCocktail-10.7B-v1":104.0,"SPPO-Mistral7B-PairRM-ExPO":274.0,"Mixtral-8x7B-Instr
uct-v0.1_concise":105.0,"gpt4_1106_preview_concise":172.0,"Mistral-7B-ReMax-v0.1":120.0,"Llama-3-Instruct-8B-SimPO-ExPO":325.0,"dolphin-2.2.1-mistral-7b":68.0,"humpback-llama2-70b":77.0,"openpipe-moa-gpt-4-turbo-v1":515.0,"vicuna-7b-v1.5":35.0,"Starling-LM-7B-alpha":102.0,"falcon-40b-instruct":27.0,"Samba-CoE-v0.2-best-of-16":201.0,"opencoderplus-15b":52.0,"xwinlm-70b-v0.1":166.0,"wizardlm-13b-v1.2":82.0,"aligner-2b_qwen1.5-72b-chat":180.0,"internlm2-chat-7b-ExPO":209.0,"claude-2.1":115.0,"vicuna-7b-v1.3":31.0,"oasst-rlhf-llama-33b":44.0,"zephyr-7b-alpha-ExPO":79.0,"openchat-v3.1-13b":80.0,"SPPO-Llama-3-Instruct-8B-PairRM":310.0,"minotaur-13b":42.0,"tulu-2-dpo-13b-ExPO":121.0,"zephyr-7b-beta-ExPO":89.0,"tulu-2-dpo-7b-ExPO":91.0,"Llama-3-Instruct-8B-SimPO":319.0,"baize-v2-13b":32.0,"guanaco-7b":21.0,"ultralm-13b-v2.0-best-of-16":98.0,"claude-2.1_concise":72.0,"openchat-13b":58.0,"tulu-2-dpo-70b":119.0,"deepseek-llm-67b-chat":90.0,"humpback-llama-65b":70.0,"tulu-2-dpo-70b-ExPO":184.0,"TempNet-LLaMA2-Chat-7B-v0.1":39.0,"nous-hermes-13b":43.0,"gpt-3.5-turbo-0613":99.0,"alpaca-7b_concise":15.0,"baichuan-13b-chat":14.0,"claude-3-5-sonnet-20240620":312.0,"gpt-3.5-turbo-1106":64.0,"minichat-3b":22.0,"Storm-7B":397.0,"oasst-sft-pythia-12b":13.0,"Conifer-7B-DPO":87.0,"Snorkel-Mistral-PairRM-DPO":231.0,"internlm2-chat-20b-ExPO":375.0,"Samba-CoE-v0.2":159.0,"gemini-pro":135.0,"pairrm-tulu-2-70b":140.0,"text_davinci_003":14.0,"gpt4":179.0,"Yi-34B-Chat":219.0,"Starling-LM-7B-beta-ExPO":225.0,"pairrm-Yi-34B-Chat":239.0,"gpt4_1106_preview":0.0,"evo-7b":112.0,"zephyr-7b-beta":78.0,"guanaco-13b":22.0,"alpaca-7b":17.0,"internlm2-chat-20b-ppo":170.0,"gemma-2b-it":23.0,"pairrm-zephyr-7b-beta":98.0,"evo-v2-7b":158.0,"causallm-14b":81.0,"SPPO-Mistral7B-PairRM":249.0,"gpt-3.5-turbo-1106_concise":57.0,"openbuddy-llama-65b-v8":64.0,"claude2-alpaca-13b":59.0,"Starling-LM-7B-alpha-ExPO":148.0,"openbuddy-falcon-7b-v6":27.0,"gemma-7b-it":50.0,"phi-2-sft":28.0,"gpt4_gamed":32.0,"llama-2-70b-chat-
hf":104.0,"openbuddy-llama2-70b-v10.1":57.0,"wizardlm-70b":106.0,"ultralm-13b-best-of-16":80.0},"n_wins_base":{"gpt-3.5-turbo-0301":733.0,"gpt-3.5-turbo-1106_verbose":709.0,"vicuna-13b-v1.5-togetherai":747.0,"Qwen1.5-1.8B-Chat":774.0,"recycled-wizardlm-7b-v1.0":752.0,"aligner-2b_claude-3-opus-20240229":475.0,"Qwen1.5-110B-Chat":545.0,"claude-3-opus-20240229":579.0,"llama-2-7b-chat-hf":766.0,"mistral-medium":639.0,"vicuna-33b-v1.3":711.0,"cohere":709.0,"claude-2":673.0,"guanaco-65b":751.0,"Mixtral-8x7B-Instruct-v0.1":668.0,"openchat-v2-w-13b":736.0,"falcon-7b-instruct":787.0,"wizardlm-13b-v1.1":723.0,"Meta-Llama-3-8B-Instruct":626.0,"FsfairX-Zephyr-Chat-v0.1":517.0,"Infinity-Instruct-3M-0613-Mistral-7B":687.0,"Qwen1.5-72B-Chat":600.0,"xwinlm-7b-v0.1":727.0,"Mixtral-8x22B-Instruct-v0.1":628.0,"vicuna-13b-v1.5":753.0,"dbrx-instruct":657.0,"zephyr-7b-alpha":745.0,"tulu-2-dpo-13b":728.0,"Qwen1.5-7B-Chat":721.0,"Together-MoA-Lite":347.0,"cut-13b":721.0,"Meta-Llama-3-70B-Instruct":537.0,"vicuna-13b-v1.3":751.0,"claude-instant-1.2":682.0,"airoboros-65b":735.0,"openbuddy-llama2-13b-v11.1":761.0,"phi-2":785.0,"Together-MoA":314.0,"mistral-large-2402":638.0,"openbuddy-llama-30b-v7.1":755.0,"TempNet-LLaMA2-Chat-70B-v0.1":691.0,"pairrm-tulu-2-13b":694.0,"recycled-wizardlm-7b-v2.0":755.0,"Storm-7B-best-of-64":286.0,"vicuna-7b":775.0,"claude-3-sonnet-20240229":608.0,"Mistral-7B-Instruct-v0.2":691.0,"Samba-CoE-v0.1":680.0,"claude":676.0,"Nanbeige2-8B-Chat":480.0,"REBEL-Llama-3-8B-Instruct":537.0,"chatglm2-6b":781.0,"gpt-4o-2024-05-13":369.0,"gpt4_1106_preview_verbose":268.0,"TempNet-LLaMA2-Chat-13B-v0.1":749.0,"text_davinci_001":777.0,"Mixtral-8x7B-Instruct-v0.1_verbose":609.0,"baize-v2-7b":779.0,"phi-2-dpo":748.0,"alpaca-farm-ppo-human":770.0,"Nanbeige2-16B-Chat":514.0,"gpt4_0613":684.0,"pythia-12b-mix-sft":786.0,"alpaca-7b-neft":783.0,"Qwen1.5-14B-Chat":664.0,"gpt-4-0125-preview":347.0,"guanaco-33b":768.0,"oasst-sft-llama-33b":764.0,"gpt4_0613_verbose":630.0,"llama-2-chat-7b-evol
70k-neft":748.0,"gpt35_turbo_instruct":735.0,"platolm-7b":759.0,"llama-2-13b-chat-hf":744.0,"Nanbeige-Plus-Chat-v0.1":347.0,"openchat-v2-13b":746.0,"mistral-orpo-beta":707.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":533.0,"tulu-2-dpo-7b":740.0,"alpaca-7b_verbose":778.0,"OpenHermes-2.5-Mistral-7B":727.0,"claude-2.1_verbose":613.0,"ultralm-13b-v2.0":754.0,"deita-7b-v1.0":708.0,"minichat-1.5-3b":757.0,"Qwen-14B-Chat":742.0,"airoboros-33b":740.0,"alpaca-farm-ppo-sim-gpt4-20k":776.0,"ultralm-13b":765.0,"openbuddy-falcon-40b-v9":758.0,"openchat8192-13b":754.0,"wizardlm-13b":759.0,"vicuna-13b":759.0,"merlinite-7B-AOT":571.0,"gpt4_0314":627.0,"gpt4_0613_concise":729.0,"jina-chat":743.0,"Contextual-KTO-Mistral-PairRM":544.0,"xwinlm-13b-v0.1":672.0,"LMCocktail-10.7B-v1":700.0,"SPPO-Mistral7B-PairRM-ExPO":531.0,"Mixtral-8x7B-Instruct-v0.1_concise":700.0,"gpt4_1106_preview_concise":622.0,"Mistral-7B-ReMax-v0.1":683.0,"Llama-3-Instruct-8B-SimPO-ExPO":479.0,"dolphin-2.2.1-mistral-7b":734.0,"humpback-llama2-70b":727.0,"openpipe-moa-gpt-4-turbo-v1":283.0,"vicuna-7b-v1.5":767.0,"Starling-LM-7B-alpha":702.0,"falcon-40b-instruct":777.0,"Samba-CoE-v0.2-best-of-16":601.0,"opencoderplus-15b":750.0,"xwinlm-70b-v0.1":635.0,"wizardlm-13b-v1.2":720.0,"aligner-2b_qwen1.5-72b-chat":473.0,"internlm2-chat-7b-ExPO":595.0,"claude-2.1":688.0,"vicuna-7b-v1.3":771.0,"oasst-rlhf-llama-33b":759.0,"zephyr-7b-alpha-ExPO":725.0,"openchat-v3.1-13b":720.0,"SPPO-Llama-3-Instruct-8B-PairRM":494.0,"minotaur-13b":758.0,"tulu-2-dpo-13b-ExPO":679.0,"zephyr-7b-beta-ExPO":716.0,"tulu-2-dpo-7b-ExPO":714.0,"Llama-3-Instruct-8B-SimPO":485.0,"baize-v2-13b":770.0,"guanaco-7b":783.0,"ultralm-13b-v2.0-best-of-16":705.0,"claude-2.1_concise":730.0,"openchat-13b":746.0,"tulu-2-dpo-70b":683.0,"deepseek-llm-67b-chat":713.0,"humpback-llama-65b":734.0,"tulu-2-dpo-70b-ExPO":620.0,"TempNet-LLaMA2-Chat-7B-v0.1":765.0,"nous-hermes-13b":761.0,"gpt-3.5-turbo-0613":700.0,"alpaca-7b_concise":787.0,"baichuan-13b-chat":790.0,"claude-3-5-son
net-20240620":493.0,"gpt-3.5-turbo-1106":737.0,"minichat-3b":779.0,"Storm-7B":408.0,"oasst-sft-pythia-12b":790.0,"Conifer-7B-DPO":717.0,"Snorkel-Mistral-PairRM-DPO":572.0,"internlm2-chat-20b-ExPO":430.0,"Samba-CoE-v0.2":645.0,"gemini-pro":665.0,"pairrm-tulu-2-70b":665.0,"text_davinci_003":787.0,"gpt4":618.0,"Yi-34B-Chat":582.0,"Starling-LM-7B-beta-ExPO":580.0,"pairrm-Yi-34B-Chat":563.0,"gpt4_1106_preview":0.0,"evo-7b":689.0,"zephyr-7b-beta":725.0,"guanaco-13b":780.0,"alpaca-7b":785.0,"internlm2-chat-20b-ppo":632.0,"gemma-2b-it":782.0,"pairrm-zephyr-7b-beta":706.0,"evo-v2-7b":644.0,"causallm-14b":720.0,"SPPO-Mistral7B-PairRM":556.0,"gpt-3.5-turbo-1106_concise":744.0,"openbuddy-llama-65b-v8":738.0,"claude2-alpaca-13b":746.0,"Starling-LM-7B-alpha-ExPO":657.0,"openbuddy-falcon-7b-v6":778.0,"gemma-7b-it":754.0,"phi-2-sft":777.0,"gpt4_gamed":771.0,"llama-2-70b-chat-hf":700.0,"openbuddy-llama2-70b-v10.1":744.0,"wizardlm-70b":697.0,"ultralm-13b-best-of-16":723.0},"n_draws":{"gpt-3.5-turbo-0301":1.0,"gpt-3.5-turbo-1106_verbose":2.0,"vicuna-13b-v1.5-togetherai":5.0,"Qwen1.5-1.8B-Chat":3.0,"recycled-wizardlm-7b-v1.0":0.0,"aligner-2b_claude-3-opus-20240229":105.0,"Qwen1.5-110B-Chat":5.0,"claude-3-opus-20240229":3.0,"llama-2-7b-chat-hf":1.0,"mistral-medium":2.0,"vicuna-33b-v1.3":4.0,"cohere":0.0,"claude-2":1.0,"guanaco-65b":0.0,"Mixtral-8x7B-Instruct-v0.1":2.0,"openchat-v2-w-13b":2.0,"falcon-7b-instruct":2.0,"wizardlm-13b-v1.1":3.0,"Meta-Llama-3-8B-Instruct":3.0,"FsfairX-Zephyr-Chat-v0.1":3.0,"Infinity-Instruct-3M-0613-Mistral-7B":0.0,"Qwen1.5-72B-Chat":4.0,"xwinlm-7b-v0.1":1.0,"Mixtral-8x22B-Instruct-v0.1":3.0,"vicuna-13b-v1.5":4.0,"dbrx-instruct":1.0,"zephyr-7b-alpha":1.0,"tulu-2-dpo-13b":2.0,"Qwen1.5-7B-Chat":4.0,"Together-MoA-Lite":2.0,"cut-13b":1.0,"Meta-Llama-3-70B-Instruct":2.0,"vicuna-13b-v1.3":4.0,"claude-instant-1.2":3.0,"airoboros-65b":3.0,"openbuddy-llama2-13b-v11.1":2.0,"phi-2":3.0,"Together-MoA":1.0,"mistral-large-2402":1.0,"openbuddy-llama-30b-v7.1":3.0,"TempNet-L
LaMA2-Chat-70B-v0.1":2.0,"pairrm-tulu-2-13b":1.0,"recycled-wizardlm-7b-v2.0":0.0,"Storm-7B-best-of-64":0.0,"vicuna-7b":2.0,"claude-3-sonnet-20240229":4.0,"Mistral-7B-Instruct-v0.2":1.0,"Samba-CoE-v0.1":1.0,"claude":0.0,"Nanbeige2-8B-Chat":2.0,"REBEL-Llama-3-8B-Instruct":0.0,"chatglm2-6b":5.0,"gpt-4o-2024-05-13":7.0,"gpt4_1106_preview_verbose":12.0,"TempNet-LLaMA2-Chat-13B-v0.1":0.0,"text_davinci_001":3.0,"Mixtral-8x7B-Instruct-v0.1_verbose":2.0,"baize-v2-7b":0.0,"phi-2-dpo":0.0,"alpaca-farm-ppo-human":3.0,"Nanbeige2-16B-Chat":3.0,"gpt4_0613":4.0,"pythia-12b-mix-sft":0.0,"alpaca-7b-neft":0.0,"Qwen1.5-14B-Chat":4.0,"gpt-4-0125-preview":12.0,"guanaco-33b":0.0,"oasst-sft-llama-33b":5.0,"gpt4_0613_verbose":4.0,"llama-2-chat-7b-evol70k-neft":0.0,"gpt35_turbo_instruct":3.0,"platolm-7b":2.0,"llama-2-13b-chat-hf":1.0,"Nanbeige-Plus-Chat-v0.1":2.0,"openchat-v2-13b":3.0,"mistral-orpo-beta":3.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":2.0,"tulu-2-dpo-7b":1.0,"alpaca-7b_verbose":2.0,"OpenHermes-2.5-Mistral-7B":3.0,"claude-2.1_verbose":1.0,"ultralm-13b-v2.0":0.0,"deita-7b-v1.0":1.0,"minichat-1.5-3b":0.0,"Qwen-14B-Chat":6.0,"airoboros-33b":1.0,"alpaca-farm-ppo-sim-gpt4-20k":3.0,"ultralm-13b":2.0,"openbuddy-falcon-40b-v9":2.0,"openchat8192-13b":0.0,"wizardlm-13b":4.0,"vicuna-13b":2.0,"merlinite-7B-AOT":0.0,"gpt4_0314":6.0,"gpt4_0613_concise":5.0,"jina-chat":3.0,"Contextual-KTO-Mistral-PairRM":1.0,"xwinlm-13b-v0.1":4.0,"LMCocktail-10.7B-v1":1.0,"SPPO-Mistral7B-PairRM-ExPO":0.0,"Mixtral-8x7B-Instruct-v0.1_concise":0.0,"gpt4_1106_preview_concise":11.0,"Mistral-7B-ReMax-v0.1":2.0,"Llama-3-Instruct-8B-SimPO-ExPO":1.0,"dolphin-2.2.1-mistral-7b":3.0,"humpback-llama2-70b":1.0,"openpipe-moa-gpt-4-turbo-v1":7.0,"vicuna-7b-v1.5":3.0,"Starling-LM-7B-alpha":1.0,"falcon-40b-instruct":1.0,"Samba-CoE-v0.2-best-of-16":3.0,"opencoderplus-15b":3.0,"xwinlm-70b-v0.1":4.0,"wizardlm-13b-v1.2":3.0,"aligner-2b_qwen1.5-72b-chat":152.0,"internlm2-chat-7b-ExPO":1.0,"claude-2.1":2.0,"vicuna-7b-v1.3":3.0,"oasst-
rlhf-llama-33b":2.0,"zephyr-7b-alpha-ExPO":1.0,"openchat-v3.1-13b":5.0,"SPPO-Llama-3-Instruct-8B-PairRM":1.0,"minotaur-13b":4.0,"tulu-2-dpo-13b-ExPO":5.0,"zephyr-7b-beta-ExPO":0.0,"tulu-2-dpo-7b-ExPO":0.0,"Llama-3-Instruct-8B-SimPO":1.0,"baize-v2-13b":3.0,"guanaco-7b":1.0,"ultralm-13b-v2.0-best-of-16":2.0,"claude-2.1_concise":3.0,"openchat-13b":1.0,"tulu-2-dpo-70b":3.0,"deepseek-llm-67b-chat":2.0,"humpback-llama-65b":1.0,"tulu-2-dpo-70b-ExPO":1.0,"TempNet-LLaMA2-Chat-7B-v0.1":1.0,"nous-hermes-13b":1.0,"gpt-3.5-turbo-0613":6.0,"alpaca-7b_concise":2.0,"baichuan-13b-chat":1.0,"claude-3-5-sonnet-20240620":0.0,"gpt-3.5-turbo-1106":4.0,"minichat-3b":4.0,"Storm-7B":0.0,"oasst-sft-pythia-12b":2.0,"Conifer-7B-DPO":1.0,"Snorkel-Mistral-PairRM-DPO":1.0,"internlm2-chat-20b-ExPO":0.0,"Samba-CoE-v0.2":1.0,"gemini-pro":5.0,"pairrm-tulu-2-70b":0.0,"text_davinci_003":4.0,"gpt4":8.0,"Yi-34B-Chat":4.0,"Starling-LM-7B-beta-ExPO":0.0,"pairrm-Yi-34B-Chat":3.0,"gpt4_1106_preview":805.0,"evo-7b":4.0,"zephyr-7b-beta":2.0,"guanaco-13b":3.0,"alpaca-7b":3.0,"internlm2-chat-20b-ppo":3.0,"gemma-2b-it":0.0,"pairrm-zephyr-7b-beta":1.0,"evo-v2-7b":3.0,"causallm-14b":4.0,"SPPO-Mistral7B-PairRM":0.0,"gpt-3.5-turbo-1106_concise":4.0,"openbuddy-llama-65b-v8":3.0,"claude2-alpaca-13b":0.0,"Starling-LM-7B-alpha-ExPO":0.0,"openbuddy-falcon-7b-v6":0.0,"gemma-7b-it":1.0,"phi-2-sft":0.0,"gpt4_gamed":2.0,"llama-2-70b-chat-hf":0.0,"openbuddy-llama2-70b-v10.1":4.0,"wizardlm-70b":2.0,"ultralm-13b-best-of-16":2.0},"n_total":{"gpt-3.5-turbo-0301":805.0,"gpt-3.5-turbo-1106_verbose":805.0,"vicuna-13b-v1.5-togetherai":805.0,"Qwen1.5-1.8B-Chat":804.0,"recycled-wizardlm-7b-v1.0":805.0,"aligner-2b_claude-3-opus-20240229":805.0,"Qwen1.5-110B-Chat":805.0,"claude-3-opus-20240229":805.0,"llama-2-7b-chat-hf":805.0,"mistral-medium":805.0,"vicuna-33b-v1.3":805.0,"cohere":805.0,"claude-2":805.0,"guanaco-65b":805.0,"Mixtral-8x7B-Instruct-v0.1":805.0,"openchat-v2-w-13b":805.0,"falcon-7b-instruct":805.0,"wizardlm-13b-v1.1":805.0,"M
eta-Llama-3-8B-Instruct":805.0,"FsfairX-Zephyr-Chat-v0.1":805.0,"Infinity-Instruct-3M-0613-Mistral-7B":805.0,"Qwen1.5-72B-Chat":805.0,"xwinlm-7b-v0.1":805.0,"Mixtral-8x22B-Instruct-v0.1":805.0,"vicuna-13b-v1.5":805.0,"dbrx-instruct":805.0,"zephyr-7b-alpha":805.0,"tulu-2-dpo-13b":805.0,"Qwen1.5-7B-Chat":805.0,"Together-MoA-Lite":805.0,"cut-13b":805.0,"Meta-Llama-3-70B-Instruct":805.0,"vicuna-13b-v1.3":805.0,"claude-instant-1.2":805.0,"airoboros-65b":805.0,"openbuddy-llama2-13b-v11.1":805.0,"phi-2":803.0,"Together-MoA":805.0,"mistral-large-2402":805.0,"openbuddy-llama-30b-v7.1":805.0,"TempNet-LLaMA2-Chat-70B-v0.1":804.0,"pairrm-tulu-2-13b":805.0,"recycled-wizardlm-7b-v2.0":805.0,"Storm-7B-best-of-64":805.0,"vicuna-7b":805.0,"claude-3-sonnet-20240229":805.0,"Mistral-7B-Instruct-v0.2":805.0,"Samba-CoE-v0.1":805.0,"claude":805.0,"Nanbeige2-8B-Chat":805.0,"REBEL-Llama-3-8B-Instruct":805.0,"chatglm2-6b":805.0,"gpt-4o-2024-05-13":805.0,"gpt4_1106_preview_verbose":805.0,"TempNet-LLaMA2-Chat-13B-v0.1":805.0,"text_davinci_001":803.0,"Mixtral-8x7B-Instruct-v0.1_verbose":805.0,"baize-v2-7b":805.0,"phi-2-dpo":805.0,"alpaca-farm-ppo-human":805.0,"Nanbeige2-16B-Chat":805.0,"gpt4_0613":805.0,"pythia-12b-mix-sft":805.0,"alpaca-7b-neft":805.0,"Qwen1.5-14B-Chat":805.0,"gpt-4-0125-preview":805.0,"guanaco-33b":805.0,"oasst-sft-llama-33b":805.0,"gpt4_0613_verbose":805.0,"llama-2-chat-7b-evol70k-neft":805.0,"gpt35_turbo_instruct":804.0,"platolm-7b":803.0,"llama-2-13b-chat-hf":805.0,"Nanbeige-Plus-Chat-v0.1":805.0,"openchat-v2-13b":805.0,"mistral-orpo-beta":805.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":805.0,"tulu-2-dpo-7b":805.0,"alpaca-7b_verbose":802.0,"OpenHermes-2.5-Mistral-7B":805.0,"claude-2.1_verbose":805.0,"ultralm-13b-v2.0":805.0,"deita-7b-v1.0":805.0,"minichat-1.5-3b":805.0,"Qwen-14B-Chat":805.0,"airoboros-33b":805.0,"alpaca-farm-ppo-sim-gpt4-20k":805.0,"ultralm-13b":805.0,"openbuddy-falcon-40b-v9":805.0,"openchat8192-13b":805.0,"wizardlm-13b":805.0,"vicuna-13b":805.0,"merlinite-
7B-AOT":805.0,"gpt4_0314":805.0,"gpt4_0613_concise":805.0,"jina-chat":805.0,"Contextual-KTO-Mistral-PairRM":805.0,"xwinlm-13b-v0.1":805.0,"LMCocktail-10.7B-v1":805.0,"SPPO-Mistral7B-PairRM-ExPO":805.0,"Mixtral-8x7B-Instruct-v0.1_concise":805.0,"gpt4_1106_preview_concise":805.0,"Mistral-7B-ReMax-v0.1":805.0,"Llama-3-Instruct-8B-SimPO-ExPO":805.0,"dolphin-2.2.1-mistral-7b":805.0,"humpback-llama2-70b":805.0,"openpipe-moa-gpt-4-turbo-v1":805.0,"vicuna-7b-v1.5":805.0,"Starling-LM-7B-alpha":805.0,"falcon-40b-instruct":805.0,"Samba-CoE-v0.2-best-of-16":805.0,"opencoderplus-15b":805.0,"xwinlm-70b-v0.1":805.0,"wizardlm-13b-v1.2":805.0,"aligner-2b_qwen1.5-72b-chat":805.0,"internlm2-chat-7b-ExPO":805.0,"claude-2.1":805.0,"vicuna-7b-v1.3":805.0,"oasst-rlhf-llama-33b":805.0,"zephyr-7b-alpha-ExPO":805.0,"openchat-v3.1-13b":805.0,"SPPO-Llama-3-Instruct-8B-PairRM":805.0,"minotaur-13b":804.0,"tulu-2-dpo-13b-ExPO":805.0,"zephyr-7b-beta-ExPO":805.0,"tulu-2-dpo-7b-ExPO":805.0,"Llama-3-Instruct-8B-SimPO":805.0,"baize-v2-13b":805.0,"guanaco-7b":805.0,"ultralm-13b-v2.0-best-of-16":805.0,"claude-2.1_concise":805.0,"openchat-13b":805.0,"tulu-2-dpo-70b":805.0,"deepseek-llm-67b-chat":805.0,"humpback-llama-65b":805.0,"tulu-2-dpo-70b-ExPO":805.0,"TempNet-LLaMA2-Chat-7B-v0.1":805.0,"nous-hermes-13b":805.0,"gpt-3.5-turbo-0613":805.0,"alpaca-7b_concise":804.0,"baichuan-13b-chat":805.0,"claude-3-5-sonnet-20240620":805.0,"gpt-3.5-turbo-1106":805.0,"minichat-3b":805.0,"Storm-7B":805.0,"oasst-sft-pythia-12b":805.0,"Conifer-7B-DPO":805.0,"Snorkel-Mistral-PairRM-DPO":804.0,"internlm2-chat-20b-ExPO":805.0,"Samba-CoE-v0.2":805.0,"gemini-pro":805.0,"pairrm-tulu-2-70b":805.0,"text_davinci_003":805.0,"gpt4":805.0,"Yi-34B-Chat":805.0,"Starling-LM-7B-beta-ExPO":805.0,"pairrm-Yi-34B-Chat":805.0,"gpt4_1106_preview":805.0,"evo-7b":805.0,"zephyr-7b-beta":805.0,"guanaco-13b":805.0,"alpaca-7b":805.0,"internlm2-chat-20b-ppo":805.0,"gemma-2b-it":805.0,"pairrm-zephyr-7b-beta":805.0,"evo-v2-7b":805.0,"causallm-14b":805.
0,"SPPO-Mistral7B-PairRM":805.0,"gpt-3.5-turbo-1106_concise":805.0,"openbuddy-llama-65b-v8":805.0,"claude2-alpaca-13b":805.0,"Starling-LM-7B-alpha-ExPO":805.0,"openbuddy-falcon-7b-v6":805.0,"gemma-7b-it":805.0,"phi-2-sft":805.0,"gpt4_gamed":805.0,"llama-2-70b-chat-hf":804.0,"openbuddy-llama2-70b-v10.1":805.0,"wizardlm-70b":805.0,"ultralm-13b-best-of-16":805.0},"discrete_win_rate":{"gpt-3.5-turbo-0301":8.8819875776,"gpt-3.5-turbo-1106_verbose":11.801242236,"vicuna-13b-v1.5-togetherai":6.8944099379,"Qwen1.5-1.8B-Chat":3.5447761194,"recycled-wizardlm-7b-v1.0":6.5838509317,"aligner-2b_claude-3-opus-20240229":34.4720496894,"Qwen1.5-110B-Chat":31.9875776398,"claude-3-opus-20240229":27.8881987578,"llama-2-7b-chat-hf":4.7826086957,"mistral-medium":20.4968944099,"vicuna-33b-v1.3":11.4285714286,"cohere":11.9254658385,"claude-2":16.3354037267,"guanaco-65b":6.7080745342,"Mixtral-8x7B-Instruct-v0.1":16.8944099379,"openchat-v2-w-13b":8.4472049689,"falcon-7b-instruct":2.1118012422,"wizardlm-13b-v1.1":10.0,"Meta-Llama-3-8B-Instruct":22.049689441,"FsfairX-Zephyr-Chat-v0.1":35.5900621118,"Infinity-Instruct-3M-0613-Mistral-7B":14.6583850932,"Qwen1.5-72B-Chat":25.2173913043,"xwinlm-7b-v0.1":9.6273291925,"Mixtral-8x22B-Instruct-v0.1":21.801242236,"vicuna-13b-v1.5":6.2111801242,"dbrx-instruct":18.3229813665,"zephyr-7b-alpha":7.3913043478,"tulu-2-dpo-13b":9.4409937888,"Qwen1.5-7B-Chat":10.1863354037,"Together-MoA-Lite":56.7701863354,"cut-13b":10.3726708075,"Meta-Llama-3-70B-Instruct":33.1677018634,"vicuna-13b-v1.3":6.4596273292,"claude-instant-1.2":15.0931677019,"airoboros-65b":8.5093167702,"openbuddy-llama2-13b-v11.1":5.3416149068,"phi-2":2.0547945205,"Together-MoA":60.9316770186,"mistral-large-2402":20.6832298137,"openbuddy-llama-30b-v7.1":6.0248447205,"TempNet-LLaMA2-Chat-70B-v0.1":13.9303482587,"pairrm-tulu-2-13b":13.7267080745,"recycled-wizardlm-7b-v2.0":6.2111801242,"Storm-7B-best-of-64":64.4720496894,"vicuna-7b":3.602484472,"claude-3-sonnet-20240229":24.2236024845,"Mistral-7B-Instr
uct-v0.2":14.099378882,"Samba-CoE-v0.1":15.4658385093,"claude":16.0248447205,"Nanbeige2-8B-Chat":40.248447205,"REBEL-Llama-3-8B-Instruct":33.2919254658,"chatglm2-6b":2.6708074534,"gpt-4o-2024-05-13":53.7267080745,"gpt4_1106_preview_verbose":65.9627329193,"TempNet-LLaMA2-Chat-13B-v0.1":6.9565217391,"text_davinci_001":3.0510585305,"Mixtral-8x7B-Instruct-v0.1_verbose":24.2236024845,"baize-v2-7b":3.2298136646,"phi-2-dpo":7.0807453416,"alpaca-farm-ppo-human":4.1614906832,"Nanbeige2-16B-Chat":35.9627329193,"gpt4_0613":14.7826086957,"pythia-12b-mix-sft":2.3602484472,"alpaca-7b-neft":2.7329192547,"Qwen1.5-14B-Chat":17.2670807453,"gpt-4-0125-preview":56.149068323,"guanaco-33b":4.5962732919,"oasst-sft-llama-33b":4.7826086957,"gpt4_0613_verbose":21.4906832298,"llama-2-chat-7b-evol70k-neft":7.0807453416,"gpt35_turbo_instruct":8.3955223881,"platolm-7b":5.3549190535,"llama-2-13b-chat-hf":7.5155279503,"Nanbeige-Plus-Chat-v0.1":56.7701863354,"openchat-v2-13b":7.1428571429,"mistral-orpo-beta":11.9875776398,"Snorkel-Mistral-PairRM-DPO-best-of-16":33.6645962733,"tulu-2-dpo-7b":8.0124223602,"alpaca-7b_verbose":2.8678304239,"OpenHermes-2.5-Mistral-7B":9.5031055901,"claude-2.1_verbose":23.7888198758,"ultralm-13b-v2.0":6.3354037267,"deita-7b-v1.0":11.9875776398,"minichat-1.5-3b":5.9627329193,"Qwen-14B-Chat":7.4534161491,"airoboros-33b":8.0124223602,"alpaca-farm-ppo-sim-gpt4-20k":3.4161490683,"ultralm-13b":4.8447204969,"openbuddy-falcon-40b-v9":5.7142857143,"openchat8192-13b":6.3354037267,"wizardlm-13b":5.4658385093,"vicuna-13b":5.5900621118,"merlinite-7B-AOT":29.0683229814,"gpt4_0314":21.7391304348,"gpt4_0613_concise":9.1304347826,"jina-chat":7.5155279503,"Contextual-KTO-Mistral-PairRM":32.3602484472,"xwinlm-13b-v0.1":16.2732919255,"LMCocktail-10.7B-v1":12.9813664596,"SPPO-Mistral7B-PairRM-ExPO":34.0372670807,"Mixtral-8x7B-Instruct-v0.1_concise":13.0434782609,"gpt4_1106_preview_concise":22.049689441,"Mistral-7B-ReMax-v0.1":15.0310559006,"Llama-3-Instruct-8B-SimPO-ExPO":40.4347826087,"dolp
hin-2.2.1-mistral-7b":8.6335403727,"humpback-llama2-70b":9.6273291925,"openpipe-moa-gpt-4-turbo-v1":64.4099378882,"vicuna-7b-v1.5":4.5341614907,"Starling-LM-7B-alpha":12.7329192547,"falcon-40b-instruct":3.4161490683,"Samba-CoE-v0.2-best-of-16":25.1552795031,"opencoderplus-15b":6.6459627329,"xwinlm-70b-v0.1":20.8695652174,"wizardlm-13b-v1.2":10.3726708075,"aligner-2b_qwen1.5-72b-chat":31.801242236,"internlm2-chat-7b-ExPO":26.0248447205,"claude-2.1":14.4099378882,"vicuna-7b-v1.3":4.0372670807,"oasst-rlhf-llama-33b":5.5900621118,"zephyr-7b-alpha-ExPO":9.8757763975,"openchat-v3.1-13b":10.248447205,"SPPO-Llama-3-Instruct-8B-PairRM":38.5714285714,"minotaur-13b":5.4726368159,"tulu-2-dpo-13b-ExPO":15.3416149068,"zephyr-7b-beta-ExPO":11.0559006211,"tulu-2-dpo-7b-ExPO":11.3043478261,"Llama-3-Instruct-8B-SimPO":39.6894409938,"baize-v2-13b":4.1614906832,"guanaco-7b":2.6708074534,"ultralm-13b-v2.0-best-of-16":12.298136646,"claude-2.1_concise":9.1304347826,"openchat-13b":7.2670807453,"tulu-2-dpo-70b":14.9689440994,"deepseek-llm-67b-chat":11.3043478261,"humpback-llama-65b":8.7577639752,"tulu-2-dpo-70b-ExPO":22.9192546584,"TempNet-LLaMA2-Chat-7B-v0.1":4.9068322981,"nous-hermes-13b":5.4037267081,"gpt-3.5-turbo-0613":12.6708074534,"alpaca-7b_concise":1.9900497512,"baichuan-13b-chat":1.801242236,"claude-3-5-sonnet-20240620":38.7577639752,"gpt-3.5-turbo-1106":8.198757764,"minichat-3b":2.9813664596,"Storm-7B":49.3167701863,"oasst-sft-pythia-12b":1.7391304348,"Conifer-7B-DPO":10.8695652174,"Snorkel-Mistral-PairRM-DPO":28.7935323383,"internlm2-chat-20b-ExPO":46.5838509317,"Samba-CoE-v0.2":19.8136645963,"gemini-pro":17.0807453416,"pairrm-tulu-2-70b":17.3913043478,"text_davinci_003":1.9875776398,"gpt4":22.7329192547,"Yi-34B-Chat":27.4534161491,"Starling-LM-7B-beta-ExPO":27.950310559,"pairrm-Yi-34B-Chat":29.8757763975,"gpt4_1106_preview":50.0,"evo-7b":14.1614906832,"zephyr-7b-beta":9.8136645963,"guanaco-13b":2.9192546584,"alpaca-7b":2.298136646,"internlm2-chat-20b-ppo":21.3043478261,"gemma-2
b-it":2.8571428571,"pairrm-zephyr-7b-beta":12.2360248447,"evo-v2-7b":19.8136645963,"causallm-14b":10.3105590062,"SPPO-Mistral7B-PairRM":30.9316770186,"gpt-3.5-turbo-1106_concise":7.3291925466,"openbuddy-llama-65b-v8":8.1366459627,"claude2-alpaca-13b":7.3291925466,"Starling-LM-7B-alpha-ExPO":18.3850931677,"openbuddy-falcon-7b-v6":3.3540372671,"gemma-7b-it":6.2732919255,"phi-2-sft":3.4782608696,"gpt4_gamed":4.099378882,"llama-2-70b-chat-hf":12.9353233831,"openbuddy-llama2-70b-v10.1":7.3291925466,"wizardlm-70b":13.2919254658,"ultralm-13b-best-of-16":10.0621118012},"length_controlled_winrate":{"gpt-3.5-turbo-0301":18.093241552,"gpt-3.5-turbo-1106_verbose":22.0009370217,"vicuna-13b-v1.5-togetherai":11.6853569655,"Qwen1.5-1.8B-Chat":2.5884988492,"recycled-wizardlm-7b-v1.0":6.901477322,"aligner-2b_claude-3-opus-20240229":41.8230717152,"Qwen1.5-110B-Chat":43.905552211,"claude-3-opus-20240229":40.5095080124,"llama-2-7b-chat-hf":5.3548212795,"mistral-medium":28.6143374017,"vicuna-33b-v1.3":17.5745753109,"cohere":10.8930208866,"claude-2":28.1551961416,"guanaco-65b":8.2529169916,"Mixtral-8x7B-Instruct-v0.1":23.6884826013,"openchat-v2-w-13b":12.030427771,"falcon-7b-instruct":4.0369375668,"wizardlm-13b-v1.1":13.9157205928,"Meta-Llama-3-8B-Instruct":22.9187846731,"FsfairX-Zephyr-Chat-v0.1":34.787447623,"Infinity-Instruct-3M-0613-Mistral-7B":25.5015577947,"Qwen1.5-72B-Chat":36.571754112,"xwinlm-7b-v0.1":10.8122056273,"Mixtral-8x22B-Instruct-v0.1":30.8788102941,"vicuna-13b-v1.5":10.4844382985,"dbrx-instruct":25.1853410397,"zephyr-7b-alpha":10.2897608887,"tulu-2-dpo-13b":11.5544794281,"Qwen1.5-7B-Chat":14.7484310443,"Together-MoA-Lite":59.1415240989,"cut-13b":12.1547817539,"Meta-Llama-3-70B-Instruct":34.4245971745,"vicuna-13b-v1.3":10.8431649437,"claude-instant-1.2":25.6122590254,"airoboros-65b":11.0076424064,"openbuddy-llama2-13b-v11.1":9.159089775,"phi-2":4.3986822709,"Together-MoA":65.3799697685,"mistral-large-2402":32.6520799853,"openbuddy-llama-30b-v7.1":10.2144949912,"TempNet-L
LaMA2-Chat-70B-v0.1":15.8311627784,"pairrm-tulu-2-13b":17.405203698,"recycled-wizardlm-7b-v2.0":7.5216099553,"Storm-7B-best-of-64":61.637895572,"vicuna-7b":6.2772177385,"claude-3-sonnet-20240229":34.8724743624,"Mistral-7B-Instruct-v0.2":17.111251846,"Samba-CoE-v0.1":22.8658373348,"claude":27.2895044437,"Nanbeige2-8B-Chat":25.2417704867,"REBEL-Llama-3-8B-Instruct":31.4699427971,"chatglm2-6b":4.3592829268,"gpt-4o-2024-05-13":57.4568288333,"gpt4_1106_preview_verbose":51.5750079797,"TempNet-LLaMA2-Chat-13B-v0.1":8.5783553109,"text_davinci_001":9.0257288521,"Mixtral-8x7B-Instruct-v0.1_verbose":23.2231207809,"baize-v2-7b":4.382564905,"phi-2-dpo":7.7708946203,"alpaca-farm-ppo-human":6.4186032949,"Nanbeige2-16B-Chat":40.5912863493,"gpt4_0613":30.1833223167,"pythia-12b-mix-sft":4.2213618614,"alpaca-7b-neft":3.5091458375,"Qwen1.5-14B-Chat":23.8966467702,"gpt-4-0125-preview":56.3562938462,"guanaco-33b":5.6900190909,"oasst-sft-llama-33b":9.8664121438,"gpt4_0613_verbose":33.8212668866,"llama-2-chat-7b-evol70k-neft":7.5330526555,"gpt35_turbo_instruct":17.7278010829,"platolm-7b":10.5434020728,"llama-2-13b-chat-hf":8.4360145489,"Nanbeige-Plus-Chat-v0.1":44.4596624034,"openchat-v2-13b":10.3996073385,"mistral-orpo-beta":14.7167494307,"Snorkel-Mistral-PairRM-DPO-best-of-16":29.9743216131,"tulu-2-dpo-7b":9.2002656115,"alpaca-7b_verbose":6.8163068164,"OpenHermes-2.5-Mistral-7B":16.2485776967,"claude-2.1_verbose":30.2911791666,"ultralm-13b-v2.0":9.1290184442,"deita-7b-v1.0":16.0590135397,"minichat-1.5-3b":7.7016328215,"Qwen-14B-Chat":12.3787417907,"airoboros-33b":10.7190026781,"alpaca-farm-ppo-sim-gpt4-20k":7.1218081016,"ultralm-13b":7.1081913613,"openbuddy-falcon-40b-v9":8.9889364779,"openchat8192-13b":7.8970617346,"wizardlm-13b":9.8281507688,"vicuna-13b":9.2220600237,"merlinite-7B-AOT":31.721885287,"gpt4_0314":35.3070612164,"gpt4_0613_concise":21.5779909145,"jina-chat":15.8660040495,"Contextual-KTO-Mistral-PairRM":29.7058089397,"xwinlm-13b-v0.1":17.9189378982,"LMCocktail-10.7B-v1":18.9
507103867,"SPPO-Mistral7B-PairRM-ExPO":31.9003876312,"Mixtral-8x7B-Instruct-v0.1_concise":22.9626094728,"gpt4_1106_preview_concise":41.8966015912,"Mistral-7B-ReMax-v0.1":20.5513677023,"Llama-3-Instruct-8B-SimPO-ExPO":45.807978034,"dolphin-2.2.1-mistral-7b":13.1214776504,"humpback-llama2-70b":16.2491642314,"openpipe-moa-gpt-4-turbo-v1":68.3786625033,"vicuna-7b-v1.5":7.6168927319,"Starling-LM-7B-alpha":14.6904710794,"falcon-40b-instruct":5.6075325447,"Samba-CoE-v0.2-best-of-16":31.5065442681,"opencoderplus-15b":8.1524101557,"xwinlm-70b-v0.1":24.6496860571,"wizardlm-13b-v1.2":14.4625906943,"aligner-2b_qwen1.5-72b-chat":36.7258688784,"internlm2-chat-7b-ExPO":22.6674802488,"claude-2.1":25.2519438861,"vicuna-7b-v1.3":7.1564609564,"oasst-rlhf-llama-33b":7.9709218373,"zephyr-7b-alpha-ExPO":13.6232522647,"openchat-v3.1-13b":14.5033879568,"SPPO-Llama-3-Instruct-8B-PairRM":38.5628066368,"minotaur-13b":11.4652513168,"tulu-2-dpo-13b-ExPO":17.6509979624,"zephyr-7b-beta-ExPO":14.0012119801,"tulu-2-dpo-7b-ExPO":11.6088057579,"Llama-3-Instruct-8B-SimPO":44.6804680926,"baize-v2-13b":7.012247205,"guanaco-7b":2.8711168131,"ultralm-13b-v2.0-best-of-16":14.1989875666,"claude-2.1_concise":18.2084579084,"openchat-13b":8.8060534912,"tulu-2-dpo-70b":21.2386100384,"deepseek-llm-67b-chat":17.8433840899,"humpback-llama-65b":12.7998599959,"tulu-2-dpo-70b-ExPO":25.7233081711,"TempNet-LLaMA2-Chat-7B-v0.1":5.7396138367,"nous-hermes-13b":9.7178634178,"gpt-3.5-turbo-0613":22.3525129805,"alpaca-7b_concise":4.4672516799,"baichuan-13b-chat":2.0621702536,"claude-3-5-sonnet-20240620":52.3667542714,"gpt-3.5-turbo-1106":19.300589035,"minichat-3b":5.7293328759,"Storm-7B":50.4080792281,"oasst-sft-pythia-12b":3.2701021145,"Conifer-7B-DPO":17.1124958828,"Snorkel-Mistral-PairRM-DPO":26.3914464573,"internlm2-chat-20b-ExPO":27.2257594808,"Samba-CoE-v0.2":27.6242673501,"gemini-pro":24.381776108,"pairrm-tulu-2-70b":21.4284039755,"text_davinci_003":4.5664105675,"gpt4":38.1280897444,"Yi-34B-Chat":27.1905478776,"Starli
ng-LM-7B-beta-ExPO":26.4869564984,"pairrm-Yi-34B-Chat":28.8148408668,"gpt4_1106_preview":50.0,"evo-7b":16.4893860042,"zephyr-7b-beta":13.2031984931,"guanaco-13b":3.0037873296,"alpaca-7b":5.8754871633,"internlm2-chat-20b-ppo":18.7487394854,"gemma-2b-it":5.4374536204,"pairrm-zephyr-7b-beta":15.529867295,"evo-v2-7b":23.357705702,"causallm-14b":15.720325189,"SPPO-Mistral7B-PairRM":30.4941379652,"gpt-3.5-turbo-1106_concise":15.7695209839,"openbuddy-llama-65b-v8":12.4693562891,"claude2-alpaca-13b":11.4988982132,"Starling-LM-7B-alpha-ExPO":19.4741654606,"openbuddy-falcon-7b-v6":4.8261244822,"gemma-7b-it":10.4257604037,"phi-2-sft":5.8537876906,"gpt4_gamed":12.1887640576,"llama-2-70b-chat-hf":14.6896485884,"openbuddy-llama2-70b-v10.1":12.5721732723,"wizardlm-70b":17.5750607375,"ultralm-13b-best-of-16":9.8760888169},"lc_standard_error":{"gpt-3.5-turbo-0301":0.7864976807,"gpt-3.5-turbo-1106_verbose":0.8544953416,"vicuna-13b-v1.5-togetherai":0.6243797898,"Qwen1.5-1.8B-Chat":0.2021610274,"recycled-wizardlm-7b-v1.0":0.4105893841,"aligner-2b_claude-3-opus-20240229":0.7776876699,"Qwen1.5-110B-Chat":0.8945807936,"claude-3-opus-20240229":0.8837504763,"llama-2-7b-chat-hf":0.3326400931,"mistral-medium":0.9075464438,"vicuna-33b-v1.3":0.7099362877,"cohere":0.5206791146,"claude-2":0.8779084794,"guanaco-65b":0.46281361,"Mixtral-8x7B-Instruct-v0.1":0.9011105015,"openchat-v2-w-13b":0.5657607148,"falcon-7b-instruct":0.268726544,"wizardlm-13b-v1.1":0.6712555976,"Meta-Llama-3-8B-Instruct":0.849800882,"FsfairX-Zephyr-Chat-v0.1":0.7594505141,"Infinity-Instruct-3M-0613-Mistral-7B":0.7760697229,"Qwen1.5-72B-Chat":0.9357421321,"xwinlm-7b-v0.1":0.5519849159,"Mixtral-8x22B-Instruct-v0.1":0.9518125819,"vicuna-13b-v1.5":0.5980193852,"dbrx-instruct":0.8999456518,"zephyr-7b-alpha":0.5879820221,"tulu-2-dpo-13b":0.6494943093,"Qwen1.5-7B-Chat":0.6490365375,"Together-MoA-Lite":0.7580510219,"cut-13b":0.6383138465,"Meta-Llama-3-70B-Instruct":0.8691832384,"vicuna-13b-v1.3":0.6100962742,"claude-instant-1.2":0.874
64248,"airoboros-65b":0.6004520879,"openbuddy-llama2-13b-v11.1":0.5636847159,"phi-2":0.1293627793,"Together-MoA":0.7392392837,"mistral-large-2402":0.9044632955,"openbuddy-llama-30b-v7.1":0.6099418552,"TempNet-LLaMA2-Chat-70B-v0.1":0.7195404924,"pairrm-tulu-2-13b":0.7958946232,"recycled-wizardlm-7b-v2.0":0.4355543699,"Storm-7B-best-of-64":0.6799412402,"vicuna-7b":0.3964740967,"claude-3-sonnet-20240229":0.949844689,"Mistral-7B-Instruct-v0.2":0.7875592102,"Samba-CoE-v0.1":0.7405123259,"claude":0.858614564,"Nanbeige2-8B-Chat":0.5909370499,"REBEL-Llama-3-8B-Instruct":0.8138922262,"chatglm2-6b":0.2913010016,"gpt-4o-2024-05-13":0.7774399385,"gpt4_1106_preview_verbose":0.8313707608,"TempNet-LLaMA2-Chat-13B-v0.1":0.4783538284,"text_davinci_001":0.2169278281,"Mixtral-8x7B-Instruct-v0.1_verbose":0.7975932103,"baize-v2-7b":0.3307775329,"phi-2-dpo":0.420015191,"alpaca-farm-ppo-human":0.4202234849,"Nanbeige2-16B-Chat":0.8504106275,"gpt4_0613":0.7874508454,"pythia-12b-mix-sft":0.2932467883,"alpaca-7b-neft":0.2516233369,"Qwen1.5-14B-Chat":0.7729838609,"gpt-4-0125-preview":0.7731843456,"guanaco-33b":0.3195322556,"oasst-sft-llama-33b":0.5204539206,"gpt4_0613_verbose":0.8842151461,"llama-2-chat-7b-evol70k-neft":0.4277221418,"gpt35_turbo_instruct":0.3748783811,"platolm-7b":0.3937696385,"llama-2-13b-chat-hf":0.5161956367,"Nanbeige-Plus-Chat-v0.1":0.7209678864,"openchat-v2-13b":0.5398936504,"mistral-orpo-beta":0.6895695724,"Snorkel-Mistral-PairRM-DPO-best-of-16":0.7464891533,"tulu-2-dpo-7b":0.5465634637,"alpaca-7b_verbose":0.2437107339,"OpenHermes-2.5-Mistral-7B":0.7206735233,"claude-2.1_verbose":0.6612722747,"ultralm-13b-v2.0":0.5248779977,"deita-7b-v1.0":0.7398615266,"minichat-1.5-3b":0.4364271175,"Qwen-14B-Chat":0.6714412819,"airoboros-33b":0.5566576337,"alpaca-farm-ppo-sim-gpt4-20k":0.456168214,"ultralm-13b":0.4337345632,"openbuddy-falcon-40b-v9":0.545771106,"openchat8192-13b":0.4356316711,"wizardlm-13b":0.5385026234,"vicuna-13b":0.5388256266,"merlinite-7B-AOT":0.8150560619,"gpt4_031
4":0.8997916758,"gpt4_0613_concise":0.7524372534,"jina-chat":0.6805565304,"Contextual-KTO-Mistral-PairRM":0.7122554396,"xwinlm-13b-v0.1":0.7513299972,"LMCocktail-10.7B-v1":0.8369176162,"SPPO-Mistral7B-PairRM-ExPO":0.7655500294,"Mixtral-8x7B-Instruct-v0.1_concise":0.8710401023,"gpt4_1106_preview_concise":0.7406558917,"Mistral-7B-ReMax-v0.1":0.807838924,"Llama-3-Instruct-8B-SimPO-ExPO":0.8703329817,"dolphin-2.2.1-mistral-7b":0.6251596825,"humpback-llama2-70b":0.6984941388,"openpipe-moa-gpt-4-turbo-v1":0.7309418615,"vicuna-7b-v1.5":0.4868743581,"Starling-LM-7B-alpha":0.658381614,"falcon-40b-instruct":0.3565968022,"Samba-CoE-v0.2-best-of-16":0.7338723477,"opencoderplus-15b":0.4567320517,"xwinlm-70b-v0.1":0.9059240217,"wizardlm-13b-v1.2":0.6741078562,"aligner-2b_qwen1.5-72b-chat":0.6787999003,"internlm2-chat-7b-ExPO":0.629923982,"claude-2.1":0.7515108894,"vicuna-7b-v1.3":0.4355620786,"oasst-rlhf-llama-33b":0.4061516205,"zephyr-7b-alpha-ExPO":0.7160268998,"openchat-v3.1-13b":0.6974328561,"SPPO-Llama-3-Instruct-8B-PairRM":0.8694594533,"minotaur-13b":0.368757115,"tulu-2-dpo-13b-ExPO":0.5166082438,"zephyr-7b-beta-ExPO":0.5303710259,"tulu-2-dpo-7b-ExPO":0.4355576278,"Llama-3-Instruct-8B-SimPO":0.8789917177,"baize-v2-13b":0.4685705196,"guanaco-7b":0.2018869696,"ultralm-13b-v2.0-best-of-16":0.6555243163,"claude-2.1_concise":0.6338526283,"openchat-13b":0.4470052867,"tulu-2-dpo-70b":0.8610574163,"deepseek-llm-67b-chat":0.7439504148,"humpback-llama-65b":0.6567402094,"tulu-2-dpo-70b-ExPO":0.4593179402,"TempNet-LLaMA2-Chat-7B-v0.1":0.3407340673,"nous-hermes-13b":0.5572824918,"gpt-3.5-turbo-0613":0.8045156377,"alpaca-7b_concise":0.2820018938,"baichuan-13b-chat":0.1525670221,"claude-3-5-sonnet-20240620":0.7976856335,"gpt-3.5-turbo-1106":0.7682908268,"minichat-3b":0.3565910812,"Storm-7B":0.7188927916,"oasst-sft-pythia-12b":0.2064079261,"Conifer-7B-DPO":0.7602280224,"Snorkel-Mistral-PairRM-DPO":0.6739888325,"internlm2-chat-20b-ExPO":0.5877331102,"Samba-CoE-v0.2":0.6875926799,"gemini-pro
":0.8158961767,"pairrm-tulu-2-70b":0.8359305763,"text_davinci_003":0.3109387936,"gpt4":0.9069675584,"Yi-34B-Chat":0.7470363322,"Starling-LM-7B-beta-ExPO":0.7549415682,"pairrm-Yi-34B-Chat":0.8310750322,"gpt4_1106_preview":0.0,"evo-7b":0.502828858,"zephyr-7b-beta":0.6521227924,"guanaco-13b":0.2069624951,"alpaca-7b":0.3755224975,"internlm2-chat-20b-ppo":0.7522583795,"gemma-2b-it":0.3236386036,"pairrm-zephyr-7b-beta":0.7455357676,"evo-v2-7b":0.6353106561,"causallm-14b":0.7103430968,"SPPO-Mistral7B-PairRM":0.8458266977,"gpt-3.5-turbo-1106_concise":0.7318554971,"openbuddy-llama-65b-v8":0.6457736922,"claude2-alpaca-13b":0.6646440129,"Starling-LM-7B-alpha-ExPO":0.4701002864,"openbuddy-falcon-7b-v6":0.3350353845,"gemma-7b-it":0.4807679381,"phi-2-sft":0.3931141644,"gpt4_gamed":0.3987510662,"llama-2-70b-chat-hf":0.6625475757,"openbuddy-llama2-70b-v10.1":0.6740810303,"wizardlm-70b":0.7233004015,"ultralm-13b-best-of-16":0.4814959281},"num_tokens_mean":{"gpt-3.5-turbo-0301":179,"gpt-3.5-turbo-1106_verbose":214,"vicuna-13b-v1.5-togetherai":231,"Qwen1.5-1.8B-Chat":586,"recycled-wizardlm-7b-v1.0":299,"aligner-2b_claude-3-opus-20240229":354,"Qwen1.5-110B-Chat":346,"claude-3-opus-20240229":292,"llama-2-7b-chat-hf":302,"mistral-medium":327,"vicuna-33b-v1.3":315,"cohere":396,"claude-2":227,"guanaco-65b":262,"Mixtral-8x7B-Instruct-v0.1":311,"openchat-v2-w-13b":335,"falcon-7b-instruct":106,"wizardlm-13b-v1.1":325,"Meta-Llama-3-8B-Instruct":412,"FsfairX-Zephyr-Chat-v0.1":505,"Infinity-Instruct-3M-0613-Mistral-7B":106,"Qwen1.5-72B-Chat":342,"xwinlm-7b-v0.1":407,"Mixtral-8x22B-Instruct-v0.1":307,"vicuna-13b-v1.5":229,"dbrx-instruct":310,"zephyr-7b-alpha":283,"tulu-2-dpo-13b":370,"Qwen1.5-7B-Chat":342,"Together-MoA-Lite":420,"cut-13b":348,"Meta-Llama-3-70B-Instruct":416,"vicuna-13b-v1.3":244,"claude-instant-1.2":233,"airoboros-65b":312,"openbuddy-llama2-13b-v11.1":228,"phi-2":142,"Together-MoA":386,"mistral-large-2402":290,"openbuddy-llama-30b-v7.1":208,"TempNet-LLaMA2-Chat-70B-v0.1":384,"pai
rrm-tulu-2-13b":313,"recycled-wizardlm-7b-v2.0":317,"Storm-7B-best-of-64":486,"vicuna-7b":224,"claude-3-sonnet-20240229":297,"Mistral-7B-Instruct-v0.2":362,"Samba-CoE-v0.1":258,"claude":229,"Nanbeige2-8B-Chat":561,"REBEL-Llama-3-8B-Instruct":509,"chatglm2-6b":234,"gpt-4o-2024-05-13":406,"gpt4_1106_preview_verbose":505,"TempNet-LLaMA2-Chat-13B-v0.1":320,"text_davinci_001":67,"Mixtral-8x7B-Instruct-v0.1_verbose":438,"baize-v2-7b":247,"phi-2-dpo":356,"alpaca-farm-ppo-human":169,"Nanbeige2-16B-Chat":394,"gpt4_0613":245,"pythia-12b-mix-sft":197,"alpaca-7b-neft":216,"Qwen1.5-14B-Chat":348,"gpt-4-0125-preview":417,"guanaco-33b":283,"oasst-sft-llama-33b":161,"gpt4_0613_verbose":313,"llama-2-chat-7b-evol70k-neft":330,"gpt35_turbo_instruct":219,"platolm-7b":272,"llama-2-13b-chat-hf":316,"Nanbeige-Plus-Chat-v0.1":529,"openchat-v2-13b":330,"mistral-orpo-beta":344,"Snorkel-Mistral-PairRM-DPO-best-of-16":526,"tulu-2-dpo-7b":399,"alpaca-7b_verbose":119,"OpenHermes-2.5-Mistral-7B":237,"claude-2.1_verbose":292,"ultralm-13b-v2.0":299,"deita-7b-v1.0":303,"minichat-1.5-3b":322,"Qwen-14B-Chat":223,"airoboros-33b":315,"alpaca-farm-ppo-sim-gpt4-20k":104,"ultralm-13b":233,"openbuddy-falcon-40b-v9":238,"openchat8192-13b":359,"wizardlm-13b":211,"vicuna-13b":224,"merlinite-7B-AOT":385,"gpt4_0314":289,"gpt4_0613_concise":138,"jina-chat":138,"Contextual-KTO-Mistral-PairRM":506,"xwinlm-13b-v0.1":400,"LMCocktail-10.7B-v1":244,"SPPO-Mistral7B-PairRM-ExPO":459,"Mixtral-8x7B-Instruct-v0.1_concise":195,"gpt4_1106_preview_concise":244,"Mistral-7B-ReMax-v0.1":314,"Llama-3-Instruct-8B-SimPO-ExPO":370,"dolphin-2.2.1-mistral-7b":243,"humpback-llama2-70b":234,"openpipe-moa-gpt-4-turbo-v1":377,"vicuna-7b-v1.5":234,"Starling-LM-7B-alpha":392,"falcon-40b-instruct":148,"Samba-CoE-v0.2-best-of-16":312,"opencoderplus-15b":354,"xwinlm-70b-v0.1":381,"wizardlm-13b-v1.2":354,"aligner-2b_qwen1.5-72b-chat":383,"internlm2-chat-7b-ExPO":494,"claude-2.1":228,"vicuna-7b-v1.3":238,"oasst-rlhf-llama-33b":243,"zephyr-7b-alph
a-ExPO":267,"openchat-v3.1-13b":318,"SPPO-Llama-3-Instruct-8B-PairRM":443,"minotaur-13b":186,"tulu-2-dpo-13b-ExPO":350,"zephyr-7b-beta-ExPO":297,"tulu-2-dpo-7b-ExPO":373,"Llama-3-Instruct-8B-SimPO":385,"baize-v2-13b":201,"guanaco-7b":311,"ultralm-13b-v2.0-best-of-16":354,"claude-2.1_concise":119,"openchat-13b":349,"tulu-2-dpo-70b":313,"deepseek-llm-67b-chat":250,"humpback-llama-65b":254,"tulu-2-dpo-70b-ExPO":368,"TempNet-LLaMA2-Chat-7B-v0.1":308,"nous-hermes-13b":187,"gpt-3.5-turbo-0613":270,"alpaca-7b_concise":77,"baichuan-13b-chat":310,"claude-3-5-sonnet-20240620":315,"gpt-3.5-turbo-1106":164,"minichat-3b":186,"Storm-7B":429,"oasst-sft-pythia-12b":158,"Conifer-7B-DPO":266,"Snorkel-Mistral-PairRM-DPO":566,"internlm2-chat-20b-ExPO":693,"Samba-CoE-v0.2":292,"gemini-pro":313,"pairrm-tulu-2-70b":405,"text_davinci_003":69,"gpt4":287,"Yi-34B-Chat":447,"Starling-LM-7B-beta-ExPO":447,"pairrm-Yi-34B-Chat":464,"gpt4_1106_preview":431,"evo-7b":382,"zephyr-7b-beta":311,"guanaco-13b":416,"alpaca-7b":85,"internlm2-chat-20b-ppo":513,"gemma-2b-it":222,"pairrm-zephyr-7b-beta":315,"evo-v2-7b":375,"causallm-14b":302,"SPPO-Mistral7B-PairRM":431,"gpt-3.5-turbo-1106_concise":90,"openbuddy-llama-65b-v8":249,"claude2-alpaca-13b":243,"Starling-LM-7B-alpha-ExPO":376,"openbuddy-falcon-7b-v6":248,"gemma-7b-it":241,"phi-2-sft":225,"gpt4_gamed":15,"llama-2-70b-chat-hf":377,"openbuddy-llama2-70b-v10.1":230,"wizardlm-70b":332,"ultralm-13b-best-of-16":400},"num_tokens_std":{"gpt-3.5-turbo-0301":150,"gpt-3.5-turbo-1106_verbose":144,"vicuna-13b-v1.5-togetherai":163,"Qwen1.5-1.8B-Chat":611,"recycled-wizardlm-7b-v1.0":186,"aligner-2b_claude-3-opus-20240229":206,"Qwen1.5-110B-Chat":211,"claude-3-opus-20240229":156,"llama-2-7b-chat-hf":170,"mistral-medium":305,"vicuna-33b-v1.3":198,"cohere":225,"claude-2":114,"guanaco-65b":212,"Mixtral-8x7B-Instruct-v0.1":219,"openchat-v2-w-13b":210,"falcon-7b-instruct":177,"wizardlm-13b-v1.1":213,"Meta-Llama-3-8B-Instruct":231,"FsfairX-Zephyr-Chat-v0.1":311,"Infinity-I
nstruct-3M-0613-Mistral-7B":102,"Qwen1.5-72B-Chat":346,"xwinlm-7b-v0.1":308,"Mixtral-8x22B-Instruct-v0.1":253,"vicuna-13b-v1.5":164,"dbrx-instruct":193,"zephyr-7b-alpha":272,"tulu-2-dpo-13b":592,"Qwen1.5-7B-Chat":199,"Together-MoA-Lite":233,"cut-13b":208,"Meta-Llama-3-70B-Instruct":238,"vicuna-13b-v1.3":167,"claude-instant-1.2":132,"airoboros-65b":372,"openbuddy-llama2-13b-v11.1":175,"phi-2":312,"Together-MoA":217,"mistral-large-2402":191,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":258,"pairrm-tulu-2-13b":194,"recycled-wizardlm-7b-v2.0":173,"Storm-7B-best-of-64":287,"vicuna-7b":161,"claude-3-sonnet-20240229":178,"Mistral-7B-Instruct-v0.2":360,"Samba-CoE-v0.1":194,"claude":123,"Nanbeige2-8B-Chat":239,"REBEL-Llama-3-8B-Instruct":463,"chatglm2-6b":352,"gpt-4o-2024-05-13":273,"gpt4_1106_preview_verbose":261,"TempNet-LLaMA2-Chat-13B-v0.1":178,"text_davinci_001":78,"Mixtral-8x7B-Instruct-v0.1_verbose":225,"baize-v2-7b":183,"phi-2-dpo":188,"alpaca-farm-ppo-human":171,"Nanbeige2-16B-Chat":214,"gpt4_0613":164,"pythia-12b-mix-sft":181,"alpaca-7b-neft":97,"Qwen1.5-14B-Chat":224,"gpt-4-0125-preview":242,"guanaco-33b":236,"oasst-sft-llama-33b":145,"gpt4_0613_verbose":166,"llama-2-chat-7b-evol70k-neft":143,"gpt35_turbo_instruct":241,"platolm-7b":187,"llama-2-13b-chat-hf":175,"Nanbeige-Plus-Chat-v0.1":216,"openchat-v2-13b":205,"mistral-orpo-beta":205,"Snorkel-Mistral-PairRM-DPO-best-of-16":255,"tulu-2-dpo-7b":738,"alpaca-7b_verbose":111,"OpenHermes-2.5-Mistral-7B":225,"claude-2.1_verbose":120,"ultralm-13b-v2.0":188,"deita-7b-v1.0":221,"minichat-1.5-3b":215,"Qwen-14B-Chat":227,"airoboros-33b":324,"alpaca-farm-ppo-sim-gpt4-20k":68,"ultralm-13b":158,"openbuddy-falcon-40b-v9":198,"openchat8192-13b":257,"wizardlm-13b":155,"vicuna-13b":153,"merlinite-7B-AOT":180,"gpt4_0314":212,"gpt4_0613_concise":110,"jina-chat":101,"Contextual-KTO-Mistral-PairRM":272,"xwinlm-13b-v0.1":268,"LMCocktail-10.7B-v1":168,"SPPO-Mistral7B-PairRM-ExPO":193,"Mixtral-8x7B-Instruct-v0.1_concise":
185,"gpt4_1106_preview_concise":184,"Mistral-7B-ReMax-v0.1":188,"Llama-3-Instruct-8B-SimPO-ExPO":191,"dolphin-2.2.1-mistral-7b":218,"humpback-llama2-70b":214,"openpipe-moa-gpt-4-turbo-v1":193,"vicuna-7b-v1.5":165,"Starling-LM-7B-alpha":250,"falcon-40b-instruct":182,"Samba-CoE-v0.2-best-of-16":199,"opencoderplus-15b":272,"xwinlm-70b-v0.1":267,"wizardlm-13b-v1.2":308,"aligner-2b_qwen1.5-72b-chat":210,"internlm2-chat-7b-ExPO":232,"claude-2.1":114,"vicuna-7b-v1.3":159,"oasst-rlhf-llama-33b":257,"zephyr-7b-alpha-ExPO":204,"openchat-v3.1-13b":224,"SPPO-Llama-3-Instruct-8B-PairRM":208,"minotaur-13b":164,"tulu-2-dpo-13b-ExPO":203,"zephyr-7b-beta-ExPO":217,"tulu-2-dpo-7b-ExPO":217,"Llama-3-Instruct-8B-SimPO":200,"baize-v2-13b":149,"guanaco-7b":292,"ultralm-13b-v2.0-best-of-16":169,"claude-2.1_concise":88,"openchat-13b":241,"tulu-2-dpo-70b":281,"deepseek-llm-67b-chat":206,"humpback-llama-65b":156,"tulu-2-dpo-70b-ExPO":203,"TempNet-LLaMA2-Chat-7B-v0.1":178,"nous-hermes-13b":201,"gpt-3.5-turbo-0613":194,"alpaca-7b_concise":70,"baichuan-13b-chat":400,"claude-3-5-sonnet-20240620":200,"gpt-3.5-turbo-1106":134,"minichat-3b":156,"Storm-7B":188,"oasst-sft-pythia-12b":186,"Conifer-7B-DPO":163,"Snorkel-Mistral-PairRM-DPO":342,"internlm2-chat-20b-ExPO":283,"Samba-CoE-v0.2":196,"gemini-pro":227,"pairrm-tulu-2-70b":438,"text_davinci_003":89,"gpt4":203,"Yi-34B-Chat":235,"Starling-LM-7B-beta-ExPO":143,"pairrm-Yi-34B-Chat":276,"gpt4_1106_preview":249,"evo-7b":241,"zephyr-7b-beta":238,"guanaco-13b":402,"alpaca-7b":69,"internlm2-chat-20b-ppo":403,"gemma-2b-it":168,"pairrm-zephyr-7b-beta":250,"evo-v2-7b":235,"causallm-14b":234,"SPPO-Mistral7B-PairRM":198,"gpt-3.5-turbo-1106_concise":80,"openbuddy-llama-65b-v8":152,"claude2-alpaca-13b":176,"Starling-LM-7B-alpha-ExPO":220,"openbuddy-falcon-7b-v6":189,"gemma-7b-it":160,"phi-2-sft":164,"gpt4_gamed":81,"llama-2-70b-chat-hf":222,"openbuddy-llama2-70b-v10.1":161,"wizardlm-70b":231,"ultralm-13b-best-of-16":164}}
|
|
|
|
data/model_win_rates.jsonl
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"num_words_mean":257,"num_words_std":145,"win_rate":34.4633736232,"standard_error":1.3146665263,"n_wins":225.0,"n_wins_base":475.0,"n_draws":105.0,"n_total":805.0,"discrete_win_rate":34.4720496894,"length_controlled_winrate":41.8230717152,"lc_standard_error":0.7776876699,"num_tokens_mean":354,"num_tokens_std":206,"model_name":"aligner-2b_claude-3-opus-20240229"}
|
2 |
+
{"num_words_mean":253,"num_words_std":153,"win_rate":33.7770952757,"standard_error":1.3776163154,"n_wins":255.0,"n_wins_base":545.0,"n_draws":5.0,"n_total":805.0,"discrete_win_rate":31.9875776398,"length_controlled_winrate":43.905552211,"lc_standard_error":0.8945807936,"num_tokens_mean":346,"num_tokens_std":211,"model_name":"Qwen1.5-110B-Chat"}
|
3 |
+
{"num_words_mean":216,"num_words_std":113,"win_rate":29.1052695333,"standard_error":1.3941539442,"n_wins":223.0,"n_wins_base":579.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":27.8881987578,"length_controlled_winrate":40.5095080124,"lc_standard_error":0.8837504763,"num_tokens_mean":292,"num_tokens_std":156,"model_name":"claude-3-opus-20240229"}
|
4 |
+
{"num_words_mean":241,"num_words_std":166,"win_rate":21.8557725437,"standard_error":1.2682402187,"n_wins":164.0,"n_wins_base":639.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":20.4968944099,"length_controlled_winrate":28.6143374017,"lc_standard_error":0.9075464438,"num_tokens_mean":327,"num_tokens_std":305,"model_name":"mistral-medium"}
|
5 |
+
{"num_words_mean":174,"num_words_std":90,"win_rate":17.1882403567,"standard_error":1.1748282562,"n_wins":131.0,"n_wins_base":673.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":16.3354037267,"length_controlled_winrate":28.1551961416,"lc_standard_error":0.8779084794,"num_tokens_mean":227,"num_tokens_std":114,"model_name":"claude-2"}
|
6 |
+
{"num_words_mean":342,"num_words_std":187,"win_rate":35.9464864409,"standard_error":1.4410058098,"n_wins":285.0,"n_wins_base":517.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.5900621118,"length_controlled_winrate":34.787447623,"lc_standard_error":0.7594505141,"num_tokens_mean":505,"num_tokens_std":311,"model_name":"FsfairX-Zephyr-Chat-v0.1"}
|
7 |
+
{"num_words_mean":79,"num_words_std":78,"win_rate":15.7478281307,"standard_error":1.1194852006,"n_wins":118.0,"n_wins_base":687.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":14.6583850932,"length_controlled_winrate":25.5015577947,"lc_standard_error":0.7760697229,"num_tokens_mean":106,"num_tokens_std":102,"model_name":"Infinity-Instruct-3M-0613-Mistral-7B"}
|
8 |
+
{"num_words_mean":243,"num_words_std":144,"win_rate":26.4982833956,"standard_error":1.3042361649,"n_wins":201.0,"n_wins_base":600.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":25.2173913043,"length_controlled_winrate":36.571754112,"lc_standard_error":0.9357421321,"num_tokens_mean":342,"num_tokens_std":346,"model_name":"Qwen1.5-72B-Chat"}
|
9 |
+
{"num_words_mean":229,"num_words_std":168,"win_rate":22.2101705475,"standard_error":1.2780740057,"n_wins":174.0,"n_wins_base":628.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":21.801242236,"length_controlled_winrate":30.8788102941,"lc_standard_error":0.9518125819,"num_tokens_mean":307,"num_tokens_std":253,"model_name":"Mixtral-8x22B-Instruct-v0.1"}
|
10 |
+
{"num_words_mean":235,"num_words_std":143,"win_rate":19.7553327319,"standard_error":1.2063251121,"n_wins":147.0,"n_wins_base":657.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":18.3229813665,"length_controlled_winrate":25.1853410397,"lc_standard_error":0.8999456518,"num_tokens_mean":310,"num_tokens_std":193,"model_name":"dbrx-instruct"}
|
11 |
+
{"num_words_mean":297,"num_words_std":161,"win_rate":56.5930456223,"standard_error":1.4464848562,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":59.1415240989,"lc_standard_error":0.7580510219,"num_tokens_mean":420,"num_tokens_std":233,"model_name":"Together-MoA-Lite"}
|
12 |
+
{"num_words_mean":299,"num_words_std":171,"win_rate":33.1778569588,"standard_error":1.3886514096,"n_wins":266.0,"n_wins_base":537.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.1677018634,"length_controlled_winrate":34.4245971745,"lc_standard_error":0.8691832384,"num_tokens_mean":416,"num_tokens_std":238,"model_name":"Meta-Llama-3-70B-Instruct"}
|
13 |
+
{"num_words_mean":179,"num_words_std":104,"win_rate":16.1273996216,"standard_error":1.1341036838,"n_wins":120.0,"n_wins_base":682.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":15.0931677019,"length_controlled_winrate":25.6122590254,"lc_standard_error":0.87464248,"num_tokens_mean":233,"num_tokens_std":132,"model_name":"claude-instant-1.2"}
|
14 |
+
{"num_words_mean":272,"num_words_std":151,"win_rate":59.8688062333,"standard_error":1.4343056045,"n_wins":490.0,"n_wins_base":314.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":60.9316770186,"length_controlled_winrate":65.3799697685,"lc_standard_error":0.7392392837,"num_tokens_mean":386,"num_tokens_std":217,"model_name":"Together-MoA"}
|
15 |
+
{"num_words_mean":218,"num_words_std":148,"win_rate":21.4387759814,"standard_error":1.2485232545,"n_wins":166.0,"n_wins_base":638.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":20.6832298137,"length_controlled_winrate":32.6520799853,"lc_standard_error":0.9044632955,"num_tokens_mean":290,"num_tokens_std":191,"model_name":"mistral-large-2402"}
|
16 |
+
{"num_words_mean":336,"num_words_std":195,"win_rate":63.0409907519,"standard_error":1.4253258915,"n_wins":519.0,"n_wins_base":286.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":64.4720496894,"length_controlled_winrate":61.637895572,"lc_standard_error":0.6799412402,"num_tokens_mean":486,"num_tokens_std":287,"model_name":"Storm-7B-best-of-64"}
|
17 |
+
{"num_words_mean":221,"num_words_std":129,"win_rate":25.5563252923,"standard_error":1.3419811052,"n_wins":193.0,"n_wins_base":608.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":24.2236024845,"length_controlled_winrate":34.8724743624,"lc_standard_error":0.949844689,"num_tokens_mean":297,"num_tokens_std":178,"model_name":"claude-3-sonnet-20240229"}
|
18 |
+
{"num_words_mean":176,"num_words_std":97,"win_rate":16.9853436124,"standard_error":1.1687959793,"n_wins":129.0,"n_wins_base":676.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":16.0248447205,"length_controlled_winrate":27.2895044437,"lc_standard_error":0.858614564,"num_tokens_mean":229,"num_tokens_std":123,"model_name":"claude"}
|
19 |
+
{"num_words_mean":415,"num_words_std":173,"win_rate":39.354502072,"standard_error":1.4524224246,"n_wins":323.0,"n_wins_base":480.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":40.248447205,"length_controlled_winrate":25.2417704867,"lc_standard_error":0.5909370499,"num_tokens_mean":561,"num_tokens_std":239,"model_name":"Nanbeige2-8B-Chat"}
|
20 |
+
{"num_words_mean":341,"num_words_std":273,"win_rate":34.3064238313,"standard_error":1.3914900256,"n_wins":268.0,"n_wins_base":537.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":33.2919254658,"length_controlled_winrate":31.4699427971,"lc_standard_error":0.8138922262,"num_tokens_mean":509,"num_tokens_std":463,"model_name":"REBEL-Llama-3-8B-Instruct"}
|
21 |
+
{"num_words_mean":286,"num_words_std":190,"win_rate":51.3275757825,"standard_error":1.470009459,"n_wins":429.0,"n_wins_base":369.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":53.7267080745,"length_controlled_winrate":57.4568288333,"lc_standard_error":0.7774399385,"num_tokens_mean":406,"num_tokens_std":273,"model_name":"gpt-4o-2024-05-13"}
|
22 |
+
{"num_words_mean":378,"num_words_std":190,"win_rate":64.303601471,"standard_error":1.3348590089,"n_wins":525.0,"n_wins_base":268.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":65.9627329193,"length_controlled_winrate":51.5750079797,"lc_standard_error":0.8313707608,"num_tokens_mean":505,"num_tokens_std":261,"model_name":"gpt4_1106_preview_verbose"}
|
23 |
+
{"num_words_mean":284,"num_words_std":151,"win_rate":37.0360860499,"standard_error":1.4340261273,"n_wins":288.0,"n_wins_base":514.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.9627329193,"length_controlled_winrate":40.5912863493,"lc_standard_error":0.8504106275,"num_tokens_mean":394,"num_tokens_std":214,"model_name":"Nanbeige2-16B-Chat"}
|
24 |
+
{"num_words_mean":183,"num_words_std":124,"win_rate":15.7550380876,"standard_error":1.0754642482,"n_wins":117.0,"n_wins_base":684.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":14.7826086957,"length_controlled_winrate":30.1833223167,"lc_standard_error":0.7874508454,"num_tokens_mean":245,"num_tokens_std":164,"model_name":"gpt4_0613"}
|
25 |
+
{"num_words_mean":313,"num_words_std":180,"win_rate":54.9665397329,"standard_error":1.4286740089,"n_wins":446.0,"n_wins_base":347.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":56.149068323,"length_controlled_winrate":56.3562938462,"lc_standard_error":0.7731843456,"num_tokens_mean":417,"num_tokens_std":242,"model_name":"gpt-4-0125-preview"}
|
26 |
+
{"num_words_mean":237,"num_words_std":127,"win_rate":23.2373600435,"standard_error":1.2835395056,"n_wins":171.0,"n_wins_base":630.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":21.4906832298,"length_controlled_winrate":33.8212668866,"lc_standard_error":0.8842151461,"num_tokens_mean":313,"num_tokens_std":166,"model_name":"gpt4_0613_verbose"}
|
27 |
+
{"num_words_mean":391,"num_words_std":159,"win_rate":56.7030097302,"standard_error":1.482841875,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":44.4596624034,"lc_standard_error":0.7209678864,"num_tokens_mean":529,"num_tokens_std":216,"model_name":"Nanbeige-Plus-Chat-v0.1"}
|
28 |
+
{"num_words_mean":393,"num_words_std":185,"win_rate":34.8601328913,"standard_error":1.3599450437,"n_wins":270.0,"n_wins_base":533.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.6645962733,"length_controlled_winrate":29.9743216131,"lc_standard_error":0.7464891533,"num_tokens_mean":526,"num_tokens_std":255,"model_name":"Snorkel-Mistral-PairRM-DPO-best-of-16"}
|
29 |
+
{"num_words_mean":228,"num_words_std":94,"win_rate":24.3540710901,"standard_error":1.29358621,"n_wins":191.0,"n_wins_base":613.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":23.7888198758,"length_controlled_winrate":30.2911791666,"lc_standard_error":0.6612722747,"num_tokens_mean":292,"num_tokens_std":120,"model_name":"claude-2.1_verbose"}
|
30 |
+
{"num_words_mean":281,"num_words_std":128,"win_rate":29.8963508407,"standard_error":1.3666520485,"n_wins":234.0,"n_wins_base":571.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":29.0683229814,"length_controlled_winrate":31.721885287,"lc_standard_error":0.8150560619,"num_tokens_mean":385,"num_tokens_std":180,"model_name":"merlinite-7B-AOT"}
|
31 |
+
{"num_words_mean":215,"num_words_std":160,"win_rate":22.0732589287,"standard_error":1.2466725495,"n_wins":172.0,"n_wins_base":627.0,"n_draws":6.0,"n_total":805.0,"discrete_win_rate":21.7391304348,"length_controlled_winrate":35.3070612164,"lc_standard_error":0.8997916758,"num_tokens_mean":289,"num_tokens_std":212,"model_name":"gpt4_0314"}
|
32 |
+
{"num_words_mean":381,"num_words_std":205,"win_rate":33.2273552,"standard_error":1.3779687478,"n_wins":260.0,"n_wins_base":544.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":32.3602484472,"length_controlled_winrate":29.7058089397,"lc_standard_error":0.7122554396,"num_tokens_mean":506,"num_tokens_std":272,"model_name":"Contextual-KTO-Mistral-PairRM"}
|
33 |
+
{"num_words_mean":344,"num_words_std":144,"win_rate":35.4431306717,"standard_error":1.3981308966,"n_wins":274.0,"n_wins_base":531.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":34.0372670807,"length_controlled_winrate":31.9003876312,"lc_standard_error":0.7655500294,"num_tokens_mean":459,"num_tokens_std":193,"model_name":"SPPO-Mistral7B-PairRM-ExPO"}
|
34 |
+
{"num_words_mean":177,"num_words_std":130,"win_rate":22.9201944405,"standard_error":1.2325177143,"n_wins":172.0,"n_wins_base":622.0,"n_draws":11.0,"n_total":805.0,"discrete_win_rate":22.049689441,"length_controlled_winrate":41.8966015912,"lc_standard_error":0.7406558917,"num_tokens_mean":244,"num_tokens_std":184,"model_name":"gpt4_1106_preview_concise"}
|
35 |
+
{"num_words_mean":261,"num_words_std":134,"win_rate":40.6328540086,"standard_error":1.4439449942,"n_wins":325.0,"n_wins_base":479.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":40.4347826087,"length_controlled_winrate":45.807978034,"lc_standard_error":0.8703329817,"num_tokens_mean":370,"num_tokens_std":191,"model_name":"Llama-3-Instruct-8B-SimPO-ExPO"}
|
36 |
+
{"num_words_mean":272,"num_words_std":137,"win_rate":63.1549345123,"standard_error":1.4229800988,"n_wins":515.0,"n_wins_base":283.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":64.4099378882,"length_controlled_winrate":68.3786625033,"lc_standard_error":0.7309418615,"num_tokens_mean":377,"num_tokens_std":193,"model_name":"openpipe-moa-gpt-4-turbo-v1"}
|
37 |
+
{"num_words_mean":225,"num_words_std":140,"win_rate":26.9882543183,"standard_error":1.318903,"n_wins":201.0,"n_wins_base":601.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":25.1552795031,"length_controlled_winrate":31.5065442681,"lc_standard_error":0.7338723477,"num_tokens_mean":312,"num_tokens_std":199,"model_name":"Samba-CoE-v0.2-best-of-16"}
|
38 |
+
{"num_words_mean":280,"num_words_std":151,"win_rate":31.773037737,"standard_error":1.2392772646,"n_wins":180.0,"n_wins_base":473.0,"n_draws":152.0,"n_total":805.0,"discrete_win_rate":31.801242236,"length_controlled_winrate":36.7258688784,"lc_standard_error":0.6787999003,"num_tokens_mean":383,"num_tokens_std":210,"model_name":"aligner-2b_qwen1.5-72b-chat"}
|
39 |
+
{"num_words_mean":177,"num_words_std":92,"win_rate":15.7335067364,"standard_error":1.1203158654,"n_wins":115.0,"n_wins_base":688.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":14.4099378882,"length_controlled_winrate":25.2519438861,"lc_standard_error":0.7515108894,"num_tokens_mean":228,"num_tokens_std":114,"model_name":"claude-2.1"}
|
40 |
+
{"num_words_mean":317,"num_words_std":148,"win_rate":39.6728609061,"standard_error":1.4247223562,"n_wins":310.0,"n_wins_base":494.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":38.5714285714,"length_controlled_winrate":38.5628066368,"lc_standard_error":0.8694594533,"num_tokens_mean":443,"num_tokens_std":208,"model_name":"SPPO-Llama-3-Instruct-8B-PairRM"}
|
41 |
+
{"num_words_mean":272,"num_words_std":141,"win_rate":40.5297749846,"standard_error":1.4225744647,"n_wins":319.0,"n_wins_base":485.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":39.6894409938,"length_controlled_winrate":44.6804680926,"lc_standard_error":0.8789917177,"num_tokens_mean":385,"num_tokens_std":200,"model_name":"Llama-3-Instruct-8B-SimPO"}
|
42 |
+
{"num_words_mean":276,"num_words_std":140,"win_rate":22.9806197059,"standard_error":1.3591734083,"n_wins":184.0,"n_wins_base":620.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":22.9192546584,"length_controlled_winrate":25.7233081711,"lc_standard_error":0.4593179402,"num_tokens_mean":368,"num_tokens_std":203,"model_name":"tulu-2-dpo-70b-ExPO"}
|
43 |
+
{"num_words_mean":228,"num_words_std":142,"win_rate":40.5602140968,"standard_error":1.4679655404,"n_wins":312.0,"n_wins_base":493.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":38.7577639752,"length_controlled_winrate":52.3667542714,"lc_standard_error":0.7976856335,"num_tokens_mean":315,"num_tokens_std":200,"model_name":"claude-3-5-sonnet-20240620"}
|
44 |
+
{"num_words_mean":300,"num_words_std":135,"win_rate":50.2688690553,"standard_error":1.4728176781,"n_wins":397.0,"n_wins_base":408.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":49.3167701863,"length_controlled_winrate":50.4080792281,"lc_standard_error":0.7188927916,"num_tokens_mean":429,"num_tokens_std":188,"model_name":"Storm-7B"}
|
45 |
+
{"num_words_mean":414,"num_words_std":241,"win_rate":30.2200527007,"standard_error":1.3328273013,"n_wins":231.0,"n_wins_base":572.0,"n_draws":1.0,"n_total":804.0,"discrete_win_rate":28.7935323383,"length_controlled_winrate":26.3914464573,"lc_standard_error":0.6739888325,"num_tokens_mean":566,"num_tokens_std":342,"model_name":"Snorkel-Mistral-PairRM-DPO"}
|
46 |
+
{"num_words_mean":507,"num_words_std":206,"win_rate":46.1853674689,"standard_error":1.4638315246,"n_wins":375.0,"n_wins_base":430.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":46.5838509317,"length_controlled_winrate":27.2257594808,"lc_standard_error":0.5877331102,"num_tokens_mean":693,"num_tokens_std":283,"model_name":"internlm2-chat-20b-ExPO"}
|
47 |
+
{"num_words_mean":210,"num_words_std":138,"win_rate":21.8473786693,"standard_error":1.2171089783,"n_wins":159.0,"n_wins_base":645.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":19.8136645963,"length_controlled_winrate":27.6242673501,"lc_standard_error":0.6875926799,"num_tokens_mean":292,"num_tokens_std":196,"model_name":"Samba-CoE-v0.2"}
|
48 |
+
{"num_words_mean":215,"num_words_std":156,"win_rate":23.5767893148,"standard_error":1.2757042012,"n_wins":179.0,"n_wins_base":618.0,"n_draws":8.0,"n_total":805.0,"discrete_win_rate":22.7329192547,"length_controlled_winrate":38.1280897444,"lc_standard_error":0.9069675584,"num_tokens_mean":287,"num_tokens_std":203,"model_name":"gpt4"}
|
49 |
+
{"num_words_mean":339,"num_words_std":178,"win_rate":29.6599467188,"standard_error":1.3225712598,"n_wins":219.0,"n_wins_base":582.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":27.4534161491,"length_controlled_winrate":27.1905478776,"lc_standard_error":0.7470363322,"num_tokens_mean":447,"num_tokens_std":235,"model_name":"Yi-34B-Chat"}
|
50 |
+
{"num_words_mean":336,"num_words_std":106,"win_rate":29.6008518479,"standard_error":1.3252049543,"n_wins":225.0,"n_wins_base":580.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":27.950310559,"length_controlled_winrate":26.4869564984,"lc_standard_error":0.7549415682,"num_tokens_mean":447,"num_tokens_std":143,"model_name":"Starling-LM-7B-beta-ExPO"}
|
51 |
+
{"num_words_mean":349,"num_words_std":189,"win_rate":31.2412829468,"standard_error":1.3482437399,"n_wins":239.0,"n_wins_base":563.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":29.8757763975,"length_controlled_winrate":28.8148408668,"lc_standard_error":0.8310750322,"num_tokens_mean":464,"num_tokens_std":276,"model_name":"pairrm-Yi-34B-Chat"}
|
52 |
+
{"num_words_mean":323,"num_words_std":181,"win_rate":50.0,"standard_error":0.0,"n_wins":0.0,"n_wins_base":0.0,"n_draws":805.0,"n_total":805.0,"discrete_win_rate":50.0,"length_controlled_winrate":50.0,"lc_standard_error":0.0,"num_tokens_mean":431,"num_tokens_std":249,"model_name":"gpt4_1106_preview"}
|
53 |
+
{"num_words_mean":322,"num_words_std":148,"win_rate":32.2453123638,"standard_error":1.390800011,"n_wins":249.0,"n_wins_base":556.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":30.9316770186,"length_controlled_winrate":30.4941379652,"lc_standard_error":0.8458266977,"num_tokens_mean":431,"num_tokens_std":198,"model_name":"SPPO-Mistral7B-PairRM"}
|
prep_data.py
CHANGED
@@ -20,7 +20,9 @@ for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
|
|
20 |
if os.path.isdir(model_dir):
|
21 |
model_output_file = os.path.join(model_dir, "model_outputs.json")
|
22 |
if os.path.exists(model_output_file):
|
23 |
-
|
|
|
|
|
24 |
|
25 |
|
26 |
def get_num_words(text):
|
@@ -42,6 +44,7 @@ model_name_to_num_words = {}
|
|
42 |
model_name_to_num_tokens = {}
|
43 |
for model_name, model_dataframe in model_dataframes_outputs.items():
|
44 |
print(f"model_name_to_num_words for {model_name}")
|
|
|
45 |
model_dataframe["output_num_words"] = model_dataframe["output"].apply(get_num_words)
|
46 |
model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(
|
47 |
get_num_tokens
|
@@ -88,5 +91,8 @@ df = df.rename(
|
|
88 |
"std": "num_tokens_std",
|
89 |
}
|
90 |
)
|
|
|
91 |
|
92 |
-
df
|
|
|
|
|
|
20 |
if os.path.isdir(model_dir):
|
21 |
model_output_file = os.path.join(model_dir, "model_outputs.json")
|
22 |
if os.path.exists(model_output_file):
|
23 |
+
df = pd.read_json(model_output_file)
|
24 |
+
df["model_name"] = model_name
|
25 |
+
model_dataframes_outputs[model_name] = df
|
26 |
|
27 |
|
28 |
def get_num_words(text):
|
|
|
44 |
model_name_to_num_tokens = {}
|
45 |
for model_name, model_dataframe in model_dataframes_outputs.items():
|
46 |
print(f"model_name_to_num_words for {model_name}")
|
47 |
+
model_dataframe["model_name"] = model_name
|
48 |
model_dataframe["output_num_words"] = model_dataframe["output"].apply(get_num_words)
|
49 |
model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(
|
50 |
get_num_tokens
|
|
|
91 |
"std": "num_tokens_std",
|
92 |
}
|
93 |
)
|
94 |
+
df["model_name"] = df.index
|
95 |
|
96 |
+
df = df[df["length_controlled_winrate"] > 25]
|
97 |
+
|
98 |
+
df.to_json("data/model_win_rates.jsonl", orient="records", lines=True)
|
prep_data_annotations.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import tiktoken
|
4 |
+
from alpaca_eval import utils, metrics, annotators, constants, analyze, plotting, main
|
5 |
+
from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate
|
6 |
+
import os
|
7 |
+
import pandas as pd
|
8 |
+
import json
|
9 |
+
|
10 |
+
|
11 |
+
# Define the path to the top-level directory
|
12 |
+
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"
|
13 |
+
|
14 |
+
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
|
15 |
+
relevant_models = df["model_name"].unique().tolist()
|
16 |
+
|
17 |
+
# Initialize an empty dictionary to hold the model name to dataframe mapping
|
18 |
+
model_dataframes_outputs = {}
|
19 |
+
|
20 |
+
# Iterate through each subdirectory in the top-level directory
|
21 |
+
df_response_judging = pd.DataFrame()
|
22 |
+
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
|
23 |
+
if model_name not in relevant_models:
|
24 |
+
continue
|
25 |
+
model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
|
26 |
+
if os.path.isdir(model_dir):
|
27 |
+
model_output_file = os.path.join(
|
28 |
+
model_dir, "weighted_alpaca_eval_gpt4_turbo/annotations.json"
|
29 |
+
)
|
30 |
+
if os.path.exists(model_output_file):
|
31 |
+
df_response_judging = pd.concat(
|
32 |
+
[df_response_judging, pd.read_json(model_output_file)]
|
33 |
+
)
|
34 |
+
|
35 |
+
df_responses = pd.DataFrame()
|
36 |
+
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
|
37 |
+
if model_name not in relevant_models:
|
38 |
+
continue
|
39 |
+
model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
|
40 |
+
if os.path.isdir(model_dir):
|
41 |
+
model_output_file = os.path.join(model_dir, "model_outputs.json")
|
42 |
+
if os.path.exists(model_output_file):
|
43 |
+
df_responses = pd.concat([df_responses, pd.read_json(model_output_file)])
|
44 |
+
|
45 |
+
df_responses = df_responses.drop("all_generated_texts", axis=1)
|
46 |
+
df_responses = df_responses.drop("Unnamed: 0.1", axis=1)
|
47 |
+
df_responses = df_responses.drop("index", axis=1)
|
48 |
+
df_responses = df_responses.drop("Unnamed: 0", axis=1)
|
49 |
+
df_responses = df_responses.drop("scores", axis=1)
|
50 |
+
df_responses = df_responses.drop("all_results_idx_best", axis=1)
|
51 |
+
df_responses = df_responses.drop("original_output", axis=1)
|
52 |
+
df_responses = df_responses.drop("new_prompt", axis=1)
|
53 |
+
|
54 |
+
breakpoint()
|
55 |
+
|
56 |
+
# Whitelist.
|
57 |
+
|
58 |
+
|
59 |
+
df_response_judging.to_json(
|
60 |
+
"data/df_response_judging.jsonl", lines=True, orient="records"
|
61 |
+
)
|
62 |
+
df_responses.to_json("data/df_responses.jsonl", lines=True, orient="records")
|