Spaces:
Running
Running
natolambert
committed on
Commit
·
31bff5a
1
Parent(s):
6b2b055
major improvements
Browse files
README.md
CHANGED
@@ -6,6 +6,7 @@ colorTo: blue
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.12.0
|
8 |
app_file: app.py
|
|
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
---
|
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.12.0
|
8 |
app_file: app.py
|
9 |
+
header: mini
|
10 |
pinned: false
|
11 |
license: apache-2.0
|
12 |
---
|
app.py
CHANGED
@@ -12,17 +12,17 @@ import numpy as np
|
|
12 |
api = HfApi()
|
13 |
|
14 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
15 |
-
evals_repo = "
|
16 |
|
17 |
-
eval_set_repo = "
|
18 |
-
|
19 |
|
20 |
def restart_space():
|
21 |
-
api.restart_space(repo_id="
|
22 |
|
23 |
print("Pulling evaluation results")
|
24 |
repo = snapshot_download(
|
25 |
-
local_dir=
|
26 |
ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
|
27 |
repo_id=evals_repo,
|
28 |
use_auth_token=COLLAB_TOKEN,
|
@@ -32,7 +32,7 @@ repo = snapshot_download(
|
|
32 |
)
|
33 |
|
34 |
|
35 |
-
def
|
36 |
"""
|
37 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
38 |
|
@@ -96,7 +96,7 @@ def expand_subsets(dataframe):
|
|
96 |
|
97 |
def length_bias_check(dataframe):
|
98 |
"""
|
99 |
-
Takes the raw
|
100 |
Then, take the average of the three buckets as "average"
|
101 |
"""
|
102 |
new_df = dataframe.copy()
|
@@ -130,16 +130,16 @@ def length_bias_check(dataframe):
|
|
130 |
|
131 |
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
prefs_data = load_all_data(
|
136 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
137 |
|
138 |
-
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
|
144 |
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
|
145 |
|
@@ -170,72 +170,105 @@ def regex_table(dataframe, regex, filter_button):
|
|
170 |
regex_list = [x.strip() for x in regex.split(",")]
|
171 |
# Join the list into a single regex pattern with '|' acting as OR
|
172 |
combined_regex = '|'.join(regex_list)
|
|
|
173 |
# if filter_button, remove all rows with "ai2" in the model name
|
174 |
-
if (
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
177 |
return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
|
178 |
|
179 |
|
180 |
with gr.Blocks() as app:
|
181 |
-
# create tabs for the app, moving the current table to one titled "
|
182 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
with gr.Column(scale=3):
|
184 |
gr.Markdown(TOP_TEXT)
|
185 |
-
with gr.Column(scale=2):
|
186 |
-
search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
187 |
-
filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
188 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
189 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
with gr.Row():
|
191 |
# reference data
|
192 |
-
|
193 |
-
|
194 |
-
datatype=
|
195 |
-
headers=
|
196 |
visible=False,
|
197 |
)
|
198 |
-
|
199 |
-
regex_table(
|
200 |
-
datatype=
|
201 |
-
headers=
|
202 |
-
elem_id="
|
203 |
height=1000,
|
204 |
)
|
205 |
|
206 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
with gr.Row():
|
208 |
# ref data
|
209 |
-
|
210 |
-
|
211 |
-
datatype=
|
212 |
-
headers=
|
213 |
visible=False,
|
214 |
)
|
215 |
-
|
216 |
-
regex_table(
|
217 |
-
datatype=
|
218 |
-
headers=
|
219 |
-
elem_id="
|
220 |
height=1000,
|
221 |
)
|
222 |
-
with gr.TabItem("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
with gr.Row():
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
)
|
231 |
-
herm_table_len = gr.Dataframe(
|
232 |
-
regex_table(herm_data_length.copy(), "", False).values,
|
233 |
-
datatype=cols_herm_data_length,
|
234 |
-
headers=herm_data_length.columns.tolist(),
|
235 |
-
elem_id="herm_dataframe_length",
|
236 |
-
height=1000,
|
237 |
-
)
|
238 |
-
with gr.TabItem("Known Pref. Sets"):
|
239 |
with gr.Row():
|
240 |
PREF_SET_TEXT = """
|
241 |
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
|
@@ -250,7 +283,7 @@ with gr.Blocks() as app:
|
|
250 |
visible=False,
|
251 |
)
|
252 |
pref_sets_table = gr.Dataframe(
|
253 |
-
regex_table(prefs_data.copy(), "",
|
254 |
datatype=col_types_prefs,
|
255 |
headers=prefs_data.columns.tolist(),
|
256 |
elem_id="prefs_dataframe",
|
@@ -276,21 +309,25 @@ with gr.Blocks() as app:
|
|
276 |
# removed plot because not pretty enough
|
277 |
# with gr.TabItem("Model Correlation"):
|
278 |
# with gr.Row():
|
279 |
-
# plot = plot_avg_correlation(
|
280 |
# gr.Plot(plot)
|
281 |
|
282 |
-
|
283 |
-
|
284 |
-
search.change(regex_table, inputs=[
|
285 |
-
|
286 |
-
|
|
|
|
|
|
|
|
|
287 |
# Load data when app starts, TODO make this used somewhere...
|
288 |
# def load_data_on_start():
|
289 |
-
#
|
290 |
-
#
|
291 |
|
292 |
-
#
|
293 |
-
#
|
294 |
|
295 |
# data_prefs = load_all_data(repo_dir_prefs)
|
296 |
# pref_sets_table.update(data_prefs)
|
|
|
12 |
api = HfApi()
|
13 |
|
14 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
15 |
+
evals_repo = "allenai/reward-bench-results"
|
16 |
|
17 |
+
eval_set_repo = "allenai/reward-bench"
|
18 |
+
repo_dir_rewardbench = "./evals/rewardbench/"
|
19 |
|
20 |
def restart_space():
|
21 |
+
api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
|
22 |
|
23 |
print("Pulling evaluation results")
|
24 |
repo = snapshot_download(
|
25 |
+
local_dir=repo_dir_rewardbench,
|
26 |
ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
|
27 |
repo_id=evals_repo,
|
28 |
use_auth_token=COLLAB_TOKEN,
|
|
|
32 |
)
|
33 |
|
34 |
|
35 |
+
def avg_over_rewardbench(dataframe_core, dataframe_prefs):
|
36 |
"""
|
37 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
38 |
|
|
|
96 |
|
97 |
def length_bias_check(dataframe):
|
98 |
"""
|
99 |
+
Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
|
100 |
Then, take the average of the three buckets as "average"
|
101 |
"""
|
102 |
new_df = dataframe.copy()
|
|
|
130 |
|
131 |
|
132 |
|
133 |
+
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
|
134 |
+
rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
|
135 |
+
prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
|
136 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
137 |
|
138 |
+
rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
|
139 |
|
140 |
+
col_types_rewardbench = ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
|
141 |
+
col_types_rewardbench_avg = ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
|
142 |
+
cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
|
143 |
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
|
144 |
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
|
145 |
|
|
|
170 |
regex_list = [x.strip() for x in regex.split(",")]
|
171 |
# Join the list into a single regex pattern with '|' acting as OR
|
172 |
combined_regex = '|'.join(regex_list)
|
173 |
+
|
174 |
# if filter_button, remove all rows with "ai2" in the model name
|
175 |
+
if isinstance(filter_button, list) or isinstance(filter_button, str):
|
176 |
+
if "AI2 Experiments" not in filter_button and ("ai2" not in regex):
|
177 |
+
dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]
|
178 |
+
if "Seq. Classifiers" not in filter_button:
|
179 |
+
dataframe = dataframe[~dataframe["model_type"].str.contains("Seq. Classifier", case=False, na=False)]
|
180 |
+
if "DPO" not in filter_button:
|
181 |
+
dataframe = dataframe[~dataframe["model_type"].str.contains("DPO", case=False, na=False)]
|
182 |
+
if "Custom Classifiers" not in filter_button:
|
183 |
+
dataframe = dataframe[~dataframe["model_type"].str.contains("Custom Classifier", case=False, na=False)]
|
184 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
185 |
return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
|
186 |
|
187 |
|
188 |
with gr.Blocks() as app:
|
189 |
+
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
190 |
with gr.Row():
|
191 |
+
with gr.Column(scale=2.2):
|
192 |
+
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
193 |
+
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
194 |
+
# img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
|
195 |
+
gr.Markdown("""
|
196 |
+
![](file/src/logo.png)
|
197 |
+
""")
|
198 |
with gr.Column(scale=3):
|
199 |
gr.Markdown(TOP_TEXT)
|
|
|
|
|
|
|
200 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
201 |
+
with gr.TabItem("🏆 RewardBench Leaderboard"):
|
202 |
+
with gr.Row():
|
203 |
+
search_1 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
204 |
+
model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
205 |
+
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
206 |
+
label="Model Types",
|
207 |
+
# info="Which model types to include.",
|
208 |
+
)
|
209 |
with gr.Row():
|
210 |
# reference data
|
211 |
+
rewardbench_table_hidden = gr.Dataframe(
|
212 |
+
rewardbench_data_avg.values,
|
213 |
+
datatype=col_types_rewardbench_avg,
|
214 |
+
headers=rewardbench_data_avg.columns.tolist(),
|
215 |
visible=False,
|
216 |
)
|
217 |
+
rewardbench_table = gr.Dataframe(
|
218 |
+
regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
|
219 |
+
datatype=col_types_rewardbench_avg,
|
220 |
+
headers=rewardbench_data_avg.columns.tolist(),
|
221 |
+
elem_id="rewardbench_dataframe_avg",
|
222 |
height=1000,
|
223 |
)
|
224 |
|
225 |
+
with gr.TabItem("🔍 RewardBench - Detailed"):
|
226 |
+
with gr.Row():
|
227 |
+
search_2 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
228 |
+
model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
229 |
+
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
230 |
+
label="Model Types",
|
231 |
+
# info="Which model types to include."
|
232 |
+
)
|
233 |
with gr.Row():
|
234 |
# ref data
|
235 |
+
rewardbench_table_detailed_hidden = gr.Dataframe(
|
236 |
+
rewardbench_data.values,
|
237 |
+
datatype=col_types_rewardbench,
|
238 |
+
headers=rewardbench_data.columns.tolist(),
|
239 |
visible=False,
|
240 |
)
|
241 |
+
rewardbench_table_detailed = gr.Dataframe(
|
242 |
+
regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
|
243 |
+
datatype=col_types_rewardbench,
|
244 |
+
headers=rewardbench_data.columns.tolist(),
|
245 |
+
elem_id="rewardbench_dataframe",
|
246 |
height=1000,
|
247 |
)
|
248 |
+
# with gr.TabItem("rewardbench Eval Set - Length Bias"):
|
249 |
+
# with gr.Row():
|
250 |
+
# # backup
|
251 |
+
# rewardbench_table_len_hidden = gr.Dataframe(
|
252 |
+
# rewardbench_data_length.values,
|
253 |
+
# datatype=cols_rewardbench_data_length,
|
254 |
+
# headers=rewardbench_data_length.columns.tolist(),
|
255 |
+
# visible=False,
|
256 |
+
# )
|
257 |
+
# rewardbench_table_len = gr.Dataframe(
|
258 |
+
# regex_table(rewardbench_data_length.copy(), "", False).values,
|
259 |
+
# datatype=cols_rewardbench_data_length,
|
260 |
+
# headers=rewardbench_data_length.columns.tolist(),
|
261 |
+
# elem_id="rewardbench_dataframe_length",
|
262 |
+
# height=1000,
|
263 |
+
# )
|
264 |
+
with gr.TabItem("Existing Test Sets"):
|
265 |
with gr.Row():
|
266 |
+
search_3 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
267 |
+
model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
268 |
+
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
269 |
+
label="Model Types",
|
270 |
+
# info="Which model types to include.",
|
271 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
with gr.Row():
|
273 |
PREF_SET_TEXT = """
|
274 |
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
|
|
|
283 |
visible=False,
|
284 |
)
|
285 |
pref_sets_table = gr.Dataframe(
|
286 |
+
regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
|
287 |
datatype=col_types_prefs,
|
288 |
headers=prefs_data.columns.tolist(),
|
289 |
elem_id="prefs_dataframe",
|
|
|
309 |
# removed plot because not pretty enough
|
310 |
# with gr.TabItem("Model Correlation"):
|
311 |
# with gr.Row():
|
312 |
+
# plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
|
313 |
# gr.Plot(plot)
|
314 |
|
315 |
+
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
316 |
+
search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
317 |
+
# search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
|
318 |
+
search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
|
319 |
+
|
320 |
+
model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
321 |
+
model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
322 |
+
model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
|
323 |
+
|
324 |
# Load data when app starts, TODO make this used somewhere...
|
325 |
# def load_data_on_start():
|
326 |
+
# data_rewardbench = load_all_data(repo_dir_rewardbench)
|
327 |
+
# rewardbench_table.update(data_rewardbench)
|
328 |
|
329 |
+
# data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
|
330 |
+
# rewardbench_table.update(data_rewardbench_avg)
|
331 |
|
332 |
# data_prefs = load_all_data(repo_dir_prefs)
|
333 |
# pref_sets_table.update(data_prefs)
|
src/logo.png
ADDED
src/md.py
CHANGED
@@ -78,9 +78,11 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
|
|
78 |
"""
|
79 |
|
80 |
TOP_TEXT = """
|
81 |
-
#
|
82 |
|
83 |
Evaluating the capabilities, safety, and pitfalls of reward models.
|
84 |
|
85 |
[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
|
|
|
|
|
86 |
"""
|
|
|
78 |
"""
|
79 |
|
80 |
TOP_TEXT = """
|
81 |
+
# RewardBench from AI2
|
82 |
|
83 |
Evaluating the capabilities, safety, and pitfalls of reward models.
|
84 |
|
85 |
[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
|
86 |
+
|
87 |
+
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
88 |
"""
|