natolambert committed on
Commit
31bff5a
·
1 Parent(s): 6b2b055

major improvements

Browse files
Files changed (4) hide show
  1. README.md +1 -0
  2. app.py +104 -67
  3. src/logo.png +0 -0
  4. src/md.py +3 -1
README.md CHANGED
@@ -6,6 +6,7 @@ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 4.12.0
8
  app_file: app.py
 
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
6
  sdk: gradio
7
  sdk_version: 4.12.0
8
  app_file: app.py
9
+ header: mini
10
  pinned: false
11
  license: apache-2.0
12
  ---
app.py CHANGED
@@ -12,17 +12,17 @@ import numpy as np
12
  api = HfApi()
13
 
14
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
15
- evals_repo = "ai2-adapt-dev/HERM-Results"
16
 
17
- eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
18
- repo_dir_herm = "./evals/herm/"
19
 
20
  def restart_space():
21
- api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
22
 
23
  print("Pulling evaluation results")
24
  repo = snapshot_download(
25
- local_dir=repo_dir_herm,
26
  ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
27
  repo_id=evals_repo,
28
  use_auth_token=COLLAB_TOKEN,
@@ -32,7 +32,7 @@ repo = snapshot_download(
32
  )
33
 
34
 
35
- def avg_over_herm(dataframe_core, dataframe_prefs):
36
  """
37
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
38
 
@@ -96,7 +96,7 @@ def expand_subsets(dataframe):
96
 
97
  def length_bias_check(dataframe):
98
  """
99
- Takes the raw herm dataframe and splits the data into new buckets according to length_categories.
100
  Then, take the average of the three buckets as "average"
101
  """
102
  new_df = dataframe.copy()
@@ -130,16 +130,16 @@ def length_bias_check(dataframe):
130
 
131
 
132
 
133
- herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
134
- herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
135
- prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
136
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
137
 
138
- herm_data_avg = avg_over_herm(herm_data, prefs_data).sort_values(by='average', ascending=False)
139
 
140
- col_types_herm = ["markdown"] + ["str"] + ["number"] * (len(herm_data.columns) - 1)
141
- col_types_herm_avg = ["markdown"]+ ["str"] + ["number"] * (len(herm_data_avg.columns) - 1)
142
- cols_herm_data_length = ["markdown"] + ["number"] * (len(herm_data_length.columns) - 1)
143
  col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
144
  # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
145
 
@@ -170,72 +170,105 @@ def regex_table(dataframe, regex, filter_button):
170
  regex_list = [x.strip() for x in regex.split(",")]
171
  # Join the list into a single regex pattern with '|' acting as OR
172
  combined_regex = '|'.join(regex_list)
 
173
  # if filter_button, remove all rows with "ai2" in the model name
174
- if (not filter_button) and ("ai2" not in regex):
175
- dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]
 
 
 
 
 
 
 
176
  # Filter the dataframe such that 'model' contains any of the regex patterns
177
  return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
178
 
179
 
180
  with gr.Blocks() as app:
181
- # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
182
  with gr.Row():
 
 
 
 
 
 
 
183
  with gr.Column(scale=3):
184
  gr.Markdown(TOP_TEXT)
185
- with gr.Column(scale=2):
186
- search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
187
- filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
188
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
189
- with gr.TabItem("HERM Eval Set - Overview"):
 
 
 
 
 
 
 
190
  with gr.Row():
191
  # reference data
192
- herm_table_hidden = gr.Dataframe(
193
- herm_data_avg.values,
194
- datatype=col_types_herm_avg,
195
- headers=herm_data_avg.columns.tolist(),
196
  visible=False,
197
  )
198
- herm_table = gr.Dataframe(
199
- regex_table(herm_data_avg.copy(), "", False).values,
200
- datatype=col_types_herm_avg,
201
- headers=herm_data_avg.columns.tolist(),
202
- elem_id="herm_dataframe_avg",
203
  height=1000,
204
  )
205
 
206
- with gr.TabItem("HERM Eval Set - Detailed"):
 
 
 
 
 
 
 
207
  with gr.Row():
208
  # ref data
209
- herm_table_detailed_hidden = gr.Dataframe(
210
- herm_data.values,
211
- datatype=col_types_herm,
212
- headers=herm_data.columns.tolist(),
213
  visible=False,
214
  )
215
- herm_table_detailed = gr.Dataframe(
216
- regex_table(herm_data.copy(), "", False).values,
217
- datatype=col_types_herm,
218
- headers=herm_data.columns.tolist(),
219
- elem_id="herm_dataframe",
220
  height=1000,
221
  )
222
- with gr.TabItem("HERM Eval Set - Length Bias"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  with gr.Row():
224
- # backup
225
- herm_table_len_hidden = gr.Dataframe(
226
- herm_data_length.values,
227
- datatype=cols_herm_data_length,
228
- headers=herm_data_length.columns.tolist(),
229
- visible=False,
230
- )
231
- herm_table_len = gr.Dataframe(
232
- regex_table(herm_data_length.copy(), "", False).values,
233
- datatype=cols_herm_data_length,
234
- headers=herm_data_length.columns.tolist(),
235
- elem_id="herm_dataframe_length",
236
- height=1000,
237
- )
238
- with gr.TabItem("Known Pref. Sets"):
239
  with gr.Row():
240
  PREF_SET_TEXT = """
241
  For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
@@ -250,7 +283,7 @@ with gr.Blocks() as app:
250
  visible=False,
251
  )
252
  pref_sets_table = gr.Dataframe(
253
- regex_table(prefs_data.copy(), "", False).values,
254
  datatype=col_types_prefs,
255
  headers=prefs_data.columns.tolist(),
256
  elem_id="prefs_dataframe",
@@ -276,21 +309,25 @@ with gr.Blocks() as app:
276
  # removed plot because not pretty enough
277
  # with gr.TabItem("Model Correlation"):
278
  # with gr.Row():
279
- # plot = plot_avg_correlation(herm_data_avg, prefs_data)
280
  # gr.Plot(plot)
281
 
282
- search.change(regex_table, inputs=[herm_table_hidden, search, filter_button], outputs=herm_table)
283
- search.change(regex_table, inputs=[herm_table_detailed_hidden, search, filter_button], outputs=herm_table_detailed)
284
- search.change(regex_table, inputs=[herm_table_len_hidden, search, filter_button], outputs=herm_table_len)
285
- search.change(regex_table, inputs=[pref_sets_table_hidden, search, filter_button], outputs=pref_sets_table)
286
-
 
 
 
 
287
  # Load data when app starts, TODO make this used somewhere...
288
  # def load_data_on_start():
289
- # data_herm = load_all_data(repo_dir_herm)
290
- # herm_table.update(data_herm)
291
 
292
- # data_herm_avg = avg_over_herm(repo_dir_herm)
293
- # herm_table.update(data_herm_avg)
294
 
295
  # data_prefs = load_all_data(repo_dir_prefs)
296
  # pref_sets_table.update(data_prefs)
 
12
  api = HfApi()
13
 
14
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
15
+ evals_repo = "allenai/reward-bench-results"
16
 
17
+ eval_set_repo = "allenai/reward-bench"
18
+ repo_dir_rewardbench = "./evals/rewardbench/"
19
 
20
  def restart_space():
21
+ api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
22
 
23
  print("Pulling evaluation results")
24
  repo = snapshot_download(
25
+ local_dir=repo_dir_rewardbench,
26
  ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
27
  repo_id=evals_repo,
28
  use_auth_token=COLLAB_TOKEN,
 
32
  )
33
 
34
 
35
+ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
36
  """
37
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
38
 
 
96
 
97
  def length_bias_check(dataframe):
98
  """
99
+ Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
100
  Then, take the average of the three buckets as "average"
101
  """
102
  new_df = dataframe.copy()
 
130
 
131
 
132
 
133
+ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
134
+ rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
135
+ prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
136
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
137
 
138
+ rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
139
 
140
+ col_types_rewardbench = ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
141
+ col_types_rewardbench_avg = ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
142
+ cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
143
  col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
144
  # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
145
 
 
170
  regex_list = [x.strip() for x in regex.split(",")]
171
  # Join the list into a single regex pattern with '|' acting as OR
172
  combined_regex = '|'.join(regex_list)
173
+
174
  # if filter_button, remove all rows with "ai2" in the model name
175
+ if isinstance(filter_button, list) or isinstance(filter_button, str):
176
+ if "AI2 Experiments" not in filter_button and ("ai2" not in regex):
177
+ dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]
178
+ if "Seq. Classifiers" not in filter_button:
179
+ dataframe = dataframe[~dataframe["model_type"].str.contains("Seq. Classifier", case=False, na=False)]
180
+ if "DPO" not in filter_button:
181
+ dataframe = dataframe[~dataframe["model_type"].str.contains("DPO", case=False, na=False)]
182
+ if "Custom Classifiers" not in filter_button:
183
+ dataframe = dataframe[~dataframe["model_type"].str.contains("Custom Classifier", case=False, na=False)]
184
  # Filter the dataframe such that 'model' contains any of the regex patterns
185
  return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
186
 
187
 
188
  with gr.Blocks() as app:
189
+ # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
190
  with gr.Row():
191
+ with gr.Column(scale=2.2):
192
+ # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
193
+ # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
194
+ # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
195
+ gr.Markdown("""
196
+ ![](file/src/logo.png)
197
+ """)
198
  with gr.Column(scale=3):
199
  gr.Markdown(TOP_TEXT)
 
 
 
200
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
201
+ with gr.TabItem("🏆 RewardBench Leaderboard"):
202
+ with gr.Row():
203
+ search_1 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
204
+ model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
205
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
206
+ label="Model Types",
207
+ # info="Which model types to include.",
208
+ )
209
  with gr.Row():
210
  # reference data
211
+ rewardbench_table_hidden = gr.Dataframe(
212
+ rewardbench_data_avg.values,
213
+ datatype=col_types_rewardbench_avg,
214
+ headers=rewardbench_data_avg.columns.tolist(),
215
  visible=False,
216
  )
217
+ rewardbench_table = gr.Dataframe(
218
+ regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
219
+ datatype=col_types_rewardbench_avg,
220
+ headers=rewardbench_data_avg.columns.tolist(),
221
+ elem_id="rewardbench_dataframe_avg",
222
  height=1000,
223
  )
224
 
225
+ with gr.TabItem("🔍 RewardBench - Detailed"):
226
+ with gr.Row():
227
+ search_2 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
228
+ model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
229
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
230
+ label="Model Types",
231
+ # info="Which model types to include."
232
+ )
233
  with gr.Row():
234
  # ref data
235
+ rewardbench_table_detailed_hidden = gr.Dataframe(
236
+ rewardbench_data.values,
237
+ datatype=col_types_rewardbench,
238
+ headers=rewardbench_data.columns.tolist(),
239
  visible=False,
240
  )
241
+ rewardbench_table_detailed = gr.Dataframe(
242
+ regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
243
+ datatype=col_types_rewardbench,
244
+ headers=rewardbench_data.columns.tolist(),
245
+ elem_id="rewardbench_dataframe",
246
  height=1000,
247
  )
248
+ # with gr.TabItem("rewardbench Eval Set - Length Bias"):
249
+ # with gr.Row():
250
+ # # backup
251
+ # rewardbench_table_len_hidden = gr.Dataframe(
252
+ # rewardbench_data_length.values,
253
+ # datatype=cols_rewardbench_data_length,
254
+ # headers=rewardbench_data_length.columns.tolist(),
255
+ # visible=False,
256
+ # )
257
+ # rewardbench_table_len = gr.Dataframe(
258
+ # regex_table(rewardbench_data_length.copy(), "", False).values,
259
+ # datatype=cols_rewardbench_data_length,
260
+ # headers=rewardbench_data_length.columns.tolist(),
261
+ # elem_id="rewardbench_dataframe_length",
262
+ # height=1000,
263
+ # )
264
+ with gr.TabItem("Existing Test Sets"):
265
  with gr.Row():
266
+ search_3 = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
267
+ model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
268
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
269
+ label="Model Types",
270
+ # info="Which model types to include.",
271
+ )
 
 
 
 
 
 
 
 
 
272
  with gr.Row():
273
  PREF_SET_TEXT = """
274
  For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
 
283
  visible=False,
284
  )
285
  pref_sets_table = gr.Dataframe(
286
+ regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
287
  datatype=col_types_prefs,
288
  headers=prefs_data.columns.tolist(),
289
  elem_id="prefs_dataframe",
 
309
  # removed plot because not pretty enough
310
  # with gr.TabItem("Model Correlation"):
311
  # with gr.Row():
312
+ # plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
313
  # gr.Plot(plot)
314
 
315
+ search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
316
+ search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
317
+ # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
318
+ search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
319
+
320
+ model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
321
+ model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
322
+ model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
323
+
324
  # Load data when app starts, TODO make this used somewhere...
325
  # def load_data_on_start():
326
+ # data_rewardbench = load_all_data(repo_dir_rewardbench)
327
+ # rewardbench_table.update(data_rewardbench)
328
 
329
+ # data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
330
+ # rewardbench_table.update(data_rewardbench_avg)
331
 
332
  # data_prefs = load_all_data(repo_dir_prefs)
333
  # pref_sets_table.update(data_prefs)
src/logo.png ADDED
src/md.py CHANGED
@@ -78,9 +78,11 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
78
  """
79
 
80
  TOP_TEXT = """
81
- # Holistic Evaluation of Reward Models (HERM) from AI2
82
 
83
  Evaluating the capabilities, safety, and pitfalls of reward models.
84
 
85
  [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
 
 
86
  """
 
78
  """
79
 
80
  TOP_TEXT = """
81
+ # RewardBench from AI2
82
 
83
  Evaluating the capabilities, safety, and pitfalls of reward models.
84
 
85
  [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
86
+
87
+ All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
88
  """