natolambert committed on
Commit
8799e00
·
1 Parent(s): 4e61a96
Files changed (4) hide show
  1. app.py +70 -45
  2. src/constants.py +57 -0
  3. src/md.py +6 -0
  4. src/utils.py +5 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from datasets import load_dataset
6
  from src.utils import load_all_data
7
  from src.md import ABOUT_TEXT, TOP_TEXT
8
  from src.plt import plot_avg_correlation
 
9
  import numpy as np
10
 
11
  api = HfApi()
@@ -33,54 +34,34 @@ repo = snapshot_download(
33
  def avg_over_herm(dataframe):
34
  """
35
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
 
 
 
 
 
 
 
36
  """
37
  new_df = dataframe.copy()
38
- subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
39
- # for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
40
- for subset in subsets:
41
- if subset == "refusals":
42
- subset_cols = ["refusals-dangerous", "refusals-offensive", "donotanswer","xstest-should-refuse", "xstest-should-respond"]
43
- else:
44
- subset_cols = [col for col in new_df.columns if subset in col]
45
- new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
46
-
47
- keep_columns = ["model", "average"] + subsets
 
48
  new_df = new_df[keep_columns]
49
- # replace average column with new average
50
- new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
51
- # rename column "hep" to "hep (code)"
52
- new_df = new_df.rename(columns={"hep": "hep (code)"})
53
  return new_df
54
 
55
  def expand_subsets(dataframe):
56
  # TODO need to modify data/ script to do this
57
  pass
58
 
59
- # reference for length bias categories
60
- length_categories = {
61
- 'alpacaeval-easy': 'True',
62
- 'alpacaeval-hard': 'True',
63
- 'alpacaeval-length': 'Neutral',
64
- 'donotanswer': 'False',
65
- 'hep-cpp': 'Neutral',
66
- 'hep-go': 'Neutral',
67
- 'hep-java': 'Neutral',
68
- 'hep-js': 'Neutral',
69
- 'hep-python': 'Neutral',
70
- 'hep-rust': 'Neutral',
71
- 'llmbar-adver-GPTInst': 'False',
72
- 'llmbar-adver-GPTOut': 'Neutral',
73
- 'llmbar-adver-manual': 'False',
74
- 'llmbar-adver-neighbor': 'False',
75
- 'llmbar-natural': 'Neutral',
76
- 'mt-bench-easy': 'False',
77
- 'mt-bench-hard': 'False',
78
- 'mt-bench-med': 'Neutral',
79
- 'refusals-dangerous': 'False',
80
- 'refusals-offensive': 'False',
81
- 'xstest-should-refuse': 'False',
82
- 'xstest-should-respond': 'True'
83
- }
84
 
85
  def length_bias_check(dataframe):
86
  """
@@ -119,7 +100,7 @@ def length_bias_check(dataframe):
119
 
120
 
121
  herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
122
- herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
123
  herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
124
  prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
125
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
@@ -149,10 +130,23 @@ def random_sample(r: gr.Request, subset):
149
 
150
  subsets = eval_set.unique("subset")
151
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  with gr.Blocks() as app:
153
  # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
154
  with gr.Row():
155
  gr.Markdown(TOP_TEXT)
 
156
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
157
  with gr.TabItem("HERM Eval Set - Overview"):
158
  with gr.Row():
@@ -163,24 +157,45 @@ with gr.Blocks() as app:
163
  elem_id="herm_dataframe_avg",
164
  height=1000,
165
  )
 
 
 
 
 
 
 
166
  with gr.TabItem("HERM Eval Set - Detailed"):
167
  with gr.Row():
168
- herm_table = gr.Dataframe(
169
  herm_data.values,
170
  datatype=col_types_herm,
171
  headers=herm_data.columns.tolist(),
172
  elem_id="herm_dataframe",
173
  height=1000,
174
  )
 
 
 
 
 
 
 
175
  with gr.TabItem("HERM Eval Set - Length Bias"):
176
  with gr.Row():
177
- herm_table = gr.Dataframe(
178
  herm_data_length.values,
179
  datatype=cols_herm_data_length,
180
  headers=herm_data_length.columns.tolist(),
181
  elem_id="herm_dataframe_length",
182
  height=1000,
183
  )
 
 
 
 
 
 
 
184
  with gr.TabItem("Known Pref. Sets"):
185
  with gr.Row():
186
  PREF_SET_TEXT = """
@@ -195,6 +210,13 @@ with gr.Blocks() as app:
195
  elem_id="prefs_dataframe",
196
  height=1000,
197
  )
 
 
 
 
 
 
 
198
 
199
  with gr.TabItem("About"):
200
  with gr.Row():
@@ -216,6 +238,11 @@ with gr.Blocks() as app:
216
  # with gr.Row():
217
  # plot = plot_avg_correlation(herm_data_avg, prefs_data)
218
  # gr.Plot(plot)
 
 
 
 
 
219
 
220
  # Load data when app starts, TODO make this used somewhere...
221
  # def load_data_on_start():
@@ -231,6 +258,4 @@ with gr.Blocks() as app:
231
  scheduler = BackgroundScheduler()
232
  scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
233
  scheduler.start()
234
-
235
-
236
- app.queue().launch()
 
6
  from src.utils import load_all_data
7
  from src.md import ABOUT_TEXT, TOP_TEXT
8
  from src.plt import plot_avg_correlation
9
+ from src.constants import subset_mapping, length_categories, example_counts
10
  import numpy as np
11
 
12
  api = HfApi()
 
34
  def avg_over_herm(dataframe):
35
  """
36
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
37
+
38
+ We average over 4 core sections (per prompt weighting):
39
+ 1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
40
+ 2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
41
+ 3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
42
+ 4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
43
+
44
  """
45
  new_df = dataframe.copy()
46
+
47
+ # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
48
+ for subset, sub_subsets in subset_mapping.items():
49
+ subset_cols = [col for col in new_df.columns if col in sub_subsets]
50
+ sub_data = new_df[subset_cols].values # take the relevant column values
51
+ sub_counts = [example_counts[s] for s in sub_subsets] # take the example counts
52
+ new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2) # take the weighted average
53
+ # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
54
+
55
+ keep_columns = ["model",] + list(subset_mapping.keys())
56
+ # keep_columns = ["model", "average"] + subsets
57
  new_df = new_df[keep_columns]
58
+
 
 
 
59
  return new_df
60
 
61
  def expand_subsets(dataframe):
62
  # TODO need to modify data/ script to do this
63
  pass
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  def length_bias_check(dataframe):
67
  """
 
100
 
101
 
102
  herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
103
+ herm_data_avg = avg_over_herm(herm_data).sort_values(by='Chat', ascending=False)
104
  herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
105
  prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
106
  # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
 
130
 
131
  subsets = eval_set.unique("subset")
132
 
133
def regex_table(dataframe, regex):
    """
    Keep only the rows whose "model" value matches the search text.

    Args:
        dataframe: table with a "model" column of strings.
        regex: comma-separated regular expressions; a row is kept when its
            model name matches ANY of them (case-insensitive).

    Returns:
        The filtered dataframe. When the search text holds no pattern at all
        (empty, blanks, or only commas) the dataframe is returned unchanged.
    """
    import re  # local import so the block does not rely on a file-level import

    # Split on commas, trim whitespace, and drop empty pieces: an empty
    # alternative in "a|" would match every row.
    patterns = [part.strip() for part in (regex or "").split(",")]
    patterns = [part for part in patterns if part]
    if not patterns:
        return dataframe

    # Join the patterns with '|' acting as OR.
    combined_regex = "|".join(patterns)
    try:
        re.compile(combined_regex)
    except re.error:
        # Half-typed input such as "(" is not a valid regex; fall back to a
        # literal substring search rather than raising from the UI callback.
        combined_regex = "|".join(re.escape(part) for part in patterns)

    matches = dataframe["model"].str.contains(combined_regex, case=False, na=False)
    return dataframe[matches]
143
+
144
+
145
  with gr.Blocks() as app:
146
  # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
147
  with gr.Row():
148
  gr.Markdown(TOP_TEXT)
149
+ search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
150
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
151
  with gr.TabItem("HERM Eval Set - Overview"):
152
  with gr.Row():
 
157
  elem_id="herm_dataframe_avg",
158
  height=1000,
159
  )
160
+ # backup reference data
161
+ herm_table_hidden = gr.Dataframe(
162
+ herm_data_avg.values,
163
+ datatype=col_types_herm_avg,
164
+ headers=herm_data_avg.columns.tolist(),
165
+ visible=False,
166
+ )
167
  with gr.TabItem("HERM Eval Set - Detailed"):
168
  with gr.Row():
169
+ herm_table_detailed = gr.Dataframe(
170
  herm_data.values,
171
  datatype=col_types_herm,
172
  headers=herm_data.columns.tolist(),
173
  elem_id="herm_dataframe",
174
  height=1000,
175
  )
176
+ # backup
177
+ herm_table_detailed_hidden = gr.Dataframe(
178
+ herm_data.values,
179
+ datatype=col_types_herm,
180
+ headers=herm_data.columns.tolist(),
181
+ visible=False,
182
+ )
183
  with gr.TabItem("HERM Eval Set - Length Bias"):
184
  with gr.Row():
185
+ herm_table_len = gr.Dataframe(
186
  herm_data_length.values,
187
  datatype=cols_herm_data_length,
188
  headers=herm_data_length.columns.tolist(),
189
  elem_id="herm_dataframe_length",
190
  height=1000,
191
  )
192
+ # backup
193
+ herm_table_len_hidden = gr.Dataframe(
194
+ herm_data_length.values,
195
+ datatype=cols_herm_data_length,
196
+ headers=herm_data_length.columns.tolist(),
197
+ visible=False,
198
+ )
199
  with gr.TabItem("Known Pref. Sets"):
200
  with gr.Row():
201
  PREF_SET_TEXT = """
 
210
  elem_id="prefs_dataframe",
211
  height=1000,
212
  )
213
+ # backup
214
+ pref_sets_table_hidden = gr.Dataframe(
215
+ prefs_data.values,
216
+ datatype=col_types_prefs,
217
+ headers=prefs_data.columns.tolist(),
218
+ visible=False,
219
+ )
220
 
221
  with gr.TabItem("About"):
222
  with gr.Row():
 
238
  # with gr.Row():
239
  # plot = plot_avg_correlation(herm_data_avg, prefs_data)
240
  # gr.Plot(plot)
241
+
242
+ search.change(regex_table, inputs=[herm_table_hidden, search], outputs=herm_table)
243
+ search.change(regex_table, inputs=[herm_table_detailed_hidden, search], outputs=herm_table_detailed)
244
+ search.change(regex_table, inputs=[herm_table_len_hidden, search], outputs=herm_table_len)
245
+ search.change(regex_table, inputs=[pref_sets_table_hidden, search], outputs=pref_sets_table)
246
 
247
  # Load data when app starts, TODO make this used somewhere...
248
  # def load_data_on_start():
 
258
  scheduler = BackgroundScheduler()
259
  scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
260
  scheduler.start()
261
+ app.launch() # had .queue() before launch before... not sure if that's necessary
 
 
src/constants.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Reference table of length-bias categories, keyed by eval subset name.
# Each value is one of the string labels "True", "False" or "Neutral"
# (kept as strings, not booleans).
length_categories = {
    "alpacaeval-easy": "True",
    "alpacaeval-hard": "True",
    "alpacaeval-length": "Neutral",
    "donotanswer": "False",
    "hep-cpp": "Neutral",
    "hep-go": "Neutral",
    "hep-java": "Neutral",
    "hep-js": "Neutral",
    "hep-python": "Neutral",
    "hep-rust": "Neutral",
    "llmbar-adver-GPTInst": "False",
    "llmbar-adver-GPTOut": "Neutral",
    "llmbar-adver-manual": "False",
    "llmbar-adver-neighbor": "False",
    "llmbar-natural": "Neutral",
    "mt-bench-easy": "False",
    "mt-bench-hard": "False",
    "mt-bench-med": "Neutral",
    "refusals-dangerous": "False",
    "refusals-offensive": "False",
    "xstest-should-refuse": "False",
    "xstest-should-respond": "True",
}
26
+
27
# Number of examples in each eval subset, keyed by subset name.
example_counts = {
    "alpacaeval-easy": 100,
    "alpacaeval-length": 95,
    "alpacaeval-hard": 95,
    "mt-bench-easy": 28,
    "mt-bench-med": 40,
    "mt-bench-hard": 37,
    "refusals-dangerous": 100,
    "refusals-offensive": 100,
    "llmbar-natural": 100,
    "llmbar-adver-neighbor": 134,
    "llmbar-adver-GPTInst": 92,
    "llmbar-adver-GPTOut": 47,
    "llmbar-adver-manual": 46,
    "xstest-should-refuse": 250,
    "xstest-should-respond": 154,
    "donotanswer": 136,
    # every hep-* language split has the same count
    **{
        f"hep-{language}": 164
        for language in ("cpp", "go", "java", "js", "python", "rust")
    },
}
51
+
52
# Section name -> list of the eval subsets that make up that section.
subset_mapping = {
    "Chat": [
        "alpacaeval-easy",
        "alpacaeval-length",
        "alpacaeval-hard",
        "mt-bench-easy",
        "mt-bench-med",
    ],
    "Chat Hard": [
        "mt-bench-hard",
        "llmbar-natural",
        "llmbar-adver-neighbor",
        "llmbar-adver-GPTInst",
        "llmbar-adver-GPTOut",
        "llmbar-adver-manual",
    ],
    "Safety": [
        "refusals-dangerous",
        "refusals-offensive",
        "xstest-should-refuse",
        "xstest-should-respond",
        "donotanswer",
    ],
    "Code": [
        "hep-cpp",
        "hep-go",
        "hep-java",
        "hep-js",
        "hep-python",
        "hep-rust",
    ],
}
src/md.py CHANGED
@@ -2,6 +2,12 @@ ABOUT_TEXT = """
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
 
 
 
 
 
 
5
  ## Subset Summary
6
 
7
  Total number of the prompts is: 2538, filtered from 4676.
 
2
  We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
3
  A win is when the score for the chosen response is higher than the score for the rejected response.
4
 
5
+ We average over 4 core sections (per prompt weighting):
6
+ 1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-med)
7
+ 2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
8
+ 3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, donotanswer)
9
+ 4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
10
+
11
  ## Subset Summary
12
 
13
  Total number of the prompts is: 2538, filtered from 4676.
src/utils.py CHANGED
@@ -61,6 +61,11 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
61
  # select all columns except "model"
62
  cols = df.columns.tolist()
63
  cols.remove("model")
 
 
 
 
 
64
  # round
65
  df[cols] = df[cols].round(2)
66
  avg = np.nanmean(df[cols].values,axis=1).round(2)
 
61
  # select all columns except "model"
62
  cols = df.columns.tolist()
63
  cols.remove("model")
64
+ # remove model_beaker from dataframe
65
+ if "model_beaker" in cols:
66
+ cols.remove("model_beaker")
67
+ df = df.drop(columns=["model_beaker"])
68
+
69
  # round
70
  df[cols] = df[cols].round(2)
71
  avg = np.nanmean(df[cols].values,axis=1).round(2)