natolambert committed on
Commit
6fda62c
·
1 Parent(s): 054ed2d
Files changed (2) hide show
  1. app.py +12 -11
  2. src/md.py +2 -2
app.py CHANGED
@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
42
  2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
43
  3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
44
  4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
45
- 5. Test Sets: Includes the test sets (anthropic_helpful, mtbench_gpt4, shp, summarize)
46
  """
47
  new_df = dataframe_core.copy()
48
  dataframe_prefs = dataframe_prefs.copy()
@@ -61,28 +61,28 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
61
  new_df = new_df[keep_columns]
62
 
63
  # selected average from pref_sets
64
- pref_columns = ["anthropic_helpful", "mtbench_gpt4", "shp", "summarize"]
65
  pref_data = dataframe_prefs[pref_columns].values
66
 
67
  # add column test sets knowing the rows are not identical, take superset
68
- dataframe_prefs["Test Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)
69
 
70
  # add column Test Sets empty to new_df
71
- new_df["Test Sets"] = np.nan
72
- # per row in new_df if model is in dataframe_prefs, add the value to new_df["Test Sets"]
73
  values = []
74
  for i, row in new_df.iterrows():
75
  model = row["model"]
76
  if model in dataframe_prefs["model"].values:
77
- values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0])
78
- # new_df.at[i, "Test Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Test Sets"].values[0]
79
  else:
80
  values.append(np.nan)
81
 
82
- new_df["Test Sets"] = values
83
 
84
  # add total average
85
- data_cols += ["Test Sets"]
86
  new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)
87
 
88
  # make average third column
@@ -280,7 +280,7 @@ with gr.Blocks(css=custom_css) as app:
280
  # elem_id="rewardbench_dataframe_length",
281
  # height=1000,
282
  # )
283
- with gr.TabItem("Existing Test Sets"):
284
  with gr.Row():
285
  search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
286
  model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
@@ -318,7 +318,8 @@ with gr.Blocks(css=custom_css) as app:
318
  with gr.TabItem("Dataset Viewer"):
319
  with gr.Row():
320
  # loads one sample
321
- gr.Markdown("## Random Dataset Sample Viewer")
 
322
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
323
  button = gr.Button("Show Random Sample")
324
 
 
42
  2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
43
  3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
44
  4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
45
+ 5. Classic Sets: Includes the test sets (anthropic_helpful, anthropic_hhh, mtbench_human, shp, summarize)
46
  """
47
  new_df = dataframe_core.copy()
48
  dataframe_prefs = dataframe_prefs.copy()
 
61
  new_df = new_df[keep_columns]
62
 
63
  # selected average from pref_sets
64
+ pref_columns = ["anthropic_helpful", "anthropic_hhh", "mtbench_human", "shp", "summarize"]
65
  pref_data = dataframe_prefs[pref_columns].values
66
 
67
  # add column test sets knowing the rows are not identical, take superset
68
+ dataframe_prefs["Classic Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)
69
 
70
  # add column Classic Sets empty to new_df
71
+ new_df["Classic Sets"] = np.nan
72
+ # per row in new_df if model is in dataframe_prefs, add the value to new_df["Classic Sets"]
73
  values = []
74
  for i, row in new_df.iterrows():
75
  model = row["model"]
76
  if model in dataframe_prefs["model"].values:
77
+ values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0])
78
+ # new_df.at[i, "Classic Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0]
79
  else:
80
  values.append(np.nan)
81
 
82
+ new_df["Classic Sets"] = values
83
 
84
  # add total average
85
+ data_cols += ["Classic Sets"]
86
  new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)
87
 
88
  # make average third column
 
280
  # elem_id="rewardbench_dataframe_length",
281
  # height=1000,
282
  # )
283
+ with gr.TabItem("Classic Sets"):
284
  with gr.Row():
285
  search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
286
  model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
 
318
  with gr.TabItem("Dataset Viewer"):
319
  with gr.Row():
320
  # loads one sample
321
+ gr.Markdown("""## Random Dataset Sample Viewer
322
+ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
323
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
324
  button = gr.Button("Show Random Sample")
325
 
src/md.py CHANGED
@@ -9,7 +9,7 @@ We average over 4 core sections (per prompt weighting):
9
  2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
10
  3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
11
  4. **Code**: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
12
- 5. **Test Sets**: Includes the test sets (anthropic_helpful, mtbench_gpt4, shp, summarize)
13
 
14
  We include multiple types of reward models in this evaluation:
15
  1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
@@ -82,5 +82,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
82
  TOP_TEXT = """
83
  # RewardBench: Benchmarking Reward Models
84
  ### Evaluating the capabilities, safety, and pitfalls of reward models
85
- [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
86
  """
 
9
  2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
10
  3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
11
  4. **Code**: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
12
+ 5. **Classic Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))
13
 
14
  We include multiple types of reward models in this evaluation:
15
  1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
 
82
  TOP_TEXT = """
83
  # RewardBench: Benchmarking Reward Models
84
  ### Evaluating the capabilities, safety, and pitfalls of reward models
85
+ [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Classic Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
86
  """