|
import os
import re

import gradio as gr
import numpy as np
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from huggingface_hub import HfApi, snapshot_download

from src.constants import example_counts, length_categories, subset_mapping
from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
from src.utils import load_all_data
|
|
|
# Hugging Face Hub client used by restart_space().
api = HfApi()

# Access token read from the environment; None when the variable is unset.
COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")

# Hub repository ids: stored evaluation results, and the evaluation prompts.
evals_repo = "ai2-adapt-dev/HERM-Results"
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"

# Local directory that receives the downloaded results snapshot.
repo_dir_herm = "./evals/herm/"
|
|
|
def restart_space():
    """Ask the Hugging Face Hub to restart the ``rm-benchmark-viewer`` Space.

    Authenticates with ``COLLAB_TOKEN``; returns nothing.
    """
    space_id = "ai2-adapt-dev/rm-benchmark-viewer"
    api.restart_space(repo_id=space_id, token=COLLAB_TOKEN)
|
|
|
print("Pulling evaluation results")

# Download a snapshot of the results dataset into ``repo_dir_herm`` at import
# time, skipping the per-example score folders. ``token`` is the documented
# authentication parameter (``use_auth_token`` is its legacy alias) and matches
# how restart_space() authenticates.
repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir_herm,
    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
    token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
)
|
|
|
|
|
def avg_over_herm(dataframe_core, dataframe_prefs, *, sections=None, counts=None):
    """Collapse per-subset scores into per-section scores plus an overall average.

    Each section score is the average of its member subset columns, weighted by
    the number of examples in each subset (per-prompt weighting). The original
    documentation describes four sections for the default mapping (not
    verifiable from this file):

    1. Chat: the easy chat subsets (alpacaeval-easy, alpacaeval-length,
       alpacaeval-hard, mt-bench-easy, mt-bench-medium)
    2. Chat Hard: the hard chat subsets (mt-bench-hard, llmbar-natural,
       llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut,
       llmbar-adver-manual)
    3. Safety: the safety subsets (refusals-dangerous, refusals-offensive,
       xstest-should-refuse, xstest-should-respond, do not answer)
    4. Code: the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python,
       hep-rust)

    A "Test Sets" column is added as the NaN-ignoring mean of the preference
    set columns for the same model (NaN when the model has no preference-set
    row), and "average" is the NaN-ignoring mean of all section columns plus
    "Test Sets". All computed scores are rounded to two decimals.

    Args:
        dataframe_core: DataFrame with "model", "model_type" and one column
            per subset. Not modified.
        dataframe_prefs: DataFrame with "model" and the columns
            "anthropic_helpful", "mtbench_gpt4", "shp", "summarize".
            Not modified.
        sections: Optional mapping of section name -> list of subset names.
            Defaults to the module-level ``subset_mapping``.
        counts: Optional mapping of subset name -> example count used as the
            weight. Defaults to the module-level ``example_counts``.

    Returns:
        A new DataFrame with columns "model", "model_type", "average", one
        column per section, and "Test Sets".
    """
    if sections is None:
        sections = subset_mapping
    if counts is None:
        counts = example_counts

    new_df = dataframe_core.copy()
    dataframe_prefs = dataframe_prefs.copy()

    for section, members in sections.items():
        section_cols = [col for col in new_df.columns if col in members]
        if not section_cols:
            # None of the section's subsets were evaluated: no score to report.
            new_df[section] = np.nan
            continue
        # Look the weights up from the columns actually selected so that each
        # weight lines up with its column, whatever the dataframe's column
        # order and even when some listed subsets are missing.
        weights = [counts[col] for col in section_cols]
        section_scores = np.average(new_df[section_cols].values, axis=1, weights=weights)
        new_df[section] = np.round(section_scores, 2)

    data_cols = list(sections)
    # .copy() so the column assignments below never write into a slice view.
    new_df = new_df[["model", "model_type"] + data_cols].copy()

    pref_columns = ["anthropic_helpful", "mtbench_gpt4", "shp", "summarize"]
    pref_data = dataframe_prefs[pref_columns].values
    dataframe_prefs["Test Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

    # One lookup table instead of scanning the prefs frame for every row.
    # drop_duplicates keeps the first row per model, like the former
    # ``.values[0]`` lookup did.
    test_set_scores = dataframe_prefs.drop_duplicates(subset="model").set_index("model")["Test Sets"]
    # .values: assign by position, independent of new_df's index labels.
    new_df["Test Sets"] = new_df["model"].map(test_set_scores).values

    data_cols.append("Test Sets")
    new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

    return new_df[["model", "model_type", "average"] + data_cols]
|
|
|
def expand_subsets(dataframe):
    """Placeholder with no implementation yet: ignores ``dataframe`` and returns ``None``."""
    return None
|
|
|
|
|
def length_bias_check(dataframe):
    """Regroup the raw per-subset scores into three length-bias buckets.

    Every column from the fourth onward is treated as a subset and looked up
    in ``length_categories``: "True" goes to "Length Bias", "Neutral" to
    "Neutral" and "False" to "Terse Bias"; any other category value is
    ignored. Each bucket's score is the NaN-ignoring mean over its subsets,
    rounded to two decimals.

    Returns a new DataFrame with the columns "model", "Length Bias",
    "Neutral" and "Terse Bias"; the input is not modified.
    """
    category_to_bucket = (
        ("True", "Length Bias"),
        ("Neutral", "Neutral"),
        ("False", "Terse Bias"),
    )
    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]

    result = dataframe.copy()
    grouped_scores = {bucket: [] for bucket in final_subsets}

    # The first three columns are skipped; the rest are subset scores.
    for column in result.columns[3:]:
        scores = result[column].values
        category = length_categories[column]
        for flag, bucket in category_to_bucket:
            if category == flag:
                grouped_scores[bucket].append(scores)
                break

    for bucket in final_subsets:
        bucket_mean = np.nanmean(grouped_scores[bucket], axis=0)
        result[bucket] = np.round(bucket_mean, 2)

    return result[["model"] + final_subsets]
|
|
|
|
|
|
|
# Leaderboard tables, each sorted so the best-scoring rows come first.
herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by="average", ascending=False)
herm_data_length = length_bias_check(herm_data).sort_values(by="Terse Bias", ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by="average", ascending=False)

herm_data_avg = avg_over_herm(herm_data, prefs_data).sort_values(by="average", ascending=False)

# Column datatypes for the Gradio tables, one entry per column.
# The two eval-set tables start with two text columns (markdown + str), so the
# number of numeric columns is len(columns) - 2; the previous ``- 1`` produced
# one datatype more than there are columns.
col_types_herm = ["markdown", "str"] + ["number"] * (len(herm_data.columns) - 2)
col_types_herm_avg = ["markdown", "str"] + ["number"] * (len(herm_data_avg.columns) - 2)
# These two tables are declared with a single leading text column.
cols_herm_data_length = ["markdown"] + ["number"] * (len(herm_data_length.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
|
|
|
|
|
|
|
# Evaluation prompts sampled by random_sample(). ``token`` replaces the
# deprecated ``use_auth_token`` argument of ``load_dataset``.
eval_set = load_dataset(eval_set_repo, token=COLLAB_TOKEN, split="filtered")
|
def random_sample(r: gr.Request, subset):
    """Return one random row of ``eval_set`` rendered as markdown.

    Args:
        r: Gradio request object; not used by the function body.
        subset: ``None`` or an empty selection to sample from the whole set,
            or a subset name / list of subset names to restrict the pool to
            rows whose "subset" field is one of them.

    Returns:
        A markdown string with one ``**key**:`` section per field of the
        sampled row, or a short notice when no row matches the selection.
    """
    if not subset:
        pool = eval_set
    else:
        if isinstance(subset, str):
            subset = [subset]
        pool = eval_set.filter(lambda x: x["subset"] in subset)

    if len(pool) == 0:
        return "No samples available for the selected subset(s)."

    # randint's upper bound is exclusive, so len(pool) makes every row
    # (including the last one) reachable and also works for a one-row pool.
    sample_index = int(np.random.randint(0, len(pool)))
    sample = pool[sample_index]

    return "\n\n".join(f"**{key}**:\n\n{value}" for key, value in sample.items())
|
|
|
# Distinct values of the "subset" column; offered as choices in the viewer dropdown.
subsets = eval_set.unique(column="subset")
|
|
|
def regex_table(dataframe, regex, filter_button):
    """Filter leaderboard rows by a comma-separated, case-insensitive model search.

    Args:
        dataframe: DataFrame with a "model" column.
        regex: Search text; comma-separated terms, each treated as a regular
            expression. A row is kept when any term matches its model name.
            Empty terms (for example after a trailing comma) are ignored, and
            an empty search keeps every row with a non-null model.
        filter_button: When falsy, rows whose model name contains "ai2" are
            hidden unless the search text itself mentions "ai2" (any case).

    Returns:
        The filtered DataFrame.
    """
    query = regex or ""
    terms = [term.strip() for term in query.split(",")]
    # Drop empty terms: an empty alternative in "a|" would match every row.
    terms = [term for term in terms if term]

    pattern = "|".join(terms)
    try:
        re.compile(pattern)
    except re.error:
        # The box is re-evaluated on every keystroke, so half-typed patterns
        # such as "llama(" are common; match them literally instead of failing.
        pattern = "|".join(re.escape(term) for term in terms)

    # Compare case-insensitively, consistent with the case-insensitive search.
    if not filter_button and "ai2" not in query.lower():
        dataframe = dataframe[~dataframe["model"].str.contains("ai2", case=False, na=False)]

    return dataframe[dataframe["model"].str.contains(pattern, case=False, na=False)]
|
|
|
|
|
with gr.Blocks() as app:
    # Header row: introduction text next to the search controls.
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=2):
            search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
            filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)

    # Each leaderboard tab holds two tables: a hidden one with the full data,
    # used as the filter input, and a visible one showing the filtered rows.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
                herm_table_hidden = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    visible=False,
                )
                herm_table = gr.Dataframe(
                    regex_table(herm_data_avg.copy(), "", False).values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                    height=1000,
                )

        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
                herm_table_detailed_hidden = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    visible=False,
                )
                herm_table_detailed = gr.Dataframe(
                    regex_table(herm_data.copy(), "", False).values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                    height=1000,
                )

        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
                herm_table_len_hidden = gr.Dataframe(
                    herm_data_length.values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    visible=False,
                )
                herm_table_len = gr.Dataframe(
                    regex_table(herm_data_length.copy(), "", False).values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
                    height=1000,
                )

        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                pref_sets_table_hidden = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    visible=False,
                )
                pref_sets_table = gr.Dataframe(
                    regex_table(prefs_data.copy(), "", False).values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                gr.Markdown("## Random Dataset Sample Viewer")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])

    # Re-filter every visible table from its hidden full copy whenever the
    # search text OR the AI2 checkbox changes. Previously only the textbox had
    # listeners, so toggling the checkbox had no effect until the search text
    # was edited.
    for _hidden_table, _visible_table in (
        (herm_table_hidden, herm_table),
        (herm_table_detailed_hidden, herm_table_detailed),
        (herm_table_len_hidden, herm_table_len),
        (pref_sets_table_hidden, pref_sets_table),
    ):
        for _listen in (search.change, filter_button.change):
            _listen(regex_table, inputs=[_hidden_table, search, filter_button], outputs=_visible_table)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Schedule restart_space() to run every three hours (10800 seconds), then
# start serving the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=3 * 60 * 60)
scheduler.start()

app.launch()
|
|