# Page-header residue from the Hugging Face Spaces viewer ("Spaces: Running"); not part of the program.
import os
import re

import gradio as gr
import numpy as np
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from huggingface_hub import HfApi, snapshot_download

from src.constants import subset_mapping, length_categories, example_counts
from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
from src.utils import load_all_data
# Hub dataset repo the results snapshot is pulled from at startup.
evals_repo = "ai2-adapt-dev/HERM-Results"
# Hub dataset repo loaded for the random-sample viewer.
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
# Local directory the results snapshot is downloaded into.
repo_dir_herm = "./evals/herm/"
# Access token read from the environment; None when the variable is unset.
COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")
# Shared Hub client, used by restart_space().
api = HfApi()
def restart_space():
    """Ask the Hub to restart the ``ai2-adapt-dev/rm-benchmark-viewer`` Space.

    Authenticates with ``COLLAB_TOKEN`` and returns nothing.
    """
    space_id = "ai2-adapt-dev/rm-benchmark-viewer"
    api.restart_space(token=COLLAB_TOKEN, repo_id=space_id)
# Download the results dataset into repo_dir_herm before any table is built.
print("Pulling evaluation results")
repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir_herm,
    use_auth_token=COLLAB_TOKEN,
    etag_timeout=30,
    tqdm_class=None,
)
def avg_over_herm(dataframe):
    """
    Average the fine-grained subset scores into the core benchmark sections.

    Each key of ``subset_mapping`` names a section and maps to the subset
    columns belonging to it. A section score is the mean of those columns
    weighted by ``example_counts`` (per prompt weighting). The 4 core sections
    are expected to be:
    1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
    4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)

    Args:
        dataframe: table with a ``model`` column plus one numeric column per
            subset. It is not modified.

    Returns:
        A new dataframe holding only ``model`` and one column per section,
        rounded to 2 decimals. A section with none of its subsets present in
        ``dataframe`` is filled with NaN.
    """
    new_df = dataframe.copy()
    for subset, sub_subsets in subset_mapping.items():
        # Only the subsets actually present in the dataframe can contribute.
        subset_cols = [col for col in new_df.columns if col in sub_subsets]
        if not subset_cols:
            # np.average cannot reduce zero columns; mark the section as missing.
            new_df[subset] = np.nan
            continue
        sub_data = new_df[subset_cols].values
        # Look the weights up by the selected columns (not by sub_subsets) so
        # every weight lines up with its data column, whatever the column
        # order is and even when some mapped subsets are absent.
        sub_counts = [example_counts[s] for s in subset_cols]
        new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2)

    keep_columns = ["model"] + list(subset_mapping.keys())
    return new_df[keep_columns]
def expand_subsets(dataframe):
    """Placeholder: ignores ``dataframe`` and returns None."""
    # TODO need to modify data/ script to do this
    return None
def length_bias_check(dataframe):
    """
    Regroup the raw herm scores into buckets according to length_categories.

    Every column after the first two is looked up in ``length_categories``.
    Its scores go to "Length Bias" (category "True"), "Neutral" (category
    "Neutral") or "Terse Bias" (category "False"); a column with any other
    category is left out. Each bucket is then reduced to a per-row NaN-aware
    mean, rounded to 2 decimals.

    Returns a copy of the input holding only ``model`` and the three bucket
    columns.
    """
    bucket_by_category = {
        "True": "Length Bias",
        "Neutral": "Neutral",
        "False": "Terse Bias",
    }
    bucket_names = ["Length Bias", "Neutral", "Terse Bias"]
    grouped_scores = {name: [] for name in bucket_names}

    result = dataframe.copy()
    for column in result.columns[2:]:
        scores = result[column].values
        # Unknown categories map to None and are skipped.
        target = bucket_by_category.get(length_categories[column])
        if target is not None:
            grouped_scores[target].append(scores)

    # Collapse each bucket's list of score columns into one averaged column.
    for name in bucket_names:
        result[name] = np.round(np.nanmean(grouped_scores[name], axis=0), 2)

    return result[["model"] + bucket_names]
def _column_types(frame):
    """Return gradio datatypes: "markdown" for the first column, "number" for the rest."""
    return ["markdown"] + ["number"] * (len(frame.columns) - 1)


# Leaderboard tables, each sorted in descending order on its own key column.
herm_data = load_all_data(repo_dir_herm, subdir="eval-set")
herm_data = herm_data.sort_values(by="average", ascending=False)
herm_data_avg = avg_over_herm(herm_data).sort_values(by="Chat", ascending=False)
herm_data_length = length_bias_check(herm_data).sort_values(by="Terse Bias", ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets")
prefs_data = prefs_data.sort_values(by="average", ascending=False)
# A prefs_data_sub table (via expand_subsets) is not built: expand_subsets is still a stub.

# Per-column display types handed to gr.Dataframe.
col_types_herm = _column_types(herm_data)
col_types_herm_avg = _column_types(herm_data_avg)
cols_herm_data_length = _column_types(herm_data_length)
col_types_prefs = _column_types(prefs_data)

# for showing random samples
eval_set = load_dataset(eval_set_repo, split="filtered", use_auth_token=COLLAB_TOKEN)
def random_sample(r: gr.Request, subset):
    """
    Draw one random example from ``eval_set`` and render it as markdown.

    Args:
        r: request object; unused here. The click handler passes only
            ``subset``, so this is presumably injected by gradio from the
            type hint (confirm against the gradio version in use).
        subset: None or an empty selection to sample from the whole set,
            otherwise a subset name or a list of subset names to sample from.

    Returns:
        A markdown string with one ``**key**:`` section per field of the
        sampled example, or a short notice when no example matches.
    """
    if not subset:
        pool = eval_set
    else:  # filter by subsets (can be list)
        if isinstance(subset, str):
            subset = [subset]
        # filter down dataset to only include the subset(s)
        pool = eval_set.filter(lambda x: x["subset"] in subset)

    pool_size = len(pool)
    if pool_size == 0:
        return "No samples found for the selected subset(s)."

    # randint's upper bound is exclusive: passing pool_size (not pool_size - 1)
    # makes the last example reachable and keeps a one-element pool valid.
    sample_index = int(np.random.randint(pool_size))
    sample = pool[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
    return markdown_text
# Distinct values of the "subset" column; these populate the viewer's dropdown.
subsets = eval_set.unique(column="subset")
def regex_table(dataframe, regex):
    """
    Keep only the rows whose ``model`` entry matches the search text.

    Args:
        dataframe: table with a ``model`` column of strings.
        regex: comma-separated list of regular expressions; a row is kept when
            any of them matches (case-insensitive). Whitespace around each
            expression is ignored.

    Returns:
        The filtered dataframe. When the search text holds no usable pattern
        (empty, or only commas/whitespace) the dataframe is returned unchanged.
    """
    patterns = []
    for piece in (regex or "").split(","):
        candidate = piece.strip()
        if not candidate:
            # A blank piece (e.g. after a trailing comma) would become an empty
            # alternative that matches every row, so drop it.
            continue
        try:
            re.compile(candidate)
        except re.error:
            # Half-typed expressions such as "(" are searched for literally
            # instead of raising while the user is still typing.
            candidate = re.escape(candidate)
        patterns.append(candidate)

    if not patterns:
        return dataframe

    # Join the patterns into a single regex with '|' acting as OR
    combined_regex = '|'.join(patterns)
    # Filter the dataframe such that 'model' contains any of the regex patterns
    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
def _table_with_backup(frame, column_types, elem_id):
    """Create a visible table for ``frame`` plus a hidden copy of the same data.

    The hidden copy is the input the search box filters from, so the visible
    table is always rebuilt from the complete data.
    """
    visible_table = gr.Dataframe(
        frame.values,
        datatype=column_types,
        headers=frame.columns.tolist(),
        elem_id=elem_id,
        height=1000,
    )
    hidden_table = gr.Dataframe(
        frame.values,
        datatype=column_types,
        headers=frame.columns.tolist(),
        visible=False,
    )
    return visible_table, hidden_table


# NOTE(review): the nesting below was rebuilt from a copy of this file that had
# lost its indentation -- confirm the row/tab layout against the deployed Space.
with gr.Blocks() as app:
    # Header row: intro text next to the model search box.
    with gr.Row():
        gr.Markdown(TOP_TEXT)
        search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
                herm_table, herm_table_hidden = _table_with_backup(
                    herm_data_avg, col_types_herm_avg, "herm_dataframe_avg"
                )

        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
                herm_table_detailed, herm_table_detailed_hidden = _table_with_backup(
                    herm_data, col_types_herm, "herm_dataframe"
                )

        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
                herm_table_len, herm_table_len_hidden = _table_with_backup(
                    herm_data_length, cols_herm_data_length, "herm_dataframe_length"
                )

        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                pref_sets_table, pref_sets_table_hidden = _table_with_backup(
                    prefs_data, col_types_prefs, "prefs_dataframe"
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")
            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")
            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])

        # The "Model Correlation" plot tab (plot_avg_correlation) was removed
        # because the plot was not pretty enough.

    # Whenever the search text changes, refill each visible table by filtering
    # its hidden copy with regex_table.
    for backup_table, shown_table in (
        (herm_table_hidden, herm_table),
        (herm_table_detailed_hidden, herm_table_detailed),
        (herm_table_len_hidden, herm_table_len),
        (pref_sets_table_hidden, pref_sets_table),
    ):
        search.change(regex_table, inputs=[backup_table, search], outputs=shown_table)
# Load data when app starts, TODO make this used somewhere... | |
# def load_data_on_start(): | |
# data_herm = load_all_data(repo_dir_herm) | |
# herm_table.update(data_herm) | |
# data_herm_avg = avg_over_herm(repo_dir_herm) | |
# herm_table.update(data_herm_avg) | |
# data_prefs = load_all_data(repo_dir_prefs) | |
# pref_sets_table.update(data_prefs) | |
# Restart the Space every 3 hours from a background scheduler; presumably so
# the startup download picks up newly published results (confirm intent).
RESTART_INTERVAL_SECONDS = 3 * 60 * 60  # 10800
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=RESTART_INTERVAL_SECONDS)
scheduler.start()

app.launch()  # had .queue() before launch before... not sure if that's necessary