import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset


# Models flagged with "*" on the leaderboard (unverified results).
UNVERIFIED_MODELS = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "google/flame-24b-july-2024",
    "Cohere March 2024",
    "facebook/Self-taught-Llama-3-70B",
    "facebook/Self-taught-evaluator-llama3.1-70B",
    "google/flame-1.0-24B-july-2024",
    "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
    "Salesforce/SFR-nemo-12B-Judge-r",
    "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
    "SF-Foundation/TextEval-OffsetBias-12B",
    "SF-Foundation/TextEval-Llama3.1-70B",
    "nvidia/Llama-3.1-Nemotron-70B-Reward",
]

# Models flagged with "⚠️" on the leaderboard as contaminated.
CONTAMINATED_MODELS = [
    "Skywork/Skywork-Reward-Gemma-2-27B",
    "Skywork/Skywork-Critic-Llama-3.1-70B",
    "LxzGordon/URM-LLaMa-3.1-8B",
    "Skywork/Skywork-Reward-Llama-3.1-8B",
    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
    "nicolinho/QRM-Llama3.1-8B",
    "nicolinho/QRM-Llama3-8B",
    "general-preference/GPM-Llama-3.1-8B",
    "general-preference/GPM-Gemma-2B",
]


def model_hyperlink(link, model_name):
    """Wrap a model name in an HTML link and append the leaderboard's footnote markers."""
    # Truncate very long model names so they fit in the leaderboard column.
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    if model_name == "random":
        output = "random"
    elif model_name == "Cohere March 2024":
        output = f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    elif "openai" == model_name.split("/")[0]:
        output = f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    elif "Anthropic" == model_name.split("/")[0]:
        output = f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    elif "google" == model_name.split("/")[0]:
        output = f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
    elif "PoLL" == model_name.split("/")[0]:
        # PoLL entries are shown as plain text (no link).
        output = model_name
    else:
        # Default case: link to the page passed in by the caller.
        output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

    if model_name in UNVERIFIED_MODELS:
        output += " *"
    if model_name in CONTAMINATED_MODELS:
        output += " ⚠️"
    return output


def undo_hyperlink(html_string):
    """Extract the display text from an HTML anchor produced by model_hyperlink."""
    # Match the text between the ">" closing the opening tag and the "<" of the closing tag.
    pattern = r">[^<]+<"
    match = re.search(pattern, html_string)
    if match:
        # Strip the surrounding ">" and "<" characters.
        return match.group(0)[1:-1]
    else:
        return "No text found"
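

# Illustrative round trip between the two helpers above (comments only, not executed;
# the model name is hypothetical and "..." stands for the style attribute):
#   model_hyperlink("https://huggingface.co/org/model", "org/model")
#       -> '<a target="_blank" href="https://huggingface.co/org/model" ...>org/model</a>'
#   undo_hyperlink('<a target="_blank" href="https://huggingface.co/org/model" ...>org/model</a>')
#       -> 'org/model'

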
def load_all_data(data_repo, subdir: str, subsubsets=False):
    """Load every per-model JSON result under data_repo/subdir into one DataFrame."""
    # `subsubsets` is kept for API compatibility; it is not used here.
    base_dir = Path(data_repo)
    data_dir = base_dir / subdir
    # Top-level folders are organizations; each contains one JSON file per model.
    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]

    models_results = []
    for org in orgs:
        org_dir = data_dir / org
        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
        for file in files:
            if file.endswith(".json"):
                models_results.append(org + "/" + file)

    # Concatenate every model's results into a single DataFrame.
    df = pd.DataFrame()
    for model in models_results:
        model_data = load_dataset("json", data_files=data_repo + subdir + "/" + model, split="train")
        df2 = pd.DataFrame(model_data)
        df = pd.concat([df2, df])

    # The chat template is metadata, not a score.
    df = df.drop(columns=["chat_template"])

    # Sort columns alphabetically, then move "model" to the front.
    df = df.reindex(sorted(df.columns), axis=1)
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("model")))
    df = df.loc[:, cols]

    # Build the list of score columns by removing metadata columns.
    cols = df.columns.tolist()
    cols.remove("model")
    if "model_type" in cols:
        cols.remove("model_type")
    if "ref_model" in cols:
        cols.remove("ref_model")
    if "model_beaker" in cols:
        cols.remove("model_beaker")
        df = df.drop(columns=["model_beaker"])

    # Drop subset columns that are not shown on the leaderboard.
    if "xstest" in cols:
        df = df.drop(columns=["xstest"])
        cols.remove("xstest")
    if "ref_model" in df.columns:
        df = df.drop(columns=["ref_model"])
    if "anthropic" in cols:
        df = df.drop(columns=["anthropic"])
        cols.remove("anthropic")
    if "summarize_prompted" in cols:
        df = df.drop(columns=["summarize_prompted"])
        cols.remove("summarize_prompted")
    if "pku_better" in cols:
        df = df.drop(columns=["pku_better"])
        cols.remove("pku_better")
    if "pku_safer" in cols:
        df = df.drop(columns=["pku_safer"])
        cols.remove("pku_safer")

    # Convert scores to percentages and compute the per-model average, ignoring missing subsets.
    df[cols] = df[cols] * 100
    avg = np.nanmean(df[cols].values, axis=1)
    df["average"] = avg

    # Replace raw model names with leaderboard hyperlinks.
    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))

    # Move the "average" column right after "model".
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index("average")))
    df = df.loc[:, cols]

    # If present, move "model_type" right after "model" as well.
    if "model_type" in cols:
        cols = list(df.columns)
        cols.insert(1, cols.pop(cols.index("model_type")))
        df = df.loc[:, cols]

    # Reference-free DPO entries are excluded from the table.
    df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]

    return df
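

if __name__ == "__main__":
    # Minimal usage sketch, not part of the leaderboard app itself.
    # Assumptions (illustrative, not guaranteed by this module): the results live in a
    # Hugging Face dataset repo such as "allenai/reward-bench-results", with per-model
    # JSON files under an "eval-set" subdirectory.
    from huggingface_hub import snapshot_download

    local_dir = snapshot_download(
        repo_id="allenai/reward-bench-results",  # assumed repo id
        repo_type="dataset",
        local_dir="./results",
    )
    # load_all_data joins paths by string concatenation, so data_repo must end with "/".
    leaderboard_df = load_all_data(local_dir + "/", subdir="eval-set")
    print(leaderboard_df.head())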