|
import json |
|
from dataclasses import dataclass, field, fields |
|
from functools import cached_property |
|
from pathlib import Path |
|
from typing import Literal |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.graph_objects as go |
|
from pandas import DataFrame |
|
from pandas.io.formats.style import Styler |
|
|
|
from content import * |
|
|
|
# Metric that is reported for each benchmark task. The keys double as the
# names of the score fields on `Result` (this is validated in
# `Result.__post_init__`), and the values are the keys looked up in every
# task's result dictionary by `collect_results`.
TASK_METRICS = {
    "arc": "acc_norm",
    "hellaswag": "acc_norm",
    "mmlu": "acc_norm",
    "truthfulqa": "mc2",
}

# Emoji displayed in the leaderboard table for each known model type.
MODEL_TYPE_EMOJIS = {
    "pretrained": "🟢",
    "fine-tuned": "🔶",
    "instruction-tuned": "⭕",
    "RL-tuned": "🟦",
}

# Placeholder displayed when a model type has no emoji or when the Dutch
# coverage is "not-given".
NOT_GIVEN_SYMBOL = "❔"
|
|
|
|
|
@dataclass
class Result:
    """Benchmark scores and metadata of a single model.

    The task scores (`arc`, `hellaswag`, `mmlu`, `truthfulqa`) default to NaN,
    which marks a benchmark as missing. Two fields are derived in
    `__post_init__` and cannot be passed to the constructor:

    - `average`: mean of all task scores listed in `TASK_METRICS`, or NaN as
      soon as one of them is missing.
    - `num_parameters_kmb`: `num_parameters` rendered by
      `convert_number_to_kmb`.

    :raises ValueError: if `model_type` or `dutch_coverage` has an unsupported
    value, or if a task in `TASK_METRICS` has no matching field on this class
    """

    model_name: str
    short_name: str
    # "not-given" is accepted by the validation below, so it is part of the type.
    model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]
    dutch_coverage: Literal["none", "pretrained", "fine-tuned", "not-given"]
    num_parameters: int
    arc: float = field(default=np.nan)
    average: float = field(default=np.nan, init=False)
    hellaswag: float = field(default=np.nan)
    mmlu: float = field(default=np.nan)
    truthfulqa: float = field(default=np.nan)
    num_parameters_kmb: str = field(init=False)

    def __post_init__(self):
        if self.model_type not in ("pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"):
            raise ValueError(
                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned',"
                " 'instruction-tuned', 'RL-tuned', 'not-given'"
            )
        if self.dutch_coverage not in ("none", "pretrained", "fine-tuned", "not-given"):
            raise ValueError(
                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given'"
            )

        # Every configured task needs a score field, otherwise the getattr
        # calls below (and the DataFrame construction elsewhere) would fail.
        field_names = {f.name for f in fields(self)}
        for task_name in TASK_METRICS:
            if task_name not in field_names:
                raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame")

        # The average is only meaningful when all benchmarks have been run.
        # Divide by the actual number of tasks rather than a hard-coded count
        # so that the mean stays correct when TASK_METRICS changes.
        scores = [getattr(self, task_name) for task_name in TASK_METRICS]
        if not scores or any(np.isnan(score) for score in scores):
            self.average = np.nan
        else:
            self.average = sum(scores) / len(scores)

        self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters)
|
|
|
|
|
@dataclass
class ResultSet:
    """A collection of `Result`s together with the table layout used to show them.

    `column_names` maps `Result` attribute names to the column labels of the
    leaderboard; `column_types` maps those labels to the datatype that is
    passed on to the Gradio dataframe component. When `column_names` is empty,
    both are replaced by the default leaderboard layout.

    :raises ValueError: if `column_types` mentions a label that is not a value
    of `column_names`, if `column_names` has no 'average' entry, or if a key of
    `column_names` is not a field of `Result`
    """

    results: list[Result]
    column_names: dict[str, str] = field(default_factory=dict)
    column_types: dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        if not self.column_names:
            # Default layout: dictionary order determines the column order.
            self.column_names = {
                "short_name": "Model",
                "model_type": "T",
                "dutch_coverage": "🇳🇱",
                "num_parameters": "Size",
                "average": "Avg.",
                "arc": "ARC (25-shot)",
                "hellaswag": "HellaSwag (10-shot)",
                "mmlu": "MMLU (5-shot)",
                "truthfulqa": "TruthfulQA (0-shot)",
            }
            self.column_types = {
                "Model": "markdown",
                "T": "str",
                "🇳🇱": "str",
                "Size": "str",
                "Avg.": "number",
                "ARC (25-shot)": "number",
                "HellaSwag (10-shot)": "number",
                "MMLU (5-shot)": "number",
                "TruthfulQA (0-shot)": "number",
            }

        known_labels = set(self.column_names.values())
        for column_label in self.column_types:
            if column_label not in known_labels:
                raise ValueError(
                    "Column names specified in column_types must be values in column_names."
                    f" {column_label} not found."
                )

        # The tables are sorted on the average column, so it must be present.
        if "average" not in self.column_names:
            raise ValueError("Column names must contain 'average' column name")

        field_names = {f.name for f in fields(Result)}
        for column_name in self.column_names:
            if column_name not in field_names:
                raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame")

    def _column_labels(self) -> list[str]:
        """Return the column labels in display order, without duplicates."""
        return list(dict.fromkeys(self.column_names.values()))

    def _number_columns(self) -> list[str]:
        """Return the labels of the score columns: all tasks plus the average."""
        return [label for attr, label in self.column_names.items() if attr in TASK_METRICS or attr == "average"]

    @staticmethod
    def _display_value(result: Result, attr: str):
        """Return the value shown in the HTML table for attribute `attr` of `result`."""
        if attr == "short_name":
            # Link the short name to the model page on the Hugging Face hub.
            return (
                f"<a target='_blank' href='https://huggingface.co/{result.model_name}'"
                f" style='color: var(--link-text-color); text-decoration: underline;text-decoration-style:"
                f" dotted;'>{result.short_name}</a>"
            )
        if attr == "model_type":
            return MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
        if attr == "dutch_coverage":
            return result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL
        return getattr(result, attr)

    @staticmethod
    def _highlight_max(col):
        """Return per-cell CSS that renders the highest value(s) of `col` in bold."""
        if col.isna().all():
            # Nothing to highlight. This also covers an empty column, for
            # which np.nanmax would raise, and an all-NaN column, for which it
            # would emit a RuntimeWarning.
            return np.full(len(col), None, dtype=object)
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    def _style(self, frame: DataFrame, precision: int) -> Styler:
        """Apply the shared leaderboard styling to `frame`.

        Scores are shown with `precision` decimals (missing ones as
        `<missing>`), the best score per column is bold, parameter counts are
        abbreviated, a caption is set and the index is hidden.
        """
        number_cols = self._number_columns()
        styler = frame.style.format("{:." + str(precision) + "f}", subset=number_cols, na_rep="<missing>")
        styler = styler.apply(self._highlight_max, axis=0, subset=number_cols)
        # A custom layout is allowed to leave out the parameter count.
        if "num_parameters" in self.column_names:
            styler = styler.format(convert_number_to_kmb, subset=self.column_names["num_parameters"])
        styler.set_caption("Leaderboard on Dutch benchmarks.")
        return styler.hide()

    @cached_property
    def df(self) -> DataFrame:
        """Raw results as a DataFrame, sorted from highest to lowest average."""
        rows = [
            {label: getattr(result, attr) for attr, label in self.column_names.items()} for result in self.results
        ]
        # Passing the columns explicitly keeps the frame sortable when there
        # are no results at all.
        frame = pd.DataFrame(rows, columns=self._column_labels())
        return frame.sort_values(by=self.column_names["average"], ascending=False)

    @cached_property
    def styled_df(self) -> Styler:
        """Styled table for the web page: linked model names, emojis, four-decimal scores."""
        rows = [
            {label: self._display_value(result, attr) for attr, label in self.column_names.items()}
            for result in self.results
        ]
        frame = pd.DataFrame(rows, columns=self._column_labels())
        frame = frame.sort_values(by=self.column_names["average"], ascending=False)
        return self._style(frame, precision=4)

    @cached_property
    def latex_df(self) -> Styler:
        """Styled table of the raw values with two-decimal scores, meant for LaTeX export."""
        return self._style(self.df, precision=2)

    @cached_property
    def viz_checkboxes(self):
        """Checkbox group with one option per model; the three best models are preselected."""
        # `self.df` is already sorted by descending average, so the first
        # three names are the top three models.
        model_names = self.df[self.column_names["short_name"]].tolist()
        return gr.CheckboxGroup(model_names, label="Models", value=model_names[:3])

    def plot(self, model_names: list[str]):
        """Build a radar chart of the task scores of the given models.

        :param model_names: values of the model column (short names) to plot
        :return: a plotly figure with one trace per model that has scores, or
        None when `model_names` is empty
        """
        if not model_names:
            return None

        # Look the model column up instead of assuming it is labelled "Model",
        # so that a custom `column_names` works as well.
        model_col = self.column_names["short_name"]
        task_labels = {label: attr for attr, label in self.column_names.items() if attr in TASK_METRICS}

        # Only keep the model column and the task columns, and use the short
        # task names (e.g. "arc") rather than the display labels on the axes.
        df = self.df[[model_col, *task_labels]].rename(columns=task_labels)
        df = df[df[model_col].isin(model_names)]

        # Long format: one row per (model, task) pair, ordered by task name.
        df = df.melt(id_vars=[model_col], var_name="Task", value_name="Score").sort_values(by="Task")

        fig = go.Figure()
        for model_name in model_names:
            model_df = df[df[model_col] == model_name]
            scores = model_df["Score"].tolist()
            tasks = model_df["Task"].tolist()
            if not scores:
                # Unknown model name or no task columns configured.
                continue

            # Repeat the first point at the end to close the polygon.
            scores.append(scores[0])
            tasks.append(tasks[0])

            fig.add_trace(go.Scatterpolar(r=scores, theta=tasks, name=model_name))

        fig.update_layout(
            title="Model performance on Dutch benchmarks",
        )

        return fig
|
|
|
|
|
def convert_number_to_kmb(number: int) -> str:
    """
    Converts a number to a string with K, M or B suffix
    :param number: the number to convert
    :return: a string with the number rounded to one decimal and a suffix, e.g. "7.0B" or "1.5M". Numbers below
    1000 are returned unchanged as a string without suffix
    """
    if number >= 1_000:
        for divisor, suffix in ((1_000, "K"), (1_000_000, "M"), (1_000_000_000, "B")):
            scaled = round(number / divisor, 1)
            # Rounding can push a value up to 1000.0 of the current unit
            # (999_999 -> 1000.0K); in that case continue with the next unit
            # so that the result reads "1.0M". "B" is the largest unit and
            # therefore always accepted.
            if scaled < 1_000:
                break
        return f"{scaled}{suffix}"
    return str(number)
|
|
|
|
|
def collect_results() -> ResultSet:
    """
    Collects the benchmark results from the `evals` folder next to this file.

    Model metadata is read from `evals/models.json`. Every other JSON file below `evals` that has a top-level
    "results" key is treated as a results file; the model's short name is the lower-cased part of the file stem
    after its second underscore. For every task in a results file, the metric configured in `TASK_METRICS` is
    stored under the task name with its last underscore-separated suffix removed.

    :return: a ResultSet with one Result per model for which at least one results file was found
    :raises ValueError: if the overview file does not exist or a results file name has fewer than two underscores
    :raises KeyError: if a results file belongs to a model that is not in the overview file, or contains a task
    that is not in `TASK_METRICS`
    """
    evals_dir = Path(__file__).parent.joinpath("evals")
    pf_overview = evals_dir.joinpath("models.json")
    if not pf_overview.exists():
        raise ValueError(
            f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`."
        )

    model_info = json.loads(pf_overview.read_text(encoding="utf-8"))
    results_by_model = {}
    # Sorted so that the outcome does not depend on file system order when
    # several files report the same task for the same model.
    for pfin in sorted(evals_dir.rglob("*.json")):
        if pfin == pf_overview:
            # The overview itself also matches the glob but holds no results.
            continue

        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data:
            continue

        stem_parts = pfin.stem.split("_", 2)
        if len(stem_parts) < 3:
            raise ValueError(
                f"Cannot derive a model name from results file {pfin.name}: the file name must contain at least"
                " two underscores, with the model name after the second one."
            )
        short_name = stem_parts[2].lower()

        if short_name not in model_info:
            raise KeyError(
                f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
                f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
            )

        if short_name not in results_by_model:
            info = model_info[short_name]
            results_by_model[short_name] = {
                "short_name": short_name,
                "model_name": info["model_name"],
                "model_type": info["model_type"],
                "dutch_coverage": info["dutch_coverage"],
                "num_parameters": info["num_parameters"],
            }

        for raw_task_name, task_result in data["results"].items():
            # Drop the last underscore-separated suffix so that the name
            # matches a key of TASK_METRICS.
            task_name = raw_task_name.rsplit("_", 1)[0]
            metric = TASK_METRICS[task_name]
            results_by_model[short_name][task_name] = task_result[metric]

    return ResultSet([Result(**res) for res in results_by_model.values()])
|
|
|
|
|
# Build the Gradio page: introduction, leaderboard table, legends, LaTeX
# export and an interactive radar chart. This runs at import time, so the
# evaluation results are read from disk as soon as the module is loaded.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown(
        "## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
        " All models have been benchmarked in 8-bit. `<missing>` values indicate that those benchmarks are still"
        " pending."
    )

    results = collect_results()

    gr.components.Dataframe(
        results.styled_df,
        headers=list(results.df.columns),
        # Derived from the same columns as the headers so the order matches.
        datatype=[results.column_types[col] for col in results.df.columns],
        interactive=False,
        elem_id="leaderboard-table",
    )

    # Legends explaining the table columns.
    with gr.Row():
        with gr.Column():
            modeltypes_str = "<br>".join([f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items()])
            gr.Markdown(f"Model types:<br>{modeltypes_str}")

        with gr.Column():
            gr.Markdown(
                f"Language coverage ({results.column_names['dutch_coverage']}):"
                "<br>- `none`: no explicit/deliberate Dutch coverage,"
                "<br>- `pretrained`: pretrained on Dutch data,"
                "<br>- `fine-tuned`: fine-tuned on Dutch data"
            )

        with gr.Column():
            metrics_str = "<br>".join([f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items()])
            gr.Markdown(f"Reported metrics:<br>{metrics_str}")

    gr.Markdown("## LaTeX")
    gr.Code(results.latex_df.to_latex(convert_css=True))

    gr.Markdown("## Visualization")
    with gr.Row():
        with gr.Column():
            buttons = results.viz_checkboxes

        with gr.Column(scale=2):
            plot = gr.Plot(container=True)
            # Redraw the chart whenever the selection changes, and draw it
            # once for the preselected models when the page is loaded.
            buttons.change(results.plot, inputs=buttons, outputs=[plot])
            demo.load(results.plot, inputs=buttons, outputs=[plot])

    gr.Markdown(DISCLAIMER, elem_classes="markdown-text")
    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")


if __name__ == "__main__":
    demo.launch()
|
|