# Header residue from the hosted file view, kept for reference:
# Bram Vanroy -- "revision for Dutch only" (commit 5693ee5), 4.27 kB
import json
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import *
# Identifiers of the supported benchmarks, as they appear before the "_" in result keys.
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"

# Name of the score entry that is read for each benchmark.
_METRIC_BY_BENCHMARK = {
    ARC: "acc_norm",
    HELLASWAG: "acc_norm",
    MMLU: "acc_norm",
    TRUTHFULQA: "mc2",
}

# Parallel lists: METRICS[i] is the metric that belongs to BENCHMARKS[i].
BENCHMARKS = list(_METRIC_BY_BENCHMARK)
METRICS = list(_METRIC_BY_BENCHMARK.values())
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results

    Every ``*.json`` file below ``evals`` (searched recursively) is read. A file is skipped when it has no
    "results" or "config" entry, when its config has no "model_args", or when "model_args" does not contain
    exactly one ``pretrained=...`` item.

    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}
    :raises ValueError: if a key in "results" is not of the form ``<benchmark>_<language>`` with a benchmark
    that is listed in BENCHMARKS
    """
    # BENCHMARKS and METRICS are parallel lists; pair them once instead of calling .index() for every key
    metric_by_task = dict(zip(BENCHMARKS, METRICS))

    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue

        config = data["config"]
        if "model_args" not in config:
            continue

        # "model_args" is a comma-separated string of key=value items
        pretrained_args = [arg for arg in config["model_args"].split(",") if arg.startswith("pretrained=")]
        if len(pretrained_args) != 1:
            continue

        # Split on the first "=" only, so a value that itself contains "=" is not truncated
        pretrained = pretrained_args[0].split("=", 1)[1]
        # Keep only the last path component (e.g. "org/model" -> "model")
        pretrained = pretrained.rsplit("/", 1)[-1]

        for lang_task, perfs in data["results"].items():
            # The benchmark name is everything before the first underscore, the language everything after it
            task, sep, lang = lang_task.partition("_")
            # Explicit check instead of `assert`, which is removed when Python runs with -O
            if not sep or task not in metric_by_task:
                raise ValueError(
                    f"Unexpected result key {lang_task!r} in {pfin}: expected '<benchmark>_<language>'"
                    f" with a benchmark in {BENCHMARKS}"
                )
            if not lang:
                # Nothing to file the score under when the language part is empty
                continue

            # Rescale the score by 100 and keep one decimal
            performance_dict[(pretrained, lang)][task] = round(perfs[metric_by_task[task]] * 100, 1)

    return dict(performance_dict)
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Turns the collected scores into a leaderboard table

    One row is created per (model_name, language) key; the language itself is not stored in the row. A benchmark
    that is missing for a key counts as 0.0, both in its own column and in the average.

    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
    values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame with the columns in COLS, sorted by AVERAGE_COL from highest to lowest
    """
    benchmark_order = (ARC, HELLASWAG, MMLU, TRUTHFULQA)

    records = []
    for (model_name, _lang), scores in performance_dict.items():
        arc, hellaswag, mmlu, truthfulqa = (scores.get(benchmark, 0.0) for benchmark in benchmark_order)
        # Plain left-to-right addition over the four benchmarks, rounded to one decimal
        mean_score = round((arc + hellaswag + mmlu + truthfulqa) / 4, 1)
        records.append([model_name, mean_score, arc, hellaswag, mmlu, truthfulqa])

    table = pd.DataFrame.from_records(records, columns=COLS)
    return table.sort_values(by=[AVERAGE_COL], ascending=False)
def style_df(df: DataFrame) -> Styler:
"""
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
:param df: the dataframe to style
:return: the Styler
"""
styler = df.style.format("{:.2f}", subset=df.columns[1:])
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
return styler
# Column headers of the leaderboard table.
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
# Plain ASCII header: the invisible variation selector that trailed this label has been removed so that it does
# not end up in the table header or in the generated LaTeX.
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"

# Column order of the table, and the matching per-column datatypes passed to the Gradio Dataframe.
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "number", "number", "number", "number", "number"]
# Collect the scores and build both table variants once, when the module is loaded.
results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)

# Page layout: components appear in the order in which they are created inside the Blocks context.
# TITLE, INTRO_TEXT, CREDIT and CITATION are not defined in this file; presumably they come from `content`.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    # Leaderboard table with the plain (unstyled) dataframe
    gr.Markdown(
        "## Leaderboard\n"
        "Only representative for the Dutch version (`*_nl`) of the benchmarks!"
    )
    gr.components.Dataframe(
        headers=COLS,
        datatype=TYPES,
        value=original_df,
        elem_id="leaderboard-table",
    )

    # LaTeX export of the styled table; convert_css=True turns the CSS styles into LaTeX commands
    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()