import json
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

from content import *

ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

# Metric read for each benchmark, parallel to BENCHMARKS: ARC, HellaSwag and MMLU
# report normalized accuracy ("acc_norm"); TruthfulQA reports the MC2 score
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"


@dataclass
class Result:
    train_type: str
    num_parameters: int
    arc: float = field(default=0.)
    hellaswag: float = field(default=0.)
    mmlu: float = field(default=0.)
    truthfulqa: float = field(default=0.)

    @cached_property
    def num_parameters_kmb(self) -> str:
        return convert_number_to_kmb(self.num_parameters)

    @cached_property
    def average(self) -> float:
        # parenthesize the sum: without it, only truthfulqa would be divided by 4
        return (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4
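
# Illustrative use (values hypothetical): Result(train_type="PT", num_parameters=7_000_000_000,
# arc=38.2, hellaswag=55.1, mmlu=41.0, truthfulqa=48.1) has .num_parameters_kmb == "7.0B"
# and .average of 45.6 (modulo float rounding)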


def convert_number_to_kmb(number: int) -> str:
    """
    Converts a number to a string with K, M or B suffix
    :param number: the number to convert
    :return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
    """
    if number >= 1_000_000_000:
        return f"{round(number / 1_000_000_000, 1)}B"
    elif number >= 1_000_000:
        return f"{round(number / 1_000_000, 1)}M"
    elif number >= 1_000:
        return f"{round(number / 1_000, 1)}K"
    else:
        return str(number)
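
# Illustrative examples: convert_number_to_kmb(7_000_000_000) -> "7.0B",
# convert_number_to_kmb(354_823) -> "354.8K", convert_number_to_kmb(42) -> "42"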


def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue

        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]

        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS

            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)
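
# A sketch of the eval-file shape this parser expects (the file name and scores
# below are hypothetical; only "results", "config" and "config.model_args" are required):
#
#   evals/arc/some-model.json
#   {
#     "results": {"arc_nl": {"acc_norm": 0.382}},
#     "config": {"model_args": "pretrained=some-org/some-model,dtype=float16"}
#   }
#
# which would yield performance_dict[("some-model", "nl")]["arc"] == 38.2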


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
    dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))

    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")

        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df
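
# Illustrative output (values hypothetical): one row per (model, language) pair,
# ordered as COLS and sorted descending on the Average column, e.g.
#   ["some-model", "FT", 45.6, 38.2, 55.1, 41.0, 48.1]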


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by rounding to two decimals and putting the max value in bold per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # apply per column (axis=0) so the best score for each benchmark is bolded
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    styler = styler.hide()
    return styler
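
# Example use (mirrors the app below): style_df(df).to_latex(convert_css=True)
# converts the "font-weight: bold;" CSS into \bfseries cells in the LaTeX export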


COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "str", "number", "number", "number", "number", "number"]

results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
            " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")

    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()