import json
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

from content import *

ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

# Metric read for each benchmark, parallel to BENCHMARKS: ARC, HellaSwag and MMLU
# report normalized accuracy ("acc_norm"); TruthfulQA reports the MC2 score
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"


@dataclass
class Result:
    train_type: str
    num_parameters: int
    arc: float = field(default=0.)
    hellaswag: float = field(default=0.)
    mmlu: float = field(default=0.)
    truthfulqa: float = field(default=0.)

    @cached_property
    def num_parameters_kmb(self) -> str:
        return convert_number_to_kmb(self.num_parameters)

    @cached_property
    def average(self) -> float:
        # parenthesize the sum: without it, only truthfulqa would be divided by 4
        return (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4
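
# Illustrative use (values hypothetical): Result(train_type="PT", num_parameters=7_000_000_000,
# arc=38.2, hellaswag=55.1, mmlu=41.0, truthfulqa=48.1) has .num_parameters_kmb == "7.0B"
# and .average of 45.6 (modulo float rounding)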


def convert_number_to_kmb(number: int) -> str:
    """
    Converts a number to a string with K, M or B suffix
    :param number: the number to convert
    :return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
    """
    if number >= 1_000_000_000:
        return f"{round(number / 1_000_000_000, 1)}B"
    elif number >= 1_000_000:
        return f"{round(number / 1_000_000, 1)}M"
    elif number >= 1_000:
        return f"{round(number / 1_000, 1)}K"
    else:
        return str(number)
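
# Illustrative examples: convert_number_to_kmb(7_000_000_000) -> "7.0B",
# convert_number_to_kmb(354_823) -> "354.8K", convert_number_to_kmb(42) -> "42"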


def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue

        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]

        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS

            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)
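
# A sketch of the eval-file shape this parser expects (the file name and scores
# below are hypothetical; only "results", "config" and "config.model_args" are required):
#
#   evals/arc/some-model.json
#   {
#     "results": {"arc_nl": {"acc_norm": 0.382}},
#     "config": {"model_args": "pretrained=some-org/some-model,dtype=float16"}
#   }
#
# which would yield performance_dict[("some-model", "nl")]["arc"] == 38.2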


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
    dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))

    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")

        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df
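
# Illustrative output (values hypothetical): one row per (model, language) pair,
# ordered as COLS and sorted descending on the Average column, e.g.
#   ["some-model", "FT", 45.6, 38.2, 55.1, 41.0, 48.1]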


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by rounding to two decimals and putting the max value in bold per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # apply per column (axis=0) so the best score for each benchmark is bolded
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    styler = styler.hide()
    return styler
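
# Example use (mirrors the app below): style_df(df).to_latex(convert_css=True)
# converts the "font-weight: bold;" CSS into \bfseries cells in the LaTeX export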


COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "str", "number", "number", "number", "number", "number"]

results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
            " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")

    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()