import json
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path
import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import *
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"
@dataclass
class Result:
train_type: str
num_parameters: int
arc: float = field(default=0.)
hellaswag: float = field(default=0.)
mmlu: float = field(default=0.)
truthfulqa: float = field(default=0.)
@cached_property
def num_parameters_kmb(self) -> str:
return convert_number_to_kmb(self.num_parameters)
@cached_property
def average(self) -> float:
        return (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4
def convert_number_to_kmb(number: int) -> str:
"""
Converts a number to a string with K, M or B suffix
:param number: the number to convert
:return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
"""
if number >= 1_000_000_000:
return f"{round(number / 1_000_000_000, 1)}B"
elif number >= 1_000_000:
return f"{round(number / 1_000_000, 1)}M"
elif number >= 1_000:
return f"{round(number / 1_000, 1)}K"
else:
return str(number)
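# Example: convert_number_to_kmb(6_700_000_000) -> "6.7B",
# convert_number_to_kmb(124_000_000) -> "124.0M", convert_number_to_kmb(512) -> "512".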
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
"""
Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
dictionaries of the form {benchmark_name: performance_score}
"""
performance_dict = defaultdict(dict)
for pfin in Path("evals").rglob("*.json"):
data = json.loads(pfin.read_text(encoding="utf-8"))
if "results" not in data or "config" not in data:
continue
results = data["results"]
config = data["config"]
if "model_args" not in config:
continue
model_args = config["model_args"].split(",")
pretrained = [x for x in model_args if x.startswith("pretrained=")]
if len(pretrained) != 1:
continue
pretrained = pretrained[0].split("=")[1]
pretrained = pretrained.split("/")[-1]
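        # Keys in "results" are of the form "<task>_<lang>", e.g. "arc_nl" for
        # the Dutch variant of ARC (cf. the `*_nl` note in the UI below).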
for lang_task, perfs in results.items():
task, lang = lang_task.split("_")
assert task in BENCHMARKS
if lang and task:
metric = METRICS[BENCHMARKS.index(task)]
p = round(perfs[metric] * 100, 1)
performance_dict[(pretrained, lang)][task] = p
return dict(performance_dict)
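# Illustrative shape of the returned mapping (model name hypothetical):
# {("gpt2-dutch", "nl"): {"arc": 25.0, "hellaswag": 30.1, "mmlu": 27.3, "truthfulqa": 41.2}}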
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
"""
Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
dictionaries of the form {benchmark_name: performance_score}
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
"""
data = []
dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
for (pretrained, lang), perfs in performance_dict.items():
arc_perf = perfs.get(ARC, 0.0)
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
mmlu_perf = perfs.get(MMLU, 0.0)
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
training_type = dutch_training_info.get(pretrained, "NA")
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
data.append(row)
df = pd.DataFrame.from_records(data, columns=COLS)
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
return df
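# Note: benchmarks without a score default to 0.0 (perfs.get above), which
# also drags down the computed average for models with incomplete results.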
def style_df(df: DataFrame) -> Styler:
"""
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
:param df: the dataframe to style
:return: the Styler
"""
styler = df.style.format("{:.2f}", subset=df.columns[2:])
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
styler = styler.hide()
return styler
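# Note: the bold styling survives the LaTeX export further below, since
# Styler.to_latex(convert_css=True) converts "font-weight: bold;" to \bfseries.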
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)️"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"
COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "number", "number", "number", "number", "number"]
results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
with gr.Blocks() as demo:
gr.HTML(TITLE)
gr.Markdown(INTRO_TEXT)
gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
gr.components.Dataframe(
value=original_df,
headers=COLS,
datatype=TYPES,
elem_id="leaderboard-table",
)
gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
" Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
gr.Markdown("## LaTeX")
gr.Code(styled_df.to_latex(convert_css=True))
gr.Markdown(CREDIT, elem_classes="markdown-text")
gr.Markdown(CITATION, elem_classes="markdown-text")
if __name__ == "__main__":
demo.launch()