File size: 3,565 Bytes
d1253a8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import os
import json
import glob
from collections import defaultdict
import gradio as gr
import glob
# Internal identifiers used as keys for the four supported benchmarks.
ARC = "arc_challenge"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa-mc"

# Each benchmark paired with the key of the score that is read from its
# results entry.  BENCHMARKS and METRICS are derived from this table so the
# two lists stay index-aligned (METRICS[i] belongs to BENCHMARKS[i]).
_BENCHMARK_METRICS = (
    (ARC, "acc_norm"),
    (HELLASWAG, "acc_norm"),
    (MMLU, "acc_norm"),
    (TRUTHFULQA, "mc2"),
)
BENCHMARKS = [benchmark for benchmark, _ in _BENCHMARK_METRICS]
METRICS = [metric for _, metric in _BENCHMARK_METRICS]
def _split_task_name(lang_task):
    """Split a result key into ``(language, benchmark)``.

    Recognised shapes are ``arc_<lang>_challenge``, ``hellaswag_<lang>``,
    ``mmlu_<lang>`` and ``truthfulqa_<lang>_mc``; the language is whatever
    sits between the fixed prefix and suffix.  Returns ``(None, None)`` for
    any other key, or when the language part is empty.
    """
    if lang_task.startswith('arc_') and lang_task.endswith('_challenge'):
        lang = lang_task[len('arc_'):-len('_challenge')]
        task = ARC
    elif lang_task.startswith('hellaswag_'):
        lang = lang_task[len('hellaswag_'):]
        task = HELLASWAG
    elif lang_task.startswith('mmlu_'):
        lang = lang_task[len('mmlu_'):]
        task = MMLU
    elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'):
        lang = lang_task[len('truthfulqa_'):-len('_mc')]
        task = TRUTHFULQA
    else:
        return None, None
    # A key such as "arc_challenge" matches the pattern but carries no language.
    if not lang:
        return None, None
    return lang, task


def collect_results():
    """Gather benchmark scores from every ``evals/*/*.json`` file.

    A file is skipped when it has no ``results`` or ``config`` entry, when
    ``config`` has no ``model_args``, or when ``model_args`` does not contain
    exactly one ``pretrained=...`` item.  The model name is the last
    ``/``-separated component of the ``pretrained`` value.

    Returns:
        A ``(performance_dict, pretrained_models)`` tuple.
        ``performance_dict`` maps ``(model_name, language)`` to a dict of
        ``{benchmark: score}`` where the score is the benchmark's metric
        multiplied by 100 and rounded to one decimal.
        ``pretrained_models`` is the set of model names seen.
    """
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    # Sorted so that, when two files report the same (model, language, task),
    # the one that wins is the same on every run.
    for file in sorted(glob.glob('evals/*/*.json')):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'results' not in data or 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue
        model_args = config['model_args'].split(',')
        pretrained = [x for x in model_args if x.startswith('pretrained=')]
        if len(pretrained) != 1:
            continue
        # Split on the first '=' only, so a value containing '=' stays intact.
        pretrained = pretrained[0].split('=', 1)[1].split('/')[-1]
        pretrained_models.add(pretrained)
        for lang_task, perfs in results.items():
            # Parsed afresh for every entry: an unrecognised key must never
            # inherit the language/benchmark of the entry before it.
            lang, task = _split_task_name(lang_task)
            if not (lang and task):
                continue
            metric = METRICS[BENCHMARKS.index(task)]
            if metric not in perfs:
                # The entry does not report the metric this benchmark uses.
                continue
            performance_dict[(pretrained, lang)][task] = round(perfs[metric] * 100, 1)
    return performance_dict, pretrained_models
def get_leaderboard_df(performance_dict, pretrained_models):
    """Turn collected scores into leaderboard rows.

    Args:
        performance_dict: mapping of ``(model_name, language)`` to a dict of
            ``{benchmark: score}``.
        pretrained_models: accepted for call compatibility; not read here.

    Returns:
        A list of rows ``[model, language, average, arc, hellaswag, mmlu,
        truthfulqa]``.  A missing benchmark counts as ``0.0``, and any
        (model, language) pair whose four scores multiply to zero is left out.
        The average is the mean of the four scores rounded to one decimal.
    """
    table = []
    for key, scores in performance_dict.items():
        model_name, language = key
        per_task = [
            scores.get(benchmark, 0.0)
            for benchmark in (ARC, HELLASWAG, MMLU, TRUTHFULQA)
        ]
        arc, hellaswag, mmlu, truthfulqa = per_task
        # A zero product means at least one benchmark is missing (or scored 0).
        if arc * hellaswag * mmlu * truthfulqa != 0:
            mean = round((arc + hellaswag + mmlu + truthfulqa) / 4, 1)
            table.append([model_name, language, mean, *per_task])
    return table
# Column headers shown in the leaderboard table, in display order.
MODEL_COL = "Model"
LANG_COL = "Language"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
# The HellaSwag and MMLU labels previously ended with an invisible U+FE0F
# variation selector, and the MMLU label had a doubled ")"; both are removed
# so the headers render cleanly and compare equal to their visible text.
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"

COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# Datatype of each column, index-aligned with COLS.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
# Build the leaderboard rows once, at import time, from the evaluation files.
args = collect_results()
leaderboard_df = get_leaderboard_df(*args)

# Assemble the page: title, introduction, a search box and the score table.
# NOTE(review): the nesting below is reconstructed from syntax (the search
# box inside the Box, the table directly under the Blocks) -- confirm
# against the deployed layout.
with gr.Blocks() as demo:
    gr.HTML('Open Multilingual Large Language Model Evaluation Leaderboard')
    gr.Markdown('INTRODUCTION TEXT', elem_classes="markdown-text")

    with gr.Box():
        # NOTE(review): no event handler is attached to this textbox in the
        # code visible here, so typing in it does not filter the table.
        search_bar = gr.Textbox(
            placeholder="Search models...",
            show_label=False,
            elem_id="search-bar",
        )

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df,
        headers=COLS,
        datatype=TYPES,
        max_rows=5,
        elem_id="leaderboard-table",
    )

demo.launch()
|