# File size: 4,206 Bytes
# (Web-viewer residue: the per-line blame hashes f90ad24 / b2c063a / 59c748f and the
# 1-129 line-number gutter captured here are not part of the program.)
import datetime
import glob
import json
import os
import shutil
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, Repository
from transformers import AutoConfig, AutoModel
# Settings for cloning / pulling the lmeh eval data.
# H4_TOKEN comes from the environment and is None when the variable is unset.
H4_TOKEN = os.environ.get("H4_TOKEN")
LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"

# METRICS[i] is the metric key read for BENCHMARKS[i]; parse_eval_result zips
# the two lists together, so they must stay the same length and order.
METRICS = [
    "acc_norm",
    "acc_norm",
    "acc_norm",
    "mc2",
]
BENCHMARKS = [
    "arc_challenge",
    "hellaswag",
    "hendrycks",
    "truthfulqa_mc",
]

# Benchmark id -> display name used as the key in EvalResult.to_dict() output.
BENCH_TO_NAME = {
    "arc_challenge": "ARC (25-shot) ⬆️",
    "hellaswag": "HellaSwag (10-shot) ⬆️",
    "hendrycks": "MMLU (5-shot) ⬆️",
    "truthfulqa_mc": "TruthQA (0-shot) ⬆️",
}
def make_clickable_model(model_name):
    """Return an HTML anchor pointing at ``https://huggingface.co/<model_name>``.

    The full name, including any ``org/`` prefix, is used both as the URL
    path and as the visible link text. The link opens in a new tab.
    """
    href = "".join(("https://huggingface.co/", model_name))
    anchor_style = "color: blue; text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{href}" style="{anchor_style}">{model_name}</a>'
def get_n_params(base_model, *, load_model=False):
    """Return the parameter count of *base_model* as a string, or ``"unknown"``.

    Counting parameters requires building the model from its config, which
    was flagged as high memory usage. The function previously returned
    ``"unknown"`` unconditionally and left the counting code unreachable;
    that default is kept, and the counting path is now an explicit opt-in.

    Args:
        base_model: Model identifier passed to ``AutoConfig.from_pretrained``.
        load_model: When true, instantiate the model from its config and
            count its parameters. Defaults to ``False``.

    Returns:
        The parameter count as a string, or ``"unknown"`` when counting is
        disabled or fails for any reason.
    """
    if not load_model:
        return "unknown"

    # WARNING: High memory usage
    # Retrieve the number of parameters from the configuration
    try:
        config = AutoConfig.from_pretrained(base_model, use_auth_token=True, low_cpu_mem_usage=True)
        n_params = AutoModel.from_config(config).num_parameters()
    except Exception as e:
        # Best effort: a missing parameter count must not break the caller.
        print(f"Error:{e} The number of parameters is not available in the config for the model '{base_model}'.")
        return "unknown"
    return str(n_params)
@dataclass
class EvalResult:
    """Scores collected for one evaluated model, keyed by benchmark id."""

    eval_name: str  # key that groups result files (built in parse_eval_result)
    org: Optional[str]  # None for models that have no org prefix
    model: str
    is_8bit: bool  # True when the result file sat in an "8bit" directory
    results: dict  # benchmark id -> score

    def to_dict(self):
        """Return this result as a flat dict with one entry per benchmark.

        The dict holds ``eval_name``, a clickable ``base_model`` link, the
        rounded ``total ⬆️`` of all available scores, ``# params``, and one
        entry per ``BENCH_TO_NAME`` display name. Benchmarks without a score
        map to ``None``.

        ``self.results`` is left untouched. Earlier code wrote ``None``
        placeholders into it, which made a second call fail inside ``sum``.
        """
        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        # Missing scores must not take part in the total.
        scores = [score for score in self.results.values() if score is not None]

        data_dict = {
            "eval_name": self.eval_name,
            "base_model": make_clickable_model(base_model),
            "total ⬆️": round(sum(scores), 3),
            "# params": get_n_params(base_model),
        }
        for benchmark, column in BENCH_TO_NAME.items():
            data_dict[column] = self.results.get(benchmark)
        return data_dict
def parse_eval_result(json_filepath: str) -> Tuple[str, Optional["EvalResult"]]:
    """Parse one evaluation JSON file into a ``(result_key, EvalResult)`` pair.

    The path is split on ``/``. The last component is the file name, the one
    before it is a variant directory (``"8bit"`` marks 8-bit results), then
    the model, then (unless the path has exactly five components) the org.

    The benchmark is identified from the file name only. Matching against the
    whole path would mis-attribute scores whenever an org or model directory
    happens to contain a benchmark name.

    Args:
        json_filepath: ``/``-separated path to a JSON file whose ``"results"``
            mapping holds per-task dicts containing the benchmark's metric.

    Returns:
        ``(result_key, eval_result)``. ``eval_result`` is ``None`` when the
        file name contains none of the names in ``BENCHMARKS``.

    Raises:
        IndexError: If the path has fewer components than the layout needs.
        KeyError: If ``"results"`` or the expected metric is missing.
    """
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    path_split = json_filepath.split("/")
    filename = path_split[-1]
    model = path_split[-3]
    is_8bit = path_split[-2] == "8bit"

    if len(path_split) == 5:
        # handles gpt2 type models that don't have an org
        org = None
        result_key = f"{path_split[-3]}_{path_split[-2]}"
    else:
        org = path_split[-4]
        result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"

    eval_result = None
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        if benchmark in filename:
            # Average the metric over every task reported in the file.
            accs = np.array([task[metric] for task in data["results"].values()])
            mean_acc = round(np.mean(accs), 3)
            eval_result = EvalResult(result_key, org, model, is_8bit, {benchmark: mean_acc})
    return result_key, eval_result
def get_eval_results() -> List["EvalResult"]:
    """Collect every result file under ``evals/eval_results`` into EvalResults.

    Files that share a result key are merged into one ``EvalResult`` by
    updating its ``results`` dict. Files for which ``parse_eval_result``
    finds no benchmark (it returns ``None``) are skipped; previously that
    ``None`` was stored or dereferenced and raised ``AttributeError``.

    Returns:
        One ``EvalResult`` per result key, in sorted-path order of first
        appearance. An empty list when no JSON files are found.
    """
    # glob gives no ordering guarantee; sort so the output order is stable.
    json_filepaths = sorted(glob.glob("evals/eval_results/**/*.json", recursive=True))

    merged = {}
    for json_filepath in json_filepaths:
        result_key, eval_result = parse_eval_result(json_filepath)
        if eval_result is None:
            continue
        if result_key in merged:
            merged[result_key].results.update(eval_result.results)
        else:
            merged[result_key] = eval_result
    return list(merged.values())
def get_eval_results_dicts() -> List[Dict]:
    """Return the ``to_dict()`` form of every result from ``get_eval_results``."""
    rows = []
    for result in get_eval_results():
        rows.append(result.to_dict())
    return rows
# Collect all evaluation rows at import time and print them.
eval_results_dict = get_eval_results_dicts()
print(eval_results_dict)