Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
·
f98b171
1
Parent(s):
663521e
Add aggregate_nlp_metrics.py and more precise score.csv
Browse files
- data/score.csv +19 -20
- scripts/aggregate_nlp_metrics.py +44 -0
data/score.csv
CHANGED
@@ -1,21 +1,20 @@
|
|
1 |
model,arc,hellaswag,truthfulqa
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
metaai/llama-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
togethercomputer/RedPajama-INCITE-7B-Chat,42.2,70.8,36.1
|
|
|
1 |
model,arc,hellaswag,truthfulqa
|
2 |
+
BAIR/koala-13b,52.901023890784984,77.54431388169687,50.091065219059125
|
3 |
+
BAIR/koala-7b,47.098976109215016,73.70045807608047,45.997635958147875
|
4 |
+
lmsys/vicuna-7B,53.49829351535836,77.53435570603465,48.997614637055264
|
5 |
+
metaai/llama-13B,56.31399317406144,80.8603863772157,39.90298264801161
|
6 |
+
tatsu-lab/alpaca-7B,52.64505119453925,76.90699063931487,39.552770976749336
|
7 |
+
OpenAssistant/oasst-sft-1-pythia-12b,45.563139931740615,69.92630950009958,39.1893543136912
|
8 |
+
databricks/dolly-v2-12b,42.15017064846416,71.82832105158336,33.37136000408915
|
9 |
+
h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,36.86006825938566,61.551483768173675,37.9421602393762
|
10 |
+
lmsys/fastchat-t5-3b-v1.0,35.92150170648464,46.355307707627965,48.787610045893985
|
11 |
+
nomic-ai/gpt4all-13b-snoozy,56.058020477815695,78.68950408285203,48.35948664919701
|
12 |
+
openaccess-ai-collective/manticore-13b-chat-pyg,58.703071672354945,81.95578570005975,48.86009773651491
|
13 |
+
lmsys/vicuna-13B,52.901023890784984,80.12348137821151,51.81653185716687
|
14 |
+
metaai/llama-7B,51.10921501706485,77.74347739494125,34.0786227034917
|
15 |
+
StabilityAI/stablelm-tuned-alpha-7b,31.91126279863481,53.59490141406095,40.22458364155103
|
16 |
+
project-baize/baize-v2-7B,48.4641638225256,75.00497908783112,41.66264911575524
|
17 |
+
FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.084372288512725
|
18 |
+
camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
|
19 |
+
Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
|
20 |
+
togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
|
|
scripts/aggregate_nlp_metrics.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
|
4 |
+
import tyro
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
# Metric read from each lm-evaluation-harness result, keyed by task name.
TASK_METRICS = {
    "arc_challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa_mc": "mc2",
}

# Column name used in the output CSV for each task.
TASK_SHORT_NAMES = {
    "arc_challenge": "arc",
    "hellaswag": "hellaswag",
    "truthfulqa_mc": "truthfulqa",
}


def main(data_dir: str, out_file: str = "score.csv") -> None:
    """Aggregate results from lm-evaluation-harness into a CSV file.

    Every immediate subdirectory of `data_dir` is treated as one model. For
    each task in `TASK_METRICS`, the JSON file named after the task inside
    that subdirectory is read, the configured metric is taken from
    `results[task][metric]`, and it is scaled by 100 before being written.

    Args:
        data_dir: The directory containing the results. Model names are
            expected to be the immediate subdirectories of `data_dir`.
        out_file: The path to the output CSV file. (Default: `score.csv`)

    Raises:
        FileNotFoundError: If `data_dir` or a task result file is missing.
        KeyError: If a result file lacks the expected task or metric entry.
    """
    # Non-directory entries directly under `data_dir` are ignored.
    model_dirs = [
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    ]

    # Keyed by model name so that two directories resolving to the same name
    # share a single row, with later values overwriting earlier ones.
    scores = {}
    for model_dir in model_dirs:
        # The last two "--"-separated components become "<org>/<model>".
        model_name = "/".join(model_dir.split("--")[-2:])
        row = scores.setdefault(model_name, {})
        for task, metric in TASK_METRICS.items():
            result_path = os.path.join(data_dir, model_dir, task)
            # Use a context manager so the file handle is closed promptly.
            with open(result_path, encoding="utf-8") as result_file:
                results = json.load(result_file)
            row[TASK_SHORT_NAMES[task]] = float(results["results"][task][metric]) * 100.0

    records = [{"model": name, **row} for name, row in scores.items()]
    # Passing `columns` fixes the column order and keeps the header when
    # no model directories were found.
    df = pd.DataFrame(records, columns=["model", *TASK_SHORT_NAMES.values()])

    # Write the CSV file.
    if dirname := os.path.dirname(out_file):
        os.makedirs(dirname, exist_ok=True)
    df.to_csv(out_file, index=False)


if __name__ == "__main__":
    tyro.cli(main)
|