Jae-Won Chung committed on
Commit
f98b171
·
1 Parent(s): 663521e

Add aggregate_nlp_metrics.py and more precise score.csv

Browse files
Files changed (2) hide show
  1. data/score.csv +19 -20
  2. scripts/aggregate_nlp_metrics.py +44 -0
data/score.csv CHANGED
@@ -1,21 +1,20 @@
1
  model,arc,hellaswag,truthfulqa
2
- lmsys/vicuna-7B,53.5,77.5,49.0
3
- lmsys/vicuna-13B,52.9,80.1,51.8
4
- tatsu-lab/alpaca-7B,52.6,76.9,39.6
5
- metaai/llama-7B,51.1,77.7,34.1
6
- metaai/llama-13B,56.3,80.9,39.9
7
- camel-ai/CAMEL-13B-Combined-Data,55.5,79.3,47.3
8
- BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth,NaN,NaN,NaN
9
- databricks/dolly-v2-12b,42.2,71.8,33.4
10
- FreedomIntelligence/phoenix-inst-chat-7b,45.0,63.2,47.1
11
- h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,36.9,61.6,37.9
12
- lmsys/fastchat-t5-3b-v1.0,35.9,46.4,48.8
13
- Neutralzz/BiLLa-7B-SFT,27.7,26.0,49.0
14
- nomic-ai/gpt4all-13b-snoozy,56.1,78.7,48.4
15
- openaccess-ai-collective/manticore-13b-chat-pyg,58.7,82.0,48.9
16
- OpenAssistant/oasst-sft-1-pythia-12b,45.6,69.9,39.2
17
- project-baize/baize-v2-7B,48.5,75.0,41.7
18
- BAIR/koala-7b,47.1,73.7,46.0
19
- BAIR/koala-13b,52.9,77.5,50.1
20
- StabilityAI/stablelm-tuned-alpha-7b,31.9,53.6,40.2
21
- togethercomputer/RedPajama-INCITE-7B-Chat,42.2,70.8,36.1
 
1
  model,arc,hellaswag,truthfulqa
2
+ BAIR/koala-13b,52.901023890784984,77.54431388169687,50.091065219059125
3
+ BAIR/koala-7b,47.098976109215016,73.70045807608047,45.997635958147875
4
+ lmsys/vicuna-7B,53.49829351535836,77.53435570603465,48.997614637055264
5
+ metaai/llama-13B,56.31399317406144,80.8603863772157,39.90298264801161
6
+ tatsu-lab/alpaca-7B,52.64505119453925,76.90699063931487,39.552770976749336
7
+ OpenAssistant/oasst-sft-1-pythia-12b,45.563139931740615,69.92630950009958,39.1893543136912
8
+ databricks/dolly-v2-12b,42.15017064846416,71.82832105158336,33.37136000408915
9
+ h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,36.86006825938566,61.551483768173675,37.9421602393762
10
+ lmsys/fastchat-t5-3b-v1.0,35.92150170648464,46.355307707627965,48.787610045893985
11
+ nomic-ai/gpt4all-13b-snoozy,56.058020477815695,78.68950408285203,48.35948664919701
12
+ openaccess-ai-collective/manticore-13b-chat-pyg,58.703071672354945,81.95578570005975,48.86009773651491
13
+ lmsys/vicuna-13B,52.901023890784984,80.12348137821151,51.81653185716687
14
+ metaai/llama-7B,51.10921501706485,77.74347739494125,34.0786227034917
15
+ StabilityAI/stablelm-tuned-alpha-7b,31.91126279863481,53.59490141406095,40.22458364155103
16
+ project-baize/baize-v2-7B,48.4641638225256,75.00497908783112,41.66264911575524
17
+ FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.084372288512725
18
+ camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
19
+ Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
20
+ togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
 
scripts/aggregate_nlp_metrics.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import tyro
5
+ import pandas as pd
6
+
7
# For each lm-evaluation-harness task, the metric key that is read out of the
# task's result JSON. Insertion order is the order tasks are processed in.
TASK_METRICS = dict(
    arc_challenge="acc_norm",
    hellaswag="acc_norm",
    truthfulqa_mc="mc2",
)

# For each task, the column name its score is stored under in the output CSV.
# Insertion order determines the CSV column order.
TASK_SHORT_NAMES = dict(
    arc_challenge="arc",
    hellaswag="hellaswag",
    truthfulqa_mc="truthfulqa",
)
18
+
19
+
20
def main(data_dir: str, out_file: str = "score.csv") -> None:
    """Aggregate results from lm-evaluation-harness into a CSV file.

    For every immediate subdirectory of `data_dir`, one JSON file per task in
    `TASK_METRICS` is read (the file is named after the task), the configured
    metric is taken from `results[task][metric]`, multiplied by 100, and
    stored under the task's short name. The resulting table has one row per
    model and a leading `model` column.

    Args:
        data_dir: The directory containing the results. Model names are
            expected to be the immediate subdirectories of `data_dir`.
        out_file: The path to the output CSV file. (Default: `score.csv`)

    Raises:
        FileNotFoundError: If `data_dir` or an expected task file is missing.
        KeyError: If a result file lacks the expected task or metric entry.
    """
    models = [
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    ]

    df = pd.DataFrame(columns=list(TASK_SHORT_NAMES.values()))
    for model_dir in models:
        # The model name is the last two `--`-separated components of the
        # directory name, joined with `/`. It does not depend on the task,
        # so compute it once per model.
        model_name = "/".join(model_dir.split("--")[-2:])
        for task, metric in TASK_METRICS.items():
            result_path = os.path.join(data_dir, model_dir, task)
            # Use a context manager so the file handle is closed promptly.
            with open(result_path, encoding="utf-8") as result_file:
                results = json.load(result_file)
            score = float(results["results"][task][metric]) * 100.0
            df.loc[model_name, TASK_SHORT_NAMES[task]] = score
    df = df.reset_index().rename(columns={"index": "model"})

    # Write the CSV file, creating the parent directory when one is given.
    if dirname := os.path.dirname(out_file):
        os.makedirs(dirname, exist_ok=True)
    df.to_csv(out_file, index=False)
42
+
43
# Script entry point: hand `main` to tyro's command-line interface.
if __name__ == "__main__":
    tyro.cli(main)