Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
·
e3571c1
1
Parent(s):
97b5f1c
Clean up
Browse files
- Dockerfile +1 -1
- README.md +6 -5
- extract.py +0 -69
- leaderboard_1.csv +0 -5
- leaderboard_2.csv +0 -5
- leaderboard_3.csv +0 -5
- models.txt +20 -0
- running_command.sh +0 -27
- benchmark.py → scripts/benchmark.py +2 -1
- scripts/compute_metrics.py +25 -0
- sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json +0 -0
Dockerfile
CHANGED
@@ -11,7 +11,7 @@ RUN apt-get update -qq \
|
|
11 |
&& apt-get clean all \
|
12 |
&& rm -r /var/lib/apt/lists/*
|
13 |
|
14 |
-
# Install Miniconda3
|
15 |
ENV PATH="/root/.local/miniconda3/bin:$PATH"
|
16 |
RUN mkdir -p /root/.local \
|
17 |
&& wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
|
|
|
11 |
&& apt-get clean all \
|
12 |
&& rm -r /var/lib/apt/lists/*
|
13 |
|
14 |
+
# Install Miniconda3 23.3.1
|
15 |
ENV PATH="/root/.local/miniconda3/bin:$PATH"
|
16 |
RUN mkdir -p /root/.local \
|
17 |
&& wget https://repo.anaconda.com/miniconda/Miniconda3-py39_23.3.1-0-Linux-x86_64.sh \
|
README.md
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
-
# ML.ENERGY Leaderboard
|
|
|
|
|
2 |
|
3 |
## Devs
|
4 |
|
5 |
-
|
6 |
|
7 |
1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
|
8 |
-
|
9 |
2. Let's share the Huggingface Transformer cache:
|
10 |
|
11 |
```bash
|
@@ -19,6 +20,6 @@ $ docker build -t leaderboard:latest .
|
|
19 |
$ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
|
20 |
|
21 |
# cd leaderboard
|
22 |
-
# python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
23 |
-
# python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
24 |
```
|
|
|
1 |
+
<h1><a href="https://ml.energy" style="color: #27cb63; text-decoration: none">ML.ENERGY</a> Leaderboard</h1>
|
2 |
+
|
3 |
+
How much energy do LLMs consume?
|
4 |
|
5 |
## Devs
|
6 |
|
7 |
+
Current setup in `ampere02`:
|
8 |
|
9 |
1. Find model weights in `/data/leaderboard/weights/`, e.g. subdirectory `llama` and `vicuna`.
|
|
|
10 |
2. Let's share the Huggingface Transformer cache:
|
11 |
|
12 |
```bash
|
|
|
20 |
$ docker run -it --name jw-leaderboard --gpus all --cap-add SYS_ADMIN -v /data/leaderboard:/data/leaderboard -v $HOME/workspace/leaderboard:/workspace/leaderboard leaderboard:latest bash
|
21 |
|
22 |
# cd leaderboard
|
23 |
+
# python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
24 |
+
# python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
25 |
```
|
extract.py
DELETED
@@ -1,69 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import json
|
3 |
-
import numpy as np
|
4 |
-
import statistics
|
5 |
-
import os
|
6 |
-
import csv
|
7 |
-
|
8 |
-
model = []
|
9 |
-
throughput = []
|
10 |
-
response_length = []
|
11 |
-
latency = []
|
12 |
-
energy = []
|
13 |
-
|
14 |
-
temp_throughput = []
|
15 |
-
temp_response_length = []
|
16 |
-
temp_latency = []
|
17 |
-
temp_energy = []
|
18 |
-
|
19 |
-
model_name = os.listdir("data/chat")
|
20 |
-
|
21 |
-
match_name = False
|
22 |
-
|
23 |
-
for models in model_name:
|
24 |
-
with open("data/chat/"+models+"/benchmark.json", 'r') as file:
|
25 |
-
json_data = json.load(file)
|
26 |
-
|
27 |
-
for obj in json_data:
|
28 |
-
if not match_name:
|
29 |
-
name = str(obj["model"])
|
30 |
-
model.append(name.replace('--','/'))
|
31 |
-
match_name = True
|
32 |
-
temp_throughput.append(float(obj["throughput"]))
|
33 |
-
temp_response_length.append(float(obj["response_length"]))
|
34 |
-
temp_latency.append(float(obj["latency"]))
|
35 |
-
temp_energy.append(float(obj["energy"]))
|
36 |
-
|
37 |
-
match_name = False
|
38 |
-
|
39 |
-
throughput.append(temp_throughput.copy())
|
40 |
-
response_length.append(temp_response_length.copy())
|
41 |
-
latency.append(temp_latency.copy())
|
42 |
-
energy.append(temp_energy.copy())
|
43 |
-
|
44 |
-
temp_throughput.clear()
|
45 |
-
temp_response_length.clear()
|
46 |
-
temp_latency.clear()
|
47 |
-
temp_energy.clear()
|
48 |
-
|
49 |
-
|
50 |
-
avg_throughput = [statistics.mean(row) for row in throughput]
|
51 |
-
avg_response_length = [statistics.mean(row) for row in response_length]
|
52 |
-
avg_latency = [statistics.mean(row) for row in latency]
|
53 |
-
avg_energy = [statistics.mean(row) for row in energy]
|
54 |
-
|
55 |
-
for i in range(len(model)):
|
56 |
-
print(model[i])
|
57 |
-
print(len(throughput[i]))
|
58 |
-
print(len(response_length[i]))
|
59 |
-
print(len(latency[i]))
|
60 |
-
print(len(energy[i]))
|
61 |
-
|
62 |
-
csv_file = "leaderboard.csv"
|
63 |
-
|
64 |
-
with open(csv_file, "w", newline="") as file:
|
65 |
-
writer = csv.writer(file)
|
66 |
-
writer.writerow(["model","throughput","response_length","latency","energy"])
|
67 |
-
for i in range(len(model)):
|
68 |
-
writer.writerow([model[i], avg_throughput[i], avg_response_length[i], avg_latency[i], avg_energy[i]])
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_1.csv
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
model,score,throughput,response_length,latency,energy
|
2 |
-
lmsys/vicuna-7B,1000,30.08236985276053,283.0862995298858,9.431178230227955,2271.4826004029537
|
3 |
-
lmsys/vicuna-13B,1000,17.509990378755237,281.76623376623377,16.124334009682688,4283.697810470779
|
4 |
-
tatsu-lab/alpaca-7B,1000,30.09713731797294,125.20013431833445,4.129986896187982,916.045386501007
|
5 |
-
metaai/llama-7B,1000,25.768609507174105,64.59032907991941,2.284814629996714,525.7081235728675
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_2.csv
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
model,score,throughput,response_length,latency,energy
|
2 |
-
metaai/llama-13B,1000,15.699146010424393,80.32236400268637,4.757332595030835,1293.689832437891
|
3 |
-
camel-ai/CAMEL-13B-Combined-Data,1000,17.408929446926095,292.3656943839791,16.840487937994777,4481.158658249824
|
4 |
-
BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth,1000,33.10830960148045,243.21793149764943,6.9481068778416555,1833.7241615177682
|
5 |
-
databricks/dolly-v2-12b,1000,15.597444626791148,148.3270651443922,9.168758730287117,2362.087664204047
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_3.csv
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
model,score,throughput,response_length,latency,energy
|
2 |
-
FreedomIntelligence/phoenix-inst-chat-7b,1000,32.663340053939855,243.14909335124244,7.271332307256473,2149.2483156478947
|
3 |
-
h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,1000,28.851651162429675,216.66286098052385,7.544740398256815,1636.1981326393268
|
4 |
-
lmsys/fastchat-t5-3b-v1.0,1000,17.78202422600336,313.22527472527474,23.570470748014376,2255.7007728936983
|
5 |
-
Neutralzz/BiLLa-7B-SFT,1000,29.49201862368961,159.29986568166555,5.443799112468728,1218.644757555166
|
|
|
|
|
|
|
|
|
|
|
|
models.txt
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/data/leaderboard/weights/metaai/llama-7B
|
2 |
+
/data/leaderboard/weights/metaai/llama-13B
|
3 |
+
/data/leaderboard/weights/lmsys/vicuna-7B
|
4 |
+
/data/leaderboard/weights/lmsys/vicuna-13B
|
5 |
+
/data/leaderboard/weights/tatsu-lab/alpaca-7B
|
6 |
+
/data/leaderboard/weights/BAIR/koala-7b
|
7 |
+
/data/leaderboard/weights/BAIR/koala-13b
|
8 |
+
/data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth
|
9 |
+
camel-ai/CAMEL-13B-Combined-Data
|
10 |
+
databricks/dolly-v2-12b
|
11 |
+
FreedomIntelligence/phoenix-inst-chat-7b
|
12 |
+
h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2
|
13 |
+
lmsys/fastchat-t5-3b-v1.0
|
14 |
+
Neutralzz/BiLLa-7B-SFT
|
15 |
+
nomic-ai/gpt4all-13b-snoozy
|
16 |
+
openaccess-ai-collective/manticore-13b-chat-pyg
|
17 |
+
OpenAssistant/oasst-sft-1-pythia-12b
|
18 |
+
project-baize/baize-v2-7B
|
19 |
+
StabilityAI/stablelm-tuned-alpha-7b
|
20 |
+
togethercomputer/RedPajama-INCITE-7B-Chat
|
running_command.sh
DELETED
@@ -1,27 +0,0 @@
|
|
1 |
-
#!/bin/bash
|
2 |
-
|
3 |
-
# node with four gpus
|
4 |
-
python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
5 |
-
python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
|
6 |
-
python benchmark.py --model-path /data/leaderboard/weights/tatsu-lab/alpaca-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
|
7 |
-
python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
|
8 |
-
|
9 |
-
python benchmark.py --model-path /data/leaderboard/weights/metaai/llama-13B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
10 |
-
python benchmark.py --model-path camel-ai/CAMEL-13B-Combined-Data --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
|
11 |
-
python benchmark.py --model-path /data/leaderboard/weights/BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
|
12 |
-
python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
|
13 |
-
|
14 |
-
python benchmark.py --model-path FreedomIntelligence/phoenix-inst-chat-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
15 |
-
python benchmark.py --model-path h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
|
16 |
-
python benchmark.py --model-path lmsys/fastchat-t5-3b-v1.0 --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
|
17 |
-
python benchmark.py --model-path Neutralzz/BiLLa-7B-SFT --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
|
18 |
-
|
19 |
-
python benchmark.py --model-path nomic-ai/gpt4all-13b-snoozy --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
20 |
-
python benchmark.py --model-path openaccess-ai-collective/manticore-13b-chat-pyg --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
|
21 |
-
python benchmark.py --model-path OpenAssistant/oasst-sft-1-pythia-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
|
22 |
-
python benchmark.py --model-path project-baize/baize-v2-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
|
23 |
-
|
24 |
-
python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
|
25 |
-
python benchmark.py --model-path /data/leaderboard/weights/BAIR/koala-13b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 1
|
26 |
-
python benchmark.py --model-path StabilityAI/stablelm-tuned-alpha-7b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 2
|
27 |
-
python benchmark.py --model-path togethercomputer/RedPajama-INCITE-7B-Chat --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --device-index 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmark.py → scripts/benchmark.py
RENAMED
@@ -40,7 +40,7 @@ SYSTEM_PROMPTS = {
|
|
40 |
|
41 |
def main(
|
42 |
model_path: str,
|
43 |
-
input_file: str,
|
44 |
output_dir: str = "data",
|
45 |
device_index: int = 0,
|
46 |
task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
|
@@ -54,6 +54,7 @@ def main(
|
|
54 |
Args:
|
55 |
model_path: Path to or Huggingface Hub Id of the model.
|
56 |
input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
|
|
|
57 |
output_dir: Path to the output directory. (Default: "data")
|
58 |
device_index: Index of the GPU to use for inference. (Default: 0)
|
59 |
task: Type of task to perform inference on. (Default: "chat")
|
|
|
40 |
|
41 |
def main(
|
42 |
model_path: str,
|
43 |
+
input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
|
44 |
output_dir: str = "data",
|
45 |
device_index: int = 0,
|
46 |
task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
|
|
|
54 |
Args:
|
55 |
model_path: Path to or Huggingface Hub Id of the model.
|
56 |
input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
|
57 |
+
(Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json")
|
58 |
output_dir: Path to the output directory. (Default: "data")
|
59 |
device_index: Index of the GPU to use for inference. (Default: 0)
|
60 |
task: Type of task to perform inference on. (Default: "chat")
|
scripts/compute_metrics.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import csv
|
3 |
+
|
4 |
+
import tyro
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
|
8 |
+
def main(data_dir: str, out_file: str) -> None:
|
9 |
+
"""Compute metrics for all models in the given directory."""
|
10 |
+
model_names = os.listdir(data_dir)
|
11 |
+
print(f"{model_names=}")
|
12 |
+
|
13 |
+
out_csv = csv.writer(open(out_file, "w", newline=""))
|
14 |
+
metrics = ["throughput", "response_length", "latency", "energy"]
|
15 |
+
out_csv.writerow(["model"] + metrics)
|
16 |
+
|
17 |
+
for model_name in model_names:
|
18 |
+
df = pd.read_json(f"{data_dir}/{model_name}/benchmark.json")
|
19 |
+
out_csv.writerow(
|
20 |
+
[model_name.replace("--", "/")] + df[metrics].mean().to_list(),
|
21 |
+
)
|
22 |
+
|
23 |
+
|
24 |
+
if __name__ == "__main__":
|
25 |
+
tyro.cli(main)
|
sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|