Spaces:
Running
Running
Zhiyu Wu
committed on
Add llama2, sort ShareGPT dataset by length (#18)
Browse files- README.md +2 -2
- data/A40_chat-concise_benchmark.csv +2 -0
- data/A40_chat_benchmark.csv +2 -0
- data/A40_instruct-concise_benchmark.csv +2 -0
- data/A40_instruct_benchmark.csv +2 -0
- data/score.csv +2 -0
- pegasus/benchmark.yaml +1 -1
- requirements-benchmark.txt +1 -1
- scripts/benchmark.py +12 -6
- sharegpt/README.md +5 -0
- sharegpt/{sg_90k_part1_html_cleaned_lang_first_sampled.json → sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json} +0 -0
README.md
CHANGED
@@ -52,6 +52,6 @@ We run benchmarks using multiple nodes and GPUs using [Pegasus](https://github.c
|
|
52 |
You can still run benchmarks without Pegasus like this:
|
53 |
|
54 |
```console
|
55 |
-
$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/
|
56 |
-
$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/
|
57 |
```
|
|
|
52 |
You can still run benchmarks without Pegasus like this:
|
53 |
|
54 |
```console
|
55 |
+
$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
|
56 |
+
$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
|
57 |
```
|
data/A40_chat-concise_benchmark.csv
CHANGED
@@ -19,3 +19,5 @@ metaai/llama-7B,25.80475014752762,63.463734049697784,2.2525196486312047,539.0479
|
|
19 |
Neutralzz/BiLLa-7B-SFT,29.382300021941255,141.6155137676293,4.84122748247456,1131.9990564138398
|
20 |
openaccess-ai-collective/manticore-13b-chat-pyg,17.220798012743607,268.91269308260576,15.692034786355059,4051.8244570182064
|
21 |
FreedomIntelligence/phoenix-inst-chat-7b,32.33242374435414,229.95869711215582,6.910495058340042,2049.7076356614534
|
|
|
|
|
|
19 |
Neutralzz/BiLLa-7B-SFT,29.382300021941255,141.6155137676293,4.84122748247456,1131.9990564138398
|
20 |
openaccess-ai-collective/manticore-13b-chat-pyg,17.220798012743607,268.91269308260576,15.692034786355059,4051.8244570182064
|
21 |
FreedomIntelligence/phoenix-inst-chat-7b,32.33242374435414,229.95869711215582,6.910495058340042,2049.7076356614534
|
22 |
+
metaai/Llama-2-13b-chat-hf,16.934647828854768,358.7941571524513,20.990738735323337,3942.400414707617
|
23 |
+
metaai/Llama-2-7b-chat-hf,31.733044836542074,402.6699126930826,12.569092892522697,2398.9215396235386
|
data/A40_chat_benchmark.csv
CHANGED
@@ -19,3 +19,5 @@ BAIR/koala-7b,29.723806931945834,260.7196104768301,8.720630589929986,2017.329562
|
|
19 |
BAIR/koala-13b,17.451436035057224,262.5295500335796,15.030911340299886,3827.6102800537265
|
20 |
StabilityAI/stablelm-tuned-alpha-7b,26.413142361637988,255.34687709872398,9.454673889303727,2319.91146675621
|
21 |
togethercomputer/RedPajama-INCITE-7B-Chat,21.410571862447824,279.5094022834117,12.506414288534286,2541.441298522497
|
|
|
|
|
|
19 |
BAIR/koala-13b,17.451436035057224,262.5295500335796,15.030911340299886,3827.6102800537265
|
20 |
StabilityAI/stablelm-tuned-alpha-7b,26.413142361637988,255.34687709872398,9.454673889303727,2319.91146675621
|
21 |
togethercomputer/RedPajama-INCITE-7B-Chat,21.410571862447824,279.5094022834117,12.506414288534286,2541.441298522497
|
22 |
+
metaai/Llama-2-13b-chat-hf,16.95804416983929,384.7333781061115,22.55271715111622,4337.670243116255
|
23 |
+
metaai/Llama-2-7b-chat-hf,31.922994116700572,428.19341840161184,13.367807321468502,2556.7166067830576
|
data/A40_instruct-concise_benchmark.csv
CHANGED
@@ -19,3 +19,5 @@ Neutralzz/BiLLa-7B-SFT,29.118626503392385,104.97817327065144,3.5443721553023035,
|
|
19 |
nomic-ai/gpt4all-13b-snoozy,17.423064750595767,135.3938885157824,7.734149922101941,1871.6546057756862
|
20 |
project-baize/baize-v2-7B,28.13796712305154,262.9902619207522,9.250474432119292,2105.324460711873
|
21 |
lmsys/fastchat-t5-3b-v1.0,40.20822673632634,281.74110141034254,10.492163513616964,1110.3276249158694
|
|
|
|
|
|
19 |
nomic-ai/gpt4all-13b-snoozy,17.423064750595767,135.3938885157824,7.734149922101941,1871.6546057756862
|
20 |
project-baize/baize-v2-7B,28.13796712305154,262.9902619207522,9.250474432119292,2105.324460711873
|
21 |
lmsys/fastchat-t5-3b-v1.0,40.20822673632634,281.74110141034254,10.492163513616964,1110.3276249158694
|
22 |
+
metaai/Llama-2-13b-chat-hf,16.753336372767794,223.39019476158495,12.93183804940574,2423.302869711249
|
23 |
+
metaai/Llama-2-7b-chat-hf,30.95799874634315,220.83680322364003,6.815573463441101,1288.2125369376631
|
data/A40_instruct_benchmark.csv
CHANGED
@@ -19,3 +19,5 @@ lmsys/fastchat-t5-3b-v1.0,31.014371537480102,357.13734049697786,17.9643423938542
|
|
19 |
nomic-ai/gpt4all-13b-snoozy,17.558360268154225,232.67461383478846,13.290953806575821,3411.2449123573792
|
20 |
BAIR/koala-13b,17.468010116614902,254.08529214237743,14.4913390549458,3858.416870718604
|
21 |
metaai/llama-7B,26.40244189851013,104.19308260577569,3.608983782098236,864.4181752854275
|
|
|
|
|
|
19 |
nomic-ai/gpt4all-13b-snoozy,17.558360268154225,232.67461383478846,13.290953806575821,3411.2449123573792
|
20 |
BAIR/koala-13b,17.468010116614902,254.08529214237743,14.4913390549458,3858.416870718604
|
21 |
metaai/llama-7B,26.40244189851013,104.19308260577569,3.608983782098236,864.4181752854275
|
22 |
+
metaai/Llama-2-13b-chat-hf,16.999960399598052,371.56312961719277,21.688517364074986,4210.194823371436
|
23 |
+
metaai/Llama-2-7b-chat-hf,31.815139493955602,365.40362659503023,11.316028104293823,2180.2478049026786
|
data/score.csv
CHANGED
@@ -18,3 +18,5 @@ FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.
|
|
18 |
camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
|
19 |
Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
|
20 |
togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
|
|
|
|
|
|
18 |
camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
|
19 |
Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
|
20 |
togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
|
21 |
+
metaai/Llama-2-7b-chat-hf,52.73037542662116,78.48038239394542,45.32519554457334
|
22 |
+
metaai/Llama-2-13b-chat-hf,59.129692832764505,81.94582752439753,43.9572591900371
|
pegasus/benchmark.yaml
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
# {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
|
4 |
# determines the specific node and gpu the generated job command will run on.
|
5 |
- command:
|
6 |
-
- docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/
|
7 |
model:
|
8 |
- /data/leaderboard/weights/metaai/llama-7B
|
9 |
- /data/leaderboard/weights/metaai/llama-13B
|
|
|
3 |
# {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
|
4 |
# determines the specific node and gpu the generated job command will run on.
|
5 |
- command:
|
6 |
+
- docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json --model-path {{ model }} --task {{ task }}
|
7 |
model:
|
8 |
- /data/leaderboard/weights/metaai/llama-7B
|
9 |
- /data/leaderboard/weights/metaai/llama-13B
|
requirements-benchmark.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
zeus-ml==0.4.0
|
2 |
-
fschat==0.2.
|
3 |
rwkv==0.7.5
|
4 |
einops
|
5 |
tyro
|
|
|
1 |
zeus-ml==0.4.0
|
2 |
+
fschat==0.2.20
|
3 |
rwkv==0.7.5
|
4 |
einops
|
5 |
tyro
|
scripts/benchmark.py
CHANGED
@@ -197,7 +197,7 @@ def generate_stream(
|
|
197 |
if not any(partially_stopped):
|
198 |
# indicates which request in batch stopped
|
199 |
different_indices = np.where(stopped != old_stopped)[0]
|
200 |
-
stop_length = np.array([(
|
201 |
yield {
|
202 |
"text": output,
|
203 |
"stop_length": stop_length,
|
@@ -215,7 +215,7 @@ def generate_stream(
|
|
215 |
spaces_between_special_tokens=False,
|
216 |
clean_up_tokenization_spaces=True,
|
217 |
)
|
218 |
-
stop_length = np.array([(i,
|
219 |
|
220 |
yield {
|
221 |
"text": output,
|
@@ -230,7 +230,7 @@ def generate_stream(
|
|
230 |
|
231 |
def main(
|
232 |
model_path: str,
|
233 |
-
input_file: str = "sharegpt/
|
234 |
output_dir: str = "data",
|
235 |
device_index: int = 0,
|
236 |
task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
|
@@ -245,7 +245,7 @@ def main(
|
|
245 |
Args:
|
246 |
model_path: Path to or Huggingface Hub Id of the model.
|
247 |
input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
|
248 |
-
(Default: "sharegpt/
|
249 |
output_dir: Path to the output directory. (Default: "data")
|
250 |
device_index: Index of the GPU to use for inference. (Default: 0)
|
251 |
task: Type of task to perform inference on. (Default: "chat")
|
@@ -304,7 +304,12 @@ def main(
|
|
304 |
conv_base = get_conversation_template(model_path)
|
305 |
|
306 |
# Standardize the system prompt for every model.
|
307 |
-
|
|
|
|
|
|
|
|
|
|
|
308 |
conv_base.messages = []
|
309 |
conv_base.offset = 0
|
310 |
|
@@ -407,7 +412,8 @@ def main(
|
|
407 |
# Record numbers.
|
408 |
output_text = output["text"]
|
409 |
if not is_warmup:
|
410 |
-
|
|
|
411 |
latency = measurements.time
|
412 |
throughput = response_length / latency
|
413 |
energy = measurements.total_energy
|
|
|
197 |
if not any(partially_stopped):
|
198 |
# indicates which request in batch stopped
|
199 |
different_indices = np.where(stopped != old_stopped)[0]
|
200 |
+
stop_length = np.array([(j, i+1) for j in different_indices])
|
201 |
yield {
|
202 |
"text": output,
|
203 |
"stop_length": stop_length,
|
|
|
215 |
spaces_between_special_tokens=False,
|
216 |
clean_up_tokenization_spaces=True,
|
217 |
)
|
218 |
+
stop_length = np.array([(i, max_new_tokens) for i in false_indices])
|
219 |
|
220 |
yield {
|
221 |
"text": output,
|
|
|
230 |
|
231 |
def main(
|
232 |
model_path: str,
|
233 |
+
input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json",
|
234 |
output_dir: str = "data",
|
235 |
device_index: int = 0,
|
236 |
task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
|
|
|
245 |
Args:
|
246 |
model_path: Path to or Huggingface Hub Id of the model.
|
247 |
input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
|
248 |
+
(Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json")
|
249 |
output_dir: Path to the output directory. (Default: "data")
|
250 |
device_index: Index of the GPU to use for inference. (Default: 0)
|
251 |
task: Type of task to perform inference on. (Default: "chat")
|
|
|
304 |
conv_base = get_conversation_template(model_path)
|
305 |
|
306 |
# Standardize the system prompt for every model.
|
307 |
+
if "llama-2" in model_path.lower():
|
308 |
+
conv_base.system = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPTS[task]}\n<</SYS>>\n\n"
|
309 |
+
elif "stablelm" in model_path.lower():
|
310 |
+
conv_base.system = f"""<|SYSTEM|># {SYSTEM_PROMPTS[task]}\n"""
|
311 |
+
else:
|
312 |
+
conv_base.system = SYSTEM_PROMPTS[task]
|
313 |
conv_base.messages = []
|
314 |
conv_base.offset = 0
|
315 |
|
|
|
412 |
# Record numbers.
|
413 |
output_text = output["text"]
|
414 |
if not is_warmup:
|
415 |
+
total_length = int(sum(batch_token_len.values())) # number of valid tokens
|
416 |
+
response_length = float(total_length) / len(convs)
|
417 |
latency = measurements.time
|
418 |
throughput = response_length / latency
|
419 |
energy = measurements.total_energy
|
sharegpt/README.md
CHANGED
@@ -25,3 +25,8 @@ python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file
|
|
25 |
```
|
26 |
python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
|
27 |
```
|
|
|
|
|
|
|
|
|
|
|
|
25 |
```
|
26 |
python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
|
27 |
```
|
28 |
+
|
29 |
+
## Sorted data
|
30 |
+
```
|
31 |
+
python sort.py --data-dir sg_90k_part1_html_cleaned_lang_first_sampled.json --out-file sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
|
32 |
+
```
|
sharegpt/{sg_90k_part1_html_cleaned_lang_first_sampled.json → sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json}
RENAMED
The diff for this file is too large to render.
See raw diff
|
|