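"""Driver script that sweeps benchmark configurations for a single model.

For every combination of backend, request rate, GPU power limit, and
--max-num-seqs value, it launches scripts/benchmark_one_datapoint.py as a
subprocess and tees the child's output to both stdout and a per-GPU-set
log file under the result root.

Example invocation (script name and model ID are hypothetical):

    python benchmark_sweep.py \
        --model meta-llama/Meta-Llama-3-8B \
        --result-root results \
        --gpu-ids 0 1 \
        --max-num-seqs 256 512
"""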
from __future__ import annotations

import argparse
import os
import subprocess
from itertools import product
from typing import TextIO


def print_and_write(outfile: TextIO, line: str, flush: bool = False) -> None:
    """Echo `line` to stdout and mirror it into the results log file."""
    print(line, end="", flush=flush)
    outfile.write(line)
    if flush:
        outfile.flush()


def main(args: argparse.Namespace) -> None:
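    """Run the full benchmark sweep described by the parsed CLI arguments."""
    # HF_TOKEN must be set in the environment; it is forwarded to each
    # benchmark subprocess (e.g. for downloading gated models from the Hub).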
    hf_token = os.environ["HF_TOKEN"]

    outdir = f"{args.result_root}/{args.model}"
    os.makedirs(outdir, exist_ok=True)

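    # Append mode: repeated sweeps targeting the same GPUs accumulate in one log.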
    outfile = open(f"{outdir}/gpus{''.join(args.gpu_ids)}.out.txt", "a")

    # --backends and --server-images are paired positionally.
    assert len(args.backends) == len(args.server_images), (
        "each backend needs a corresponding server image"
    )
    server_images = dict(zip(args.backends, args.server_images))

    print_and_write(outfile, f"Benchmarking {args.model}\n")
    print_and_write(outfile, f"Backends: {args.backends}\n")
    print_and_write(outfile, f"Server images: {args.server_images}\n")
    print_and_write(outfile, f"Request rates: {args.request_rates}\n")
    print_and_write(outfile, f"Power limits: {args.power_limits}\n")
    print_and_write(outfile, f"Maximum num seqs: {args.max_num_seqs}\n")

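    # Sweep the full cross product of configurations, one subprocess per datapoint.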
    for backend, request_rate, power_limit, max_num_seqs in product(
        args.backends, args.request_rates, args.power_limits, args.max_num_seqs
    ):
        print_and_write(outfile, f"{backend=}, {request_rate=}, {power_limit=}, {max_num_seqs=}\n", flush=True)
        with subprocess.Popen(
            args=[
                "python",
                "scripts/benchmark_one_datapoint.py",
                "--backend", backend,
                "--server-image", server_images[backend],
                "--model", args.model,
                "--dataset", "humaneval",
                "--request-rate", request_rate,
                "--power-limit", power_limit,
                "--result-root", args.result_root,
                "--huggingface-token", hf_token,
                "--gpu-ids", *args.gpu_ids,
                "--log-level", "INFO",
                "--mode", args.mode,
                "--max-num-seqs", str(max_num_seqs),
                "--data-dup-factor", args.data_dup_factor,
            ],
            stdout=subprocess.PIPE,
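            # Merge stderr into stdout so the child's errors appear inline in the log.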
            stderr=subprocess.STDOUT,
            text=True,
        ) as proc:
            if proc.stdout:
                # Stream the child's output live; flush every 50 lines to keep
                # the log file current without per-line flushing overhead.
                for i, line in enumerate(proc.stdout):
                    print_and_write(outfile, line, flush=i % 50 == 0)

    outfile.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="ID of the model to benchmark")
    parser.add_argument("--result-root", type=str, help="Root directory to store the results")
    parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
    parser.add_argument("--backends", type=str, nargs="+", default=["vllm", "tgi"], help="Backends to benchmark")
    parser.add_argument("--server-images", type=str, nargs="+", default=["mlenergy/vllm:v0.4.2-api", "mlenergy/tgi:v2.0.2"], help="Server images to benchmark")
    parser.add_argument("--request-rates", type=str, nargs="+", default=["8.00", "4.00", "3.00", "2.00", "1.00"], help="Request rates to benchmark")
    parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
    parser.add_argument("--mode", type=str, choices=["codegen", "eval"], default="codegen", help="Mode to run the benchmark in")
    parser.add_argument("--max-num-seqs", type=str, nargs="+", help="vLLM --max-num-seqs to use.")
    parser.add_argument("--data-dup-factor", type=str, default="1", help="How many times to repeat the dataset to generate more requests.")
    args = parser.parse_args()
    main(args)