Spaces:
Running
Running
from __future__ import annotations | |
import os | |
import argparse | |
import subprocess | |
from itertools import product | |
def print_and_write(outfile, line: str, flush: bool = False):
    """Echo ``line`` to standard output and append it to ``outfile``.

    The text is emitted verbatim: no trailing newline is added on either
    destination, so callers are expected to include their own.

    Args:
        outfile: Writable text file-like object (needs ``write`` and ``flush``).
        line: Text to emit.
        flush: When true, flush both standard output and ``outfile`` right away.
    """
    print(line, end="", flush=flush)
    outfile.write(line)
    if not flush:
        return
    outfile.flush()
def main(args: argparse.Namespace) -> None:
    """Run the benchmark sweep described by ``args``.

    Launches ``scripts/benchmark_one_datapoint.py`` once for every combination
    of backend, request rate, power limit and max-num-seqs. The child's
    combined stdout/stderr is echoed to the terminal and appended to
    ``{result_root}/{model}/gpus{gpu ids joined}.out.txt``.

    The child script path is relative, so it is resolved against the current
    working directory.

    Args:
        args: Parsed command-line arguments. Reads ``model``, ``result_root``,
            ``gpu_ids``, ``backends``, ``server_images``, ``request_rates``,
            ``power_limits``, ``max_num_seqs``, ``mode`` and
            ``data_dup_factor``.

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
        ValueError: If ``backends`` and ``server_images`` differ in length.
    """
    hf_token = os.environ["HF_TOKEN"]

    # Validate before touching the filesystem. An explicit raise (rather than
    # ``assert``) keeps the check alive when Python runs with ``-O``.
    if len(args.backends) != len(args.server_images):
        raise ValueError(
            f"Got {len(args.backends)} backends but {len(args.server_images)} "
            "server images; exactly one server image is needed per backend."
        )
    # Backends and images are paired positionally.
    server_images = dict(zip(args.backends, args.server_images))

    outdir = f"{args.result_root}/{args.model}"
    os.makedirs(outdir, exist_ok=True)
    log_path = f"{outdir}/gpus{''.join(args.gpu_ids)}.out.txt"

    # Append mode keeps the output of earlier sweeps; the ``with`` block makes
    # sure the log is flushed and closed even if a run raises.
    with open(log_path, "a") as outfile:
        print_and_write(outfile, f"Benchmarking {args.model}\n")
        print_and_write(outfile, f"Backends: {args.backends}\n")
        print_and_write(outfile, f"Server images: {args.server_images}\n")
        print_and_write(outfile, f"Request rates: {args.request_rates}\n")
        print_and_write(outfile, f"Power limits: {args.power_limits}\n")
        print_and_write(outfile, f"Maximum num seqs: {args.max_num_seqs}\n")

        sweep = product(
            args.backends,
            args.request_rates,
            args.power_limits,
            args.max_num_seqs,
        )
        for backend, request_rate, power_limit, max_num_seqs in sweep:
            print_and_write(
                outfile,
                f"{backend=}, {request_rate=}, {power_limit=}, {max_num_seqs=}\n",
                flush=True,
            )
            # NOTE(review): the token is passed on the command line, so it is
            # visible in the process list while the child runs.
            command = [
                "python",
                "scripts/benchmark_one_datapoint.py",
                "--backend", backend,
                "--server-image", server_images[backend],
                "--model", args.model,
                "--dataset", "humaneval",
                "--request-rate", request_rate,
                "--power-limit", power_limit,
                "--result-root", args.result_root,
                "--huggingface-token", hf_token,
                "--gpu-ids", *args.gpu_ids,
                "--log-level", "INFO",
                "--mode", args.mode,
                "--max-num-seqs", str(max_num_seqs),
                "--data-dup-factor", args.data_dup_factor,
            ]
            with subprocess.Popen(
                args=command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # interleave stderr into the same stream
                text=True,
            ) as proc:
                if proc.stdout:
                    # Flush every 50th line: keeps the log reasonably fresh
                    # without paying for a flush on every line.
                    for lineno, line in enumerate(proc.stdout):
                        print_and_write(outfile, line, flush=lineno % 50 == 0)

            # Leaving the Popen context waits for the child, so the exit status
            # is available here. Record failures but keep sweeping.
            if proc.returncode != 0:
                print_and_write(
                    outfile,
                    f"Benchmark subprocess exited with code {proc.returncode}\n",
                    flush=True,
                )
if __name__ == "__main__":
    # Command-line entry point: parse the sweep configuration and run it.
    parser = argparse.ArgumentParser()
    # --model, --result-root, --gpu-ids and --max-num-seqs have no default and
    # main() joins, iterates or passes them to the child process, so a missing
    # value is rejected here instead of surfacing later as a TypeError.
    parser.add_argument("--model", type=str, required=True, help="ID of the model to benchmark")
    parser.add_argument("--result-root", type=str, required=True, help="Root directory to store the results")
    parser.add_argument("--gpu-ids", type=str, nargs="+", required=True, help="GPU IDs to use")
    # --backends and --server-images are paired by position; main() checks
    # that both lists have the same length.
    parser.add_argument("--backends", type=str, nargs="+", default=["vllm", "tgi"], help="Backends to benchmark")
    parser.add_argument("--server-images", type=str, nargs="+", default=["mlenergy/vllm:v0.4.2-api", "mlenergy/tgi:v2.0.2"], help="Server images to benchmark")
    # Numeric options stay as strings because they are forwarded verbatim on
    # the child process command line.
    parser.add_argument("--request-rates", type=str, nargs="+", default=["8.00", "4.00", "3.00", "2.00", "1.00"], help="Request rates to benchmark")
    parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
    parser.add_argument("--mode", type=str, choices=["codegen", "eval"], default="codegen", help="Mode to run the benchmark in")
    parser.add_argument("--max-num-seqs", type=str, nargs="+", required=True, help="vLLM --max-num-seqs to use.")
    parser.add_argument("--data-dup-factor", type=str, default="1", help="How many times to repeat the dataset to generate more requests.")
    args = parser.parse_args()
    main(args)