from __future__ import annotations

"""Sweep benchmark configurations and launch scripts/benchmark_one_datapoint.py for each."""

import os
import argparse
import subprocess
from itertools import product


def print_and_write(outfile, line: str, flush: bool = False):
    """Echo a line to stdout and mirror it to the output file."""
    print(line, end="", flush=flush)
    outfile.write(line)
    if flush:
        outfile.flush()


def main(args: argparse.Namespace) -> None:
    hf_token = os.environ["HF_TOKEN"]

    outdir = f"{args.result_root}/{args.model}"
    os.makedirs(outdir, exist_ok=True)
    outfile = open(f"{outdir}/gpus{''.join(args.gpu_ids)}.out.txt", "a")

    # Each backend is paired with its server image by position.
    assert len(args.backends) == len(args.server_images)
    server_images = dict(zip(args.backends, args.server_images))

    print_and_write(outfile, f"Benchmarking {args.model}\n")
    print_and_write(outfile, f"Backends: {args.backends}\n")
    print_and_write(outfile, f"Server images: {args.server_images}\n")
    print_and_write(outfile, f"Request rates: {args.request_rates}\n")
    print_and_write(outfile, f"Power limits: {args.power_limits}\n")
    print_and_write(outfile, f"Maximum num seqs: {args.max_num_seqs}\n")

    # Sweep the full Cartesian product of benchmark knobs, running one
    # subprocess per datapoint and streaming its output into the log file.
    for backend, request_rate, power_limit, max_num_seqs in product(
        args.backends, args.request_rates, args.power_limits, args.max_num_seqs
    ):
        print_and_write(
            outfile,
            f"{backend=}, {request_rate=}, {power_limit=}, {max_num_seqs=}\n",
            flush=True,
        )
        with subprocess.Popen(
            args=[
                "python",
                "scripts/benchmark_one_datapoint.py",
                "--backend", backend,
                "--server-image", server_images[backend],
                "--model", args.model,
                "--dataset", "humaneval",
                "--request-rate", request_rate,
                "--power-limit", power_limit,
                "--result-root", args.result_root,
                "--huggingface-token", hf_token,
                "--gpu-ids", *args.gpu_ids,
                "--log-level", "INFO",
                "--mode", args.mode,
                "--max-num-seqs", str(max_num_seqs),
                "--data-dup-factor", args.data_dup_factor,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        ) as proc:
            # Stream the child's combined stdout/stderr line by line,
            # flushing to disk every 50 lines.
            if proc.stdout:
                i = 0
                for line in proc.stdout:
                    print_and_write(outfile, line, flush=i % 50 == 0)
                    i += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="ID of the model to benchmark")
    parser.add_argument("--result-root", type=str, help="Root directory to store the results")
    parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
    parser.add_argument("--backends", type=str, nargs="+", default=["vllm", "tgi"], help="Backends to benchmark")
    parser.add_argument("--server-images", type=str, nargs="+", default=["mlenergy/vllm:v0.4.2-api", "mlenergy/tgi:v2.0.2"], help="Server images to benchmark")
    parser.add_argument("--request-rates", type=str, nargs="+", default=["8.00", "4.00", "3.00", "2.00", "1.00"], help="Request rates to benchmark")
    parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
    parser.add_argument("--mode", type=str, choices=["codegen", "eval"], default="codegen", help="Mode to run the benchmark in")
    parser.add_argument("--max-num-seqs", type=str, nargs="+", help="vLLM --max-num-seqs to use.")
    parser.add_argument("--data-dup-factor", type=str, default="1", help="How many times to repeat the dataset to generate more requests.")
    args = parser.parse_args()
    main(args)
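
# Example invocation (a sketch only: the script path, model ID, GPU IDs, and result
# directory below are hypothetical placeholders, and HF_TOKEN must hold a valid
# Hugging Face token). Note that --max-num-seqs must be given, since the sweep
# iterates over it, and --backends and --server-images must have the same length.
#
#   HF_TOKEN=hf_xxx python scripts/benchmark_suite.py \
#       --model codellama/CodeLlama-7b-Instruct-hf \
#       --result-root results \
#       --gpu-ids 0 1 \
#       --backends vllm \
#       --server-images mlenergy/vllm:v0.4.2-api \
#       --max-num-seqs 256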