from __future__ import annotations
import os
import json
import atexit
import argparse
import subprocess
from pathlib import Path
from typing import Literal
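
# Benchmark driver: launches a vLLM or TGI server container, pins the GPU
# power limit, replays an EvalPlus dataset (HumanEval or MBPP) through the
# benchmark client, and can afterwards score the generated samples with
# EvalPlus. It assumes an `nvml` helper container with `nvidia-smi`, a
# Hugging Face cache at /data/leaderboard/hfcache, and a
# `models/<model>/revision.txt` file pinning each model's revision.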


def set_power_limit(power_limit: int, gpu_ids: list[int]) -> None:
    """Enable persistence mode and set the power limit (in Watts) on each GPU.

    Assumes a running container named `nvml` that has `nvidia-smi` available.
    """
    for gpu_id in gpu_ids:
        subprocess.check_call([
            "docker", "exec", "nvml",
            "nvidia-smi", "-i", str(gpu_id), "-pm", "1",
        ])
        subprocess.check_call([
            "docker", "exec", "nvml",
            "nvidia-smi", "-i", str(gpu_id), "-pl", str(power_limit),
        ])


def start_server(
    backend: Literal["vllm", "tgi"],
    server_image: str,
    port: int,
    model: str,
    huggingface_token: str,
    gpu_ids: list[int],
    max_num_seqs: int,
    log_level: str,
    result_root: str,
    benchmark_name: str,
) -> str:
    """Launch a vLLM or TGI server container and return the container name.

    The model revision is read from `models/<model>/revision.txt` so that
    every run serves a pinned revision of the model.
    """
    # Docker's --gpus flag wants the device list quoted, e.g., '"device=0,1"'.
    gpu_str = ",".join(str(gpu_id) for gpu_id in gpu_ids)
    gpu_str = f'"device={gpu_str}"'
    hf_cache_path = "/data/leaderboard/hfcache"
    models_dir = f"{os.getcwd()}/models"
    revision_filename = f"{model}/revision.txt"
    revision_path = f"{models_dir}/{revision_filename}"
    container_name = f"leaderboard-{backend}-{''.join(str(gpu_id) for gpu_id in gpu_ids)}"

    assert Path(hf_cache_path).exists(), f"Hugging Face cache not found: {hf_cache_path}"
    assert Path(revision_path).exists(), f"Revision file not found: {revision_path}"

    if backend == "vllm":
        # vLLM's OpenAI-compatible server listens on port 8000 inside the container.
        server_cmd = [
            "docker", "run",
            "--gpus", gpu_str,
            "--ipc", "host",
            "--name", container_name,
            "-e", f"HF_TOKEN={huggingface_token}",
            "-e", f"LOG_LEVEL={log_level}",
            "-e", f"RESULT_FILE_PREFIX=/results/{benchmark_name}",
            "-p", f"{port}:8000",
            "-v", f"{hf_cache_path}:/root/.cache/huggingface",
            "-v", f"{result_root}:/results",
            server_image,
            "--model", model,
            "--revision", Path(revision_path).read_text().strip(),
            "--tensor-parallel-size", str(len(gpu_ids)),
            "--gpu-memory-utilization", "0.95",
            "--trust-remote-code",
            "--enable-chunked-prefill", "False",
            "--max-model-len", "4096",
            "--disable-frontend-multiprocessing",
            "--max-num-seqs", str(max_num_seqs),
        ]
    elif backend == "tgi":
        # TGI listens on port 80 inside the container.
        server_cmd = [
            "docker", "run",
            "--gpus", gpu_str,
            "--ipc", "host",
            "--name", container_name,
            "-e", f"HUGGING_FACE_HUB_TOKEN={huggingface_token}",
            "-e", f"LOG_LEVEL={log_level}",
            "-p", f"{port}:80",
            "-v", f"{hf_cache_path}:/root/.cache/huggingface",
            "-v", f"{models_dir}:/models",
            server_image,
            "--model-id", model,
            "--revision", Path(revision_path).read_text().strip(),
            "--huggingface-hub-cache", "/root/.cache/huggingface/hub",
            "--num-shard", str(len(gpu_ids)),
            "--cuda-memory-fraction", "0.95",
            "--max-concurrent-requests", "512",
            "--max-stop-sequences", "7",
            "--trust-remote-code",
        ]
    else:
        raise ValueError(f"Unknown backend: {backend}")

    print("Server:", " ".join(server_cmd))
    subprocess.Popen(server_cmd)
    return container_name


def start_client(
    backend: Literal["vllm", "tgi"],
    port: int,
    model: str,
    dataset: str,
    request_rate: str,
    gpu_ids: list[int],
    benchmark_name: str,
    power_limit: int,
    max_num_seqs: int,
    data_dup_factor: int,
) -> subprocess.Popen:
    """Launch the benchmark client as a subprocess and return its handle."""
    client_cmd = [
        "python", "scripts/benchmark_client.py",
        "--backend", backend,
        "--port", str(port),
        "--model", model,
        "--dataset", dataset,
        "--request-rate", request_rate,
        "--benchmark-name", benchmark_name,
        "--power-limit", str(power_limit),
        "--max-num-seqs", str(max_num_seqs),
        "--data-dup-factor", str(data_dup_factor),
    ]
    print("Client:", " ".join(client_cmd))
    # Expose only the benchmarked GPUs to the client process.
    return subprocess.Popen(
        client_cmd,
        env=os.environ | {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in gpu_ids)},
    )


def terminate_server(container_name: str) -> None:
    """Send SIGINT to the server container, wait up to 30 s, then force-remove it."""
    subprocess.run(["docker", "kill", "-s", "INT", container_name])
    subprocess.run(["timeout", "30", "docker", "wait", container_name])
    subprocess.run(["docker", "rm", "-f", container_name])


def run_evalplus_eval(dataset: str, benchmark_name: str) -> None:
    """Run EvalPlus on the generated samples and save pass@1 scores.

    Results are written to `<benchmark_name>+results+evalplus_acc.json`.
    """
    benchmark_path = Path(benchmark_name)
    results_dir = benchmark_path.parent.absolute()
    benchmark_filename = f"{benchmark_path.name}+results+evalplus.jsonl"
    assert results_dir.exists(), f"Results directory not found: {results_dir}"
    assert (results_dir / benchmark_filename).exists(), f"Benchmark file not found: {results_dir / benchmark_filename}"

    evalplus_cmd = [
        "docker", "run",
        "-v", f"{results_dir}:/app",
        "ganler/evalplus:v0.2.0",
        "--dataset", dataset,
        "--samples", benchmark_filename,
    ]
    print("EvalPlus:", " ".join(evalplus_cmd))
    output = subprocess.check_output(evalplus_cmd).decode("utf-8")
    print(output)

    # Parse pass@1 scores from EvalPlus's stdout, keyed by the most recent
    # section header line containing "Base".
    key = ""
    results = {}
    for line in output.split("\n"):
        if "Base" in line:
            key = line.strip()
        if "pass@1" in line:
            results[key] = float(line.split(" ")[1][:-1])
    with open(f"{benchmark_name}+results+evalplus_acc.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


def main(args: argparse.Namespace) -> None:
    # Normalize the model name, e.g., "models/org/model/" -> "org/model".
    if args.model.startswith("models/"):
        args.model = args.model[len("models/"):]
    if args.model.endswith("/"):
        args.model = args.model[:-1]

    results_dir = Path(args.result_root) / args.model
    results_dir.mkdir(parents=True, exist_ok=True)
    benchmark_name = (
        f"{args.backend}+rate{args.request_rate}+pl{args.power_limit}"
        f"+maxbs{args.max_num_seqs}+gpus{''.join(str(i) for i in args.gpu_ids)}"
    )

    if args.mode == "codegen":
        # Derive the port from the first GPU ID so that concurrent runs on
        # disjoint GPU sets do not collide.
        port = 8000 + args.gpu_ids[0]
        server_handle = start_server(
            args.backend,
            args.server_image,
            port,
            args.model,
            args.huggingface_token,
            args.gpu_ids,
            args.max_num_seqs,
            args.log_level,
            str(results_dir.absolute()),
            benchmark_name,
        )
        # Make sure the server container is cleaned up even if this script dies.
        kill_fn = lambda: terminate_server(server_handle)
        atexit.register(kill_fn)

        set_power_limit(args.power_limit, args.gpu_ids)

        client_handle = start_client(
            args.backend,
            port,
            args.model,
            args.dataset,
            args.request_rate,
            args.gpu_ids,
            str(results_dir / benchmark_name),
            args.power_limit,
            args.max_num_seqs,
            args.data_dup_factor,
        )
        try:
            exit_code = client_handle.wait(timeout=2 * 3600)
        except subprocess.TimeoutExpired:
            client_handle.terminate()
            raise RuntimeError("Benchmark client timed out after two hours")
        if exit_code != 0:
            raise RuntimeError(f"Benchmark client exited with code {exit_code}")

        terminate_server(server_handle)
        atexit.unregister(kill_fn)

    elif args.mode == "eval":
        run_evalplus_eval(args.dataset, str(results_dir / benchmark_name))

    else:
        raise ValueError(f"Unknown mode: {args.mode}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend", required=True, choices=["vllm", "tgi"], help="Server to benchmark.")
    parser.add_argument("--server-image", required=True, help="Docker image to use for the server.")
    parser.add_argument("--model", required=True, help="Model to benchmark, e.g., meta-llama/Llama-2-7b-chat-hf.")
    parser.add_argument("--dataset", required=True, choices=["humaneval", "mbpp"], help="EvalPlus dataset to use.")
    parser.add_argument("--request-rate", required=True, help="Poisson process rate for request arrival times.")
    parser.add_argument("--max-num-seqs", required=True, help="vLLM --max-num-seqs to use.")
    parser.add_argument("--power-limit", type=int, required=True, help="GPU power limit in Watts.")
    parser.add_argument("--result-root", default="results", help="Root directory to save results.")
    parser.add_argument("--huggingface-token", required=True, help="Hugging Face API token.")
    parser.add_argument("--gpu-ids", nargs="+", type=int, required=True, help="GPU IDs to use for the server.")
    parser.add_argument("--log-level", default="INFO", help="Logging level for the server.")
    parser.add_argument("--mode", required=True, choices=["codegen", "eval"], help="Mode to run the script in.")
    parser.add_argument("--data-dup-factor", type=int, default=1, help="How many times to repeat the dataset to generate more requests.")
    main(parser.parse_args())
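
# Example invocation (script path, server image, and values are illustrative,
# not taken from the repository):
#   python scripts/benchmark.py \
#     --mode codegen --backend vllm --server-image <vllm-server-image> \
#     --model meta-llama/Llama-2-7b-chat-hf --dataset humaneval \
#     --request-rate 1.0 --max-num-seqs 256 --power-limit 300 \
#     --gpu-ids 0 1 --huggingface-token "$HF_TOKEN"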