NahFam13's picture
z1
d26280a verified
import subprocess
import time
from typing import List, Optional

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
def run(
    agent: BaseAgent,
    benchmark: Benchmark,
    task_name: Optional[str] = None,
    verbose=False,
) -> List[TaskResult]:
    """Run the tasks of ``benchmark`` against ``agent`` and collect the results.

    For each task, the ``agent.improve`` call is timed, the files it returns
    are uploaded into a new ``DiskExecutionEnv``, the task's command (if any)
    is started there, and every assertion of the task is evaluated against
    the outcome.

    Args:
        agent: Agent whose ``improve`` method produces the files for a task.
        benchmark: Supplies ``tasks`` and the ``timeout`` used when waiting
            for a task command.
        task_name: When given, only tasks whose ``name`` equals this value
            are run; ``None`` (the default) runs every task.
        verbose: When true, print the results gathered so far after each
            executed task.

    Returns:
        One ``TaskResult`` per executed task, in benchmark order. Its
        ``duration`` covers only the ``agent.improve`` call, not the command
        execution.
    """
    task_results = []
    for task in benchmark.tasks:
        # Honour the optional name filter so a single task can be selected.
        if task_name is not None and task.name != task_name:
            continue

        t0 = time.time()
        files_dict = agent.improve(task.initial_code, task.prompt, task.command)
        t1 = time.time()

        env = DiskExecutionEnv()
        env.upload(files_dict)

        if task.command:
            p = env.popen(task.command)
            # NOTE(review): assumes ``env.popen`` returns a
            # ``subprocess.Popen``-compatible object -- confirm.
            try:
                # The timeout has to be passed by keyword: the first
                # positional parameter of ``Popen.communicate`` is ``input``,
                # so a positional value would never act as a time limit.
                stdout, stderr = p.communicate(timeout=benchmark.timeout)
            except subprocess.TimeoutExpired:
                # Stop the overrunning command and collect whatever it
                # produced, so the assertions can still be evaluated and the
                # remaining tasks still run.
                p.kill()
                stdout, stderr = p.communicate()
            stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
        else:
            # Nothing to execute: assertions only see the generated files.
            p, stdout, stderr = None, None, None

        exec_result = Assertable(
            files=files_dict,
            env=env,
            process=p,
            stdout=stdout,
            stderr=stderr,
        )

        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(exec_result)
                    for assertion_name, assertion in task.assertions.items()
                },
                duration=t1 - t0,
            )
        )
        # Printed inside the loop so the running summary is visible after
        # every task rather than only at the very end.
        if verbose:
            print_results(task_results)
    return task_results
def print_results(results: list[TaskResult]) -> None:
    """Print a per-task report followed by aggregate totals to stdout.

    For every task the name, the recorded duration and one line per
    assertion (marked passed or failed by the assertion result's truthiness)
    are printed. Afterwards the summed duration, the number of correct
    assertions out of all assertions, and the number of tasks whose
    assertions all passed are printed.

    Args:
        results: Task results to report; each one provides ``task_name``,
            ``duration`` and an ``assertion_results`` mapping of assertion
            name to result.
    """
    for task_result in results:
        print(f"\n--- Results for {task_result.task_name} ---")
        print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
        for assertion_name, assertion_result in task_result.assertion_results.items():
            # The success marker is the check-mark emoji; it had been stored
            # as a mis-decoded byte sequence and printed as garbage.
            checkmark = "✅" if assertion_result else "❌"
            print(f" {checkmark} {assertion_name}")
        print()

    total_time = sum(task_result.duration for task_result in results)
    print(f"Total time: {total_time:.2f}s")

    correct_assertions = sum(
        sum(
            assertion_result
            for assertion_result in task_result.assertion_results.values()
        )
        for task_result in results
    )
    total_assertions = sum(
        len(task_result.assertion_results) for task_result in results
    )
    print(f"Total correct assertions: {correct_assertions}/{total_assertions}")

    # A task counts as correct only when every one of its assertions passed
    # (a task without assertions therefore counts as correct).
    correct_tasks = sum(
        all(
            assertion_result
            for assertion_result in task_result.assertion_results.values()
        )
        for task_result in results
    )
    print(f"Correct tasks: {correct_tasks}/{len(results)}")
    print()