|
import subprocess
import time

from typing import List, Optional

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
|
|
|
|
|
def run(
    agent: BaseAgent,
    benchmark: Benchmark,
    task_name: Optional[str] = None,
    verbose=False,
) -> List[TaskResult]:
    """Run the agent on a benchmark's tasks and evaluate each task's assertions.

    For every selected task the agent is asked to improve the task's initial
    code, the resulting files are uploaded into a fresh ``DiskExecutionEnv``,
    the task's command (if any) is executed there, and every assertion of the
    task is called with an ``Assertable`` describing the outcome.

    Args:
        agent: Agent whose ``improve`` method produces the files to evaluate.
        benchmark: Supplies ``tasks`` and the ``timeout`` applied to each
            task command.
        task_name: When given, only tasks whose ``name`` equals this value
            are run; ``None`` (the default) runs every task.
        verbose: When true, print the accumulated results after each task.

    Returns:
        One ``TaskResult`` per executed task, in the order the tasks were
        iterated. ``duration`` measures only the ``agent.improve`` call, not
        the execution of the task command.
    """
    task_results = []
    for task in benchmark.tasks:
        # Honour the optional filter; previously the parameter was ignored.
        if task_name is not None and task.name != task_name:
            continue

        started = time.time()
        files_dict = agent.improve(task.initial_code, task.prompt, task.command)
        finished = time.time()

        env = DiskExecutionEnv()
        env.upload(files_dict)

        process, stdout, stderr = None, None, None
        if task.command:
            # NOTE(review): assumes env.popen returns a subprocess.Popen whose
            # stdout/stderr are byte pipes - confirm against DiskExecutionEnv.
            process = env.popen(task.command)
            try:
                # The timeout must be passed by keyword: the first positional
                # parameter of communicate() is ``input``, not ``timeout``.
                raw_out, raw_err = process.communicate(timeout=benchmark.timeout)
            except subprocess.TimeoutExpired:
                # communicate() does not kill the child on timeout; kill it
                # and collect whatever output it produced so the assertions
                # can still be evaluated for this task.
                process.kill()
                raw_out, raw_err = process.communicate()
            stdout, stderr = raw_out.decode("utf-8"), raw_err.decode("utf-8")

        exec_result = Assertable(
            files=files_dict,
            env=env,
            process=process,
            stdout=stdout,
            stderr=stderr,
        )

        assertion_results = {
            assertion_name: assertion(exec_result)
            for assertion_name, assertion in task.assertions.items()
        }
        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results=assertion_results,
                duration=finished - started,
            )
        )
        if verbose:
            print_results(task_results)
    return task_results
|
|
|
|
|
def print_results(results: list[TaskResult]):
    """Print a per-task report followed by aggregate totals to stdout.

    For each task result this prints a header, the task name with its
    duration, and one line per assertion marked with a pass/fail glyph.
    After all tasks it prints the summed duration, the number of passing
    assertions out of all assertions, and the number of tasks whose
    assertions all passed.

    Args:
        results: Task results to report; each must expose ``task_name``,
            ``duration`` and an ``assertion_results`` mapping of assertion
            name to outcome.
    """
    for task_result in results:
        print(f"\n--- Results for {task_result.task_name} ---")
        print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
        for assertion_name, assertion_result in task_result.assertion_results.items():
            # Restored from mojibake: these are the check-mark / cross-mark
            # glyphs whose UTF-8 bytes had been decoded with the wrong codec.
            checkmark = "✅" if assertion_result else "❌"
            print(f" {checkmark} {assertion_name}")
        print()

    total_time = sum(task_result.duration for task_result in results)
    print(f"Total time: {total_time:.2f}s")

    # Outcomes are summed (not counted), matching how truthy results add up.
    correct_assertions = sum(
        sum(task_result.assertion_results.values()) for task_result in results
    )
    total_assertions = sum(
        len(task_result.assertion_results) for task_result in results
    )
    print(f"Total correct assertions: {correct_assertions}/{total_assertions}")

    # A task counts as correct only when every one of its assertions passed.
    correct_tasks = sum(
        all(task_result.assertion_results.values()) for task_result in results
    )
    print(f"Correct tasks: {correct_tasks}/{len(results)}")
    print()
|
|