File size: 3,054 Bytes
bb636ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx

_parser = argparse.ArgumentParser()

_parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
_parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")


def load_jsonl(filename):
    """Read a JSONL file and return a list of parsed JSON objects.

    Blank lines (e.g. a stray trailing newline) are skipped — the original
    would raise json.JSONDecodeError on the empty string.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]

def save_jsonl(filename, data):
    """Serialize each item in *data* as one JSON object per line.

    Returns the path written, so callers can chain on it.
    """
    with open(filename, "w") as sink:
        sink.writelines(json.dumps(item) + "\n" for item in data)
    return filename

async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """POST *samples_data* to the OE-Eval BigCodeBench remote execution API.

    Args:
        samples_data: list of sample dicts in the format the API expects.
        calibrate / parallel / min_time_limit / max_as_limit / max_data_limit /
            max_stack_limit / no_gt: execution parameters forwarded verbatim as
            query params to the API.
        execute_api: endpoint URL; defaults to the local evaluation server.

    Returns:
        ``(check_results, pass_at_1)`` where each result dict has the keys
        ``tested_completion``, ``passed`` and ``exec_result`` (renamed from the
        API's ``solution``/``status``/``details``), or ``(None, None)`` when
        the API returned no evaluations.

    Raises:
        httpx.HTTPStatusError: if the API responds with a 4xx/5xx status.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    async with httpx.AsyncClient() as client:
        params = {
            "calibrate": calibrate,
            "parallel": parallel,
            "min_time_limit": min_time_limit,
            "max_as_limit": max_as_limit,
            "max_data_limit": max_data_limit,
            "max_stack_limit": max_stack_limit,
            "no_gt": no_gt,
        }
        # Even for the Full BCB dataset, total execution time should not exceed 5-10 min unless many instances of
        #  generated codes are particularly mal-formed or slow. (per instance exec timeout is 30 sec)
        total_timeout = 900
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Surface HTTP failures explicitly instead of letting response.json()
        # fail with a confusing decode error on an HTML/plain-text error body.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            # Rename API fields to the names the rest of the pipeline expects.
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)
    if check_results:
        pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
        return check_results, pass_at_1
    else:
        return None, None

def evaluate(sample_file, execute_api: Optional[str] = None):
    """Run remote evaluation for the samples in *sample_file*.

    Loads the JSONL samples, sends them to the execution API (local server
    when *execute_api* is None), prints pass@1, and returns the per-sample
    check results.
    """
    samples = load_jsonl(sample_file)
    check_results, pass_at_1 = asyncio.run(
        call_oe_eval_bcb_client(
            samples_data=samples,
            calibrate=True,
            parallel=-1,
            min_time_limit=30,
            execute_api=execute_api,
        )
    )
    print("pass@1:", pass_at_1)
    return check_results

def main():
    """CLI entry point: evaluate the given samples file and save the results."""
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    save_jsonl("data/eval_results.jsonl", results)


if __name__ == "__main__":
    main()