Update app.py
Browse files
app.py
CHANGED
@@ -106,6 +106,7 @@ def evaluate(
|
|
106 |
max_as_limit: int = 30 * 1024,
|
107 |
max_data_limit: int = 30 * 1024,
|
108 |
max_stack_limit: int = 10,
|
|
|
109 |
check_gt_only: bool = False,
|
110 |
no_gt: bool = False,
|
111 |
):
|
@@ -156,7 +157,7 @@ def evaluate(
|
|
156 |
if "solution" in sample
|
157 |
else problems[task_id]["complete_prompt"] + sample["completion"]
|
158 |
)
|
159 |
-
if
|
160 |
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
161 |
remainings.add(sample["_identifier"])
|
162 |
args = (
|
@@ -223,7 +224,7 @@ def evaluate(
|
|
223 |
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
224 |
pass_at_k["split"] = split
|
225 |
pass_at_k["subset"] = subset
|
226 |
-
pass_at_k["calibrated"] =
|
227 |
pass_at_k["gt_pass_rate"] = gt_pass_rate
|
228 |
pass_at_k["failed_tasks"] = failed_tasks
|
229 |
|
|
|
106 |
max_as_limit: int = 30 * 1024,
|
107 |
max_data_limit: int = 30 * 1024,
|
108 |
max_stack_limit: int = 10,
|
109 |
+
calibrated: bool = True,
|
110 |
check_gt_only: bool = False,
|
111 |
no_gt: bool = False,
|
112 |
):
|
|
|
157 |
if "solution" in sample
|
158 |
else problems[task_id]["complete_prompt"] + sample["completion"]
|
159 |
)
|
160 |
+
if calibrated:
|
161 |
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
162 |
remainings.add(sample["_identifier"])
|
163 |
args = (
|
|
|
224 |
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
225 |
pass_at_k["split"] = split
|
226 |
pass_at_k["subset"] = subset
|
227 |
+
pass_at_k["calibrated"] = calibrated
|
228 |
pass_at_k["gt_pass_rate"] = gt_pass_rate
|
229 |
pass_at_k["failed_tasks"] = failed_tasks
|
230 |
|