Spaces:
Runtime error
Runtime error
gabeorlanski
committed on
BC eval
Browse files- README.md +242 -5
- app.py +5 -0
- bc_eval.py +335 -0
- execution.py +145 -0
- requirements.txt +1 -0
README.md
CHANGED
@@ -1,12 +1,249 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: pink
|
5 |
colorTo: red
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: BabelCode Eval
|
3 |
+
colorFrom: blue
|
|
|
4 |
colorTo: red
|
5 |
sdk: gradio
|
6 |
+
sdk_version: 3.19.1
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
+
tags:
|
10 |
+
- evaluate
|
11 |
+
- metric
|
12 |
+
description: >-
|
13 |
+
This metric implements the evaluation harness for datasets translated with the
|
14 |
+
BabelCode framework as described in the paper "Measuring The Impact Of
|
15 |
+
Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
|
16 |
---
|
17 |
|
18 |
+
# Metric Card for bc_eval
|
19 |
+
|
20 |
+
|
21 |
+
## Metric Description
|
22 |
+
This metric implements the evaluation harness for datasets translated with the BabelCode framework as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
|
23 |
+
|
24 |
+
## How to Use
|
25 |
+
1. Generate predictions for BabelCode supported datasets
|
26 |
+
2. Aggregate the predictions by their question.
|
27 |
+
3. With the aggregated predictions for each question, add the `question_info` from the original BabelCode dataset.
|
28 |
+
4. Run the metric on the `predictions`, `languages`, and `question_dicts` (the list of `question_info` entries collected in the previous step).
|
29 |
+
5. The result of the metric is a tuple: the first value is a dictionary of metrics and the second value is a list with the results for each prediction.
|
30 |
+
|
31 |
+
```python
|
32 |
+
import evaluate
|
33 |
+
from datasets import load_dataset
|
34 |
+
import os
|
35 |
+
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
36 |
+
|
37 |
+
predictions = []
|
38 |
+
languages = []
|
39 |
+
question_infos = []
|
40 |
+
ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
|
41 |
+
|
42 |
+
for row in ds:
|
43 |
+
languages.append(row['language'])
|
44 |
+
question_infos.append(row['question_info'])
|
45 |
+
|
46 |
+
# Replace this with however you generate and postprocess predictions.
|
47 |
+
predictions.append(model.generate(row['signature_with_docstring']))
|
48 |
+
|
49 |
+
|
50 |
+
metric = evaluate.load("bc_eval")
|
51 |
+
metrics, results = metric.compute(
|
52 |
+
predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
|
53 |
+
)
|
54 |
+
```
|
55 |
+
|
56 |
+
### Inputs
|
57 |
+
* `predictions`(`List[List[str]]`): The list of predictions for each question to execute.
|
58 |
+
* `languages`(`List[str]`): The language to use for each question.
|
59 |
+
* `question_dicts`(`List[Dict]`): The information for each question.
|
60 |
+
* `k`(`List[int]`): number of code candidates to consider in the evaluation (Default: [1, 10, 100])
|
61 |
+
* `num_workers`(`int`): number of workers used to evaluate the candidate programs (Default: 4).
|
62 |
+
* `language_timeout`(`Dict[str,int]`): Timeouts to use for each language. If it is not set, will default to the one in the question dict (Default: None).
|
63 |
+
|
64 |
+
### Output Values
|
65 |
+
|
66 |
+
The `bc_eval` metric outputs two things:
|
67 |
+
|
68 |
+
* `metrics`: a dictionary with the pass rates for each k value defined in the arguments and the mean percent of tests passed per question. The keys are formatted as `{LANGUAGE NAME}/{METRIC NAME}`
|
69 |
+
|
70 |
+
* `results`: a list of dictionaries with the results from each individual prediction.
|
71 |
+
|
72 |
+
#### Values from Popular Papers
|
73 |
+
[PaLM-2](https://arxiv.org/pdf/2305.10403.pdf) Performance on BC-HumanEval (`pass@1` with greedy decoding):
|
74 |
+
|
75 |
+
| Language | PaLM 2-S* | PaLM 540B | PaLM-Coder-540B |
|
76 |
+
|------------|-----------|-----------|-----------------|
|
77 |
+
| C# | 24.22 | 20.5 | **26.09** |
|
78 |
+
| C++ | **34.16** | 21.74 | 24.22 |
|
79 |
+
| Go | 19.25 | 13.66 | **21.12** |
|
80 |
+
| Haskell | **8.7** | 1.86 | 1.86 |
|
81 |
+
| Java | **31.06** | 20.5 | 25.47 |
|
82 |
+
| JavaScript | **32.3** | 23.6 | 29.81 |
|
83 |
+
| Julia | **16.77** | 2.48 | 4.35 |
|
84 |
+
| Lua | **26.09** | 19.25 | 24.84 |
|
85 |
+
| PHP | **26.09** | 18.63 | 25.47 |
|
86 |
+
| Python | **34.16** | 17.39 | 26.71 |
|
87 |
+
| Rust | **28.57** | 16.15 | 22.98 |
|
88 |
+
| TypeScript | **32.3** | 17.39 | 30.43 |
|
89 |
+
|
90 |
+
|
91 |
+
### Examples
|
92 |
+
Full example with inputs that fail tests, time out, have an error, and pass.
|
93 |
+
|
94 |
+
#### Passing Example
|
95 |
+
```python
|
96 |
+
import evaluate
|
97 |
+
from datasets import load_dataset
|
98 |
+
import os
|
99 |
+
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
100 |
+
ds = load_dataset("gabeorlanski/bc-humaneval", "Python", split="test")
|
101 |
+
example = ds[0]
|
102 |
+
metric = evaluate.load("bc_eval")
|
103 |
+
languages = ["Python"]
|
104 |
+
question_infos = [example["question_info"]]
|
105 |
+
predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
|
106 |
+
for idx, elem in enumerate(numbers):
|
107 |
+
for idx2, elem2 in enumerate(numbers):
|
108 |
+
if idx != idx2:
|
109 |
+
distance = abs(elem - elem2)
|
110 |
+
if distance < threshold:
|
111 |
+
return True
|
112 |
+
|
113 |
+
return False"""
|
114 |
+
]]
|
115 |
+
metrics, results = metric.compute(
|
116 |
+
predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
|
117 |
+
)
|
118 |
+
```
|
119 |
+
`metrics` is:
|
120 |
+
```
|
121 |
+
{"Python/pass@1": 1.0, "Python/mean_pct_pass": 1.0}
|
122 |
+
```
|
123 |
+
`results` is:
|
124 |
+
```
|
125 |
+
[{"qid": 0, "idx": "0", "file_path": ".../tmpqt_p3dwn/0", "results": [{"return_code": 0, "runtime": 0.076369, "stdout": "TEST-0...PASSED\r\nTEST-1...PASSED\r\nTEST-2...PASSED\r\nTEST-3...PASSED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...PASSED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "PASSED", "2": "PASSED", "3": "PASSED", "4": "PASSED", "5": "PASSED", "6": "PASSED"}, "outcome": "PASSED"}]
|
126 |
+
```
|
127 |
+
|
128 |
+
|
129 |
+
#### Fails Test Example
|
130 |
+
|
131 |
+
```python
|
132 |
+
import evaluate
|
133 |
+
from datasets import load_dataset
|
134 |
+
import os
|
135 |
+
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
136 |
+
ds = load_dataset(
|
137 |
+
"gabeorlanski/bc-humaneval", "Python", split="test"
|
138 |
+
)
|
139 |
+
example = ds[0]
|
140 |
+
metric = evaluate.load("bc_eval")
|
141 |
+
languages = ["Python"]
|
142 |
+
question_infos = [example["question_info"]]
|
143 |
+
predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
|
144 |
+
for idx, elem in enumerate(numbers):
|
145 |
+
for idx2, elem2 in enumerate(numbers):
|
146 |
+
if idx != idx2:
|
147 |
+
distance = elem - elem2
|
148 |
+
if distance < threshold:
|
149 |
+
return True
|
150 |
+
|
151 |
+
return False"""
|
152 |
+
]]
|
153 |
+
metrics, results = metric.compute(
|
154 |
+
predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
|
155 |
+
)
|
156 |
+
```
|
157 |
+
|
158 |
+
`metrics` is:
|
159 |
+
```
|
160 |
+
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.5714285714285714}
|
161 |
+
```
|
162 |
+
`results` is:
|
163 |
+
```
|
164 |
+
[{"qid": 0, "idx": "0", "file_path": "/tmp7u587vk5/0", "results": [{"return_code": 0, "runtime": 0.08255, "stdout": "TEST-0...PASSED\r\nTEST-1...FAILED\r\nTEST-2...PASSED\r\nTEST-3...FAILED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...FAILED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "FAILED", "2": "PASSED", "3": "FAILED", "4": "PASSED", "5": "PASSED", "6": "FAILED"}, "outcome": "FAILED"}]
|
165 |
+
```
|
166 |
+
|
167 |
+
Note that the outcome of each individual test case is reported under the `test_cases` key of every entry in `results`.
|
168 |
+
|
169 |
+
#### Timeout Example
|
170 |
+
|
171 |
+
```python
|
172 |
+
import evaluate
|
173 |
+
from datasets import load_dataset
|
174 |
+
import os
|
175 |
+
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
176 |
+
ds = load_dataset(
|
177 |
+
"gabeorlanski/bc-humaneval", "Python", split="test"
|
178 |
+
)
|
179 |
+
example = ds[0]
|
180 |
+
metric = evaluate.load("bc_eval")
|
181 |
+
languages = ["Python"]
|
182 |
+
question_infos = [example["question_info"]]
|
183 |
+
predictions = [["""import time
|
184 |
+
def has_close_elements(numbers: List[float], threshold: float) -> bool:
|
185 |
+
time.sleep(100)
|
186 |
+
"""
|
187 |
+
]]
|
188 |
+
metrics, results = metric.compute(
|
189 |
+
predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
|
190 |
+
)
|
191 |
+
```
|
192 |
+
|
193 |
+
`metrics` is:
|
194 |
+
```
|
195 |
+
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
|
196 |
+
```
|
197 |
+
`results` is:
|
198 |
+
```
|
199 |
+
[{"qid": 0, "idx": "0", "file_path": "/tmp_rz6bhb9/0", "results": [{"return_code": -1, "runtime": 10, "stdout": null, "stderr": null, "timed_out": true}], "failed": false, "timed_out": true, "test_cases": {"0": "MISSING", "1": "MISSING", "2": "MISSING", "3": "MISSING", "4": "MISSING", "5": "MISSING", "6": "MISSING"}, "outcome": "TIMED_OUT"}]
|
200 |
+
```
|
201 |
+
|
202 |
+
#### Error Example
|
203 |
+
|
204 |
+
```python
|
205 |
+
import evaluate
|
206 |
+
from datasets import load_dataset
|
207 |
+
import os
|
208 |
+
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
209 |
+
ds = load_dataset(
|
210 |
+
"gabeorlanski/bc-humaneval", "Python", split="test"
|
211 |
+
)
|
212 |
+
example = ds[0]
|
213 |
+
metric = evaluate.load("bc_eval")
|
214 |
+
languages = ["Python"]
|
215 |
+
question_infos = [example["question_info"]]
|
216 |
+
predictions = [["""import time
|
217 |
+
def has_close_elements(numbers: List[float], threshold: float) -> bool:
|
218 |
+
raise ValueError()
|
219 |
+
""",
|
220 |
+
"""def add(a, b):
|
221 |
+
return a+b"""
|
222 |
+
]]
|
223 |
+
metrics, results = metric.compute(
|
224 |
+
predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
|
225 |
+
)
|
226 |
+
```
|
227 |
+
|
228 |
+
`metrics` is:
|
229 |
+
```
|
230 |
+
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
|
231 |
+
```
|
232 |
+
`results` is:
|
233 |
+
```
|
234 |
+
[{"qid": 0, "idx": "0", "file_path": "/tmpjdn51aaa/0", "results": [{"return_code": 0, "runtime": 0.102855, "stdout": "TEST-0...ValueError\r\nTEST-1...ValueError\r\nTEST-2...ValueError\r\nTEST-3...ValueError\r\nTEST-4...ValueError\r\nTEST-5...ValueError\r\nTEST-6...ValueError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "ValueError", "1": "ValueError", "2": "ValueError", "3": "ValueError", "4": "ValueError", "5": "ValueError", "6": "ValueError"}, "outcome": "HAD_ERROR"},
|
235 |
+
{"qid": 0, "idx": "1", "file_path": "/tmpjdn51aaa/1", "results": [{"return_code": 0, "runtime": 0.094347, "stdout": "TEST-0...NameError\r\nTEST-1...NameError\r\nTEST-2...NameError\r\nTEST-3...NameError\r\nTEST-4...NameError\r\nTEST-5...NameError\r\nTEST-6...NameError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "NameError", "1": "NameError", "2": "NameError", "3": "NameError", "4": "NameError", "5": "NameError", "6": "NameError"}, "outcome": "HAD_ERROR"}]
|
236 |
+
```
|
237 |
+
|
238 |
+
## Limitations and Bias
|
239 |
+
This metric requires that the dataset be BabelCode compatible.
|
240 |
+
|
241 |
+
## Citation
|
242 |
+
```
|
243 |
+
@article{orlanski2023measuring,
|
244 |
+
title={Measuring The Impact Of Programming Language Distribution},
|
245 |
+
author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
|
246 |
+
journal={arXiv preprint arXiv:2302.01973},
|
247 |
+
year={2023}
|
248 |
+
}
|
249 |
+
```
|
app.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import evaluate
|
2 |
+
from evaluate.utils import launch_gradio_widget
|
3 |
+
|
4 |
+
# Load the metric published under "gabeorlanski/bc_eval".
module = evaluate.load("gabeorlanski/bc_eval")
# Hand the loaded metric to the `evaluate` helper that launches its Gradio widget.
launch_gradio_widget(module)
|
bc_eval.py
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dataclasses
|
2 |
+
import itertools
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
import tempfile
|
6 |
+
from collections import defaultdict
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
import datasets
|
10 |
+
import evaluate
|
11 |
+
import numpy as np
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
from .execution import execute_predictions
|
15 |
+
|
16 |
+
# Matches one line of test-harness output such as ``TEST-3...PASSED``.
# Group 1 is the test case id, group 2 is the reported outcome.
STDOUT_PARSE_REGEX = re.compile(r"^TEST-(.+)\.\.\.(.+)$", flags=re.MULTILINE)

_CITATION = """\
@article{orlanski2023measuring,
  title={Measuring The Impact Of Programming Language Distribution},
  author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
  journal={arXiv preprint arXiv:2302.01973},
  year={2023}
}
"""

_DESCRIPTION = """\
This metric implements the evaluation harness for datasets translated with the BabelCode framework as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
"""


_KWARGS_DESCRIPTION = """
Calculates how many predictions per question pass a set of tests for the given problem.

Args:
    predictions: The list of predictions for each question to execute.
    languages: The language to use for each question.
    question_dicts: The information for each question.
    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
    num_workers: number of workers used to evaluate the candidate programs (Default: 4).
    language_timeout: Timeouts to use for each language. If it is not set, will default to the one in the question dict (Default: None).
Returns:
    pass_at_k: dict with pass rates for each k
    results: dict with granular results of each unittest
Examples:
    >>> bc_eval = evaluate.load("bc_eval")
    >>> predictions = [["def add(a,b):\n\treturn a+b", "def add(a,b):\n\treturn a-b"]]
    >>> languages = ["Python"]
    >>> question_dicts = [{"test_code": "...", "entry_fn_name": "add","entry_cls_name":"Solution", "test_case_ids":["0","1"],"test_list":"..."}]
    >>> pass_at_k, results = bc_eval.compute(predictions=predictions,languages=languages, question_dicts=question_dicts, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 0.5, 'pass@2': 1.0}
"""


# Raised as the error message when code execution has not been explicitly
# enabled through the HF_ALLOW_CODE_EVAL environment variable.
_WARNING = """
################################################################################
                                  !!!WARNING!!!
################################################################################
The "bc_eval" metric executes untrusted model-generated code in Python.
Although it is highly unlikely that model-generated code will do something
overtly malicious in response to this test suite, model-generated code may act
destructively due to a lack of model capability or alignment.
Users are strongly encouraged to sandbox this evaluation suite so that it
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
Once you have read this disclaimer and taken appropriate precautions,
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
with:
>>> import os
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
################################################################################\
"""

# String-valued keys of a question dict.
_QUESTION_INFO_KEYS = {
    "entry_fn_name",
    "entry_cls_name",
    "test_code",
    "test_list",
    "test_case_ids",
}
|
83 |
+
|
84 |
+
|
85 |
+
def make_file_and_command(
    qid, idx, pred, question, working_dir, timeout_override=None
):
    """Write one prediction to disk and build the commands that run it.

    The question's ``test_code`` template is filled in by replacing
    ``PLACEHOLDER_CODE_BODY`` with the prediction and the function/class
    placeholders with the question's entry point names. The result is saved
    as ``pred.<extension>`` inside ``working_dir/<idx>``.

    Args:
        qid: Identifier of the question the prediction belongs to.
        idx: Identifier of the prediction; also the name of its directory.
        pred: Source code of the prediction.
        question: Question dict providing ``extension``, ``test_code``,
            ``entry_fn_name``, ``entry_cls_name``, ``commands`` and
            ``timeouts``.
        working_dir: ``pathlib.Path`` under which the directory is created.
        timeout_override: When not None, replaces every per-command timeout.

    Returns:
        A dict with ``qid``, ``idx``, the list of ``commands`` (each a dict
        with ``timeout`` and ``command``) and ``cwd``, the directory that
        holds the written file.

    Raises:
        FileExistsError: If the prediction directory already exists.
    """
    file_name = f"pred.{question['extension']}"
    pred_dir = working_dir.joinpath(idx)
    pred_dir.mkdir(parents=True)

    code = (
        question["test_code"]
        .replace("PLACEHOLDER_CODE_BODY", pred)
        .replace("PLACEHOLDER_FN_NAME", question["entry_fn_name"])
        .replace("PLACEHOLDER_CLS_NAME", question["entry_cls_name"])
    )
    # Write explicitly as UTF-8: predictions may contain non-ASCII characters
    # that the platform's default encoding cannot represent.
    pred_dir.joinpath(file_name).write_text(code, encoding="utf-8")

    commands = [
        {
            "timeout": t if timeout_override is None else timeout_override,
            # "__FILENAME__" is the placeholder for the file written above.
            "command": [file_name if c == "__FILENAME__" else c for c in cmd],
        }
        for cmd, t in zip(question["commands"], question["timeouts"])
    ]

    return {"qid": qid, "idx": idx, "commands": commands, "cwd": pred_dir}
|
111 |
+
|
112 |
+
|
113 |
+
def _write_preds(
    preds,
    languages,
    language_timeout,
    question_dicts,
    tmp_dir,
):
    """Write every prediction to disk and collect the commands to execute.

    Questions are numbered in iteration order; predictions are numbered
    globally across all questions, and that number (as a string) names the
    directory each prediction is written to.

    Args:
        preds: For each question, the list of predictions to evaluate.
        languages: The language of each question.
        language_timeout: Dict mapping a language to a timeout that overrides
            the per-command timeouts of its questions. Languages missing
            from the dict keep the question's own timeouts.
        question_dicts: The question information for each question.
        tmp_dir: ``pathlib.Path`` under which the prediction files are written.

    Returns:
        A tuple ``(question_id_to_dict, commands)``: the mapping from question
        id to its question dict (with a ``language`` key added) and one
        command dict per prediction.
    """
    commands = []
    question_id_to_dict = {}

    for pred_list, language, q_dict in tqdm(
        zip(preds, languages, question_dicts), desc="Setup", total=len(preds)
    ):
        qid = len(question_id_to_dict)
        # Attach the language to a shallow copy so the dict passed in by the
        # caller is left untouched.
        question = {**q_dict, "language": language}
        question_id_to_dict[qid] = question
        for pred in pred_list:
            commands.append(
                make_file_and_command(
                    qid=qid,
                    idx=str(len(commands)),
                    pred=pred,
                    question=question,
                    timeout_override=language_timeout.get(language),
                    working_dir=tmp_dir,
                )
            )

    return question_id_to_dict, commands
|
142 |
+
|
143 |
+
|
144 |
+
@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class BabelCodeEval(evaluate.Metric):
    """Execution-based metric for predictions on BabelCode datasets."""

    def _info(self):
        """Return the metric's description, citation and input features."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the metrics page.
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string")),
                    "languages": datasets.Value("string"),
                }
            ),
            homepage="https://github.com/google-research/babelcode",
            codebase_urls=["https://github.com/google-research/babelcode"],
            reference_urls=["https://github.com/google-research/babelcode"],
        )

    def _compute(
        self,
        predictions,
        languages,
        question_dicts,
        k=(1, 10, 100),
        num_workers=4,
        language_timeout=None,
    ):
        """Execute the predictions and compute per-language metrics.

        Args:
            predictions: For each question, the list of predictions to run.
            languages: The language of each question.
            question_dicts: The question information for each question.
            k: The values of k for which pass@k is reported.
            num_workers: Number of worker processes used for execution.
            language_timeout: Optional dict mapping a language to a timeout
                that overrides the timeouts stored in its question dicts.

        Returns:
            A tuple ``(metrics, all_results)``. ``metrics`` maps
            ``"<language>/pass@<k>"`` and ``"<language>/mean_pct_pass"`` to
            their values; ``all_results`` holds one result dict per
            prediction.

        Raises:
            ValueError: If the ``HF_ALLOW_CODE_EVAL`` environment variable is
                not set to ``"1"``, or if ``question_dicts`` does not have
                one entry per question in ``predictions``.
        """

        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
            raise ValueError(_WARNING)

        # The three inputs are zipped together downstream, which would
        # silently drop questions if the lengths differed.
        if len(question_dicts) != len(predictions):
            raise ValueError(
                "question_dicts must contain one entry per question: got "
                f"{len(question_dicts)} question dicts for "
                f"{len(predictions)} questions."
            )

        language_timeout = language_timeout or {}

        # The prediction files only need to exist while they are executed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            working_dir = Path(tmp_dir)
            question_map, pred_commands = _write_preds(
                preds=predictions,
                languages=languages,
                language_timeout=language_timeout,
                question_dicts=question_dicts,
                tmp_dir=working_dir,
            )

            results = execute_predictions(
                pred_commands,
                num_workers=num_workers,
                max_task_per_child=5,
                garbage_collection_freq=500,
            )

        all_results, q_passes, q_pct = _eval_predictions(
            results, question_map
        )

        assert len(q_passes) == len(q_pct)
        metrics = {}
        for lang in q_passes:
            metrics.update(
                _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
            )
        return metrics, all_results
|
224 |
+
|
225 |
+
def _eval_single_pred(result, test_ids, num_expected_commands):
    """Classify the execution result of a single prediction.

    Args:
        result: Execution result dict whose ``"results"`` entry lists the
            result object of every command that was run.
        test_ids: The ids of the test cases expected in the output.
        num_expected_commands: Number of commands the question defines.

    Returns:
        A tuple ``(outcome, num_passed, test_case_results)`` where ``outcome``
        is one of ``"PASSED"``, ``"FAILED"``, ``"HAD_ERROR"``, ``"TIMED_OUT"``
        or ``"UNKNOWN_ERROR"``, and ``test_case_results`` maps each test id to
        the status parsed from stdout (``"MISSING"`` when none was found).
    """
    outcomes_by_test = dict.fromkeys(test_ids, "MISSING")

    command_results = result["results"]
    # Fewer results than commands means execution stopped early.
    if len(command_results) != num_expected_commands:
        return "HAD_ERROR", 0, outcomes_by_test

    # Only the final command's output carries the test results.
    final = command_results[-1]
    if final.timed_out:
        return "TIMED_OUT", 0, outcomes_by_test
    if final.return_code != 0 or not final.stdout:
        return "HAD_ERROR", 0, outcomes_by_test

    for test_id, raw_status in STDOUT_PARSE_REGEX.findall(final.stdout):
        if test_id not in test_ids:
            continue
        # The same test id reported twice is treated as an unknown error.
        if outcomes_by_test[test_id] != "MISSING":
            return "UNKNOWN_ERROR", 0, outcomes_by_test
        outcomes_by_test[test_id] = raw_status.strip()

    statuses = list(outcomes_by_test.values())
    num_passed = statuses.count("PASSED")

    # Any status other than PASSED/FAILED counts as an error and takes
    # precedence over plain test failures.
    if any(status not in ("PASSED", "FAILED") for status in statuses):
        return "HAD_ERROR", num_passed, outcomes_by_test
    if "FAILED" in statuses:
        return "FAILED", num_passed, outcomes_by_test

    return "PASSED", num_passed, outcomes_by_test
|
262 |
+
|
263 |
+
|
264 |
+
def _eval_predictions(pred_results, question_map):
    """Evaluate every execution result and group the outcomes by language.

    Each result dict is updated in place: its ``"results"`` are converted to
    plain dicts and the ``"test_cases"`` and ``"outcome"`` keys are added.

    Args:
        pred_results: The execution result dicts, one per prediction.
        question_map: Mapping from question id to its question dict.

    Returns:
        A tuple ``(out, question_results, question_pct_pass)``: the updated
        result dicts, and two ``language -> question id -> list`` mappings
        holding, per prediction, whether all tests passed and the fraction
        of tests passed.
    """
    evaluated = []
    passed_by_lang = defaultdict(lambda: defaultdict(list))
    pct_by_lang = defaultdict(lambda: defaultdict(list))

    for pred in pred_results:
        qid = pred["qid"]
        question = question_map[qid]

        outcome, num_passed, per_test = _eval_single_pred(
            pred,
            test_ids=question["test_case_ids"],
            num_expected_commands=len(question["commands"]),
        )

        # Replace the dataclass instances with plain dicts.
        pred["results"] = [dataclasses.asdict(r) for r in pred["results"]]
        pred["test_cases"] = per_test
        pred["outcome"] = outcome

        language = question['language']
        num_tests = len(per_test)
        passed_by_lang[language][qid].append(num_passed == num_tests)
        pct_by_lang[language][qid].append(num_passed / num_tests)

        evaluated.append(pred)

    return evaluated, passed_by_lang, pct_by_lang
|
293 |
+
|
294 |
+
|
295 |
+
def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
    """Compute pass@k and the mean percent of tests passed for one language.

    Args:
        lang: Language name used as the prefix of every metric key.
        q_passed: Mapping from question id to a list of booleans, one per
            prediction, telling whether it passed all tests.
        q_pcts: Mapping from question id to the list of per-prediction
            fractions of tests passed.
        k_vals: The values of k to compute pass@k for.

    Returns:
        A dict with one ``"<lang>/pass@<k>"`` entry per k and a
        ``"<lang>/mean_pct_pass"`` entry.
    """
    assert len(q_passed) == len(q_pcts)

    num_questions = len(q_passed)
    num_samples = np.zeros(num_questions)
    num_correct = np.zeros(num_questions)
    pcts_passed = np.zeros(num_questions)
    for row, (qid, passes) in enumerate(q_passed.items()):
        num_samples[row] = len(passes)
        num_correct[row] = sum(passes)
        pcts_passed[row] = np.mean(q_pcts[qid])

    out = {}
    for k in k_vals:
        per_question = estimate_pass_at_k(num_samples, num_correct, k)
        out[f'{lang}/pass@{k}'] = per_question.mean()
    out[f'{lang}/mean_pct_pass'] = np.mean(pcts_passed)

    return out
|
312 |
+
|
313 |
+
|
314 |
+
|
315 |
+
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimate pass@k for every problem and return the values as an array.

    Args:
        num_samples: Either a single int used for all problems, or a sequence
            with the number of samples of each problem.
        num_correct: Sequence with the number of correct samples per problem.
        k: The k of pass@k.

    Returns:
        A numpy array with one pass@k estimate per problem.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """Compute 1 - comb(n - c, k) / comb(n, k)."""
        # With fewer than k incorrect samples every draw of k contains a
        # correct one.
        if n - c < k:
            return 1.0
        all_fail_probability = np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
        return 1.0 - all_fail_probability

    if isinstance(num_samples, int):
        samples_per_problem = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        samples_per_problem = iter(num_samples)

    estimates = [
        estimator(int(n), int(c), k)
        for n, c in zip(samples_per_problem, num_correct)
    ]
    return np.array(estimates)
|
execution.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import gc
|
3 |
+
import multiprocessing as mp
|
4 |
+
import pathlib
|
5 |
+
import subprocess
|
6 |
+
from dataclasses import dataclass
|
7 |
+
from typing import Dict, List
|
8 |
+
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
@dataclass
class CommandResult:
    """Outcome of running a single command in a subprocess."""

    # Exit status of the process; `safe_execute` reports -1 when the command
    # timed out.
    return_code: int
    # Elapsed time in seconds; `safe_execute` reports the timeout value itself
    # when the command timed out.
    runtime: float
    # Decoded standard output; `safe_execute` leaves it as None on a timeout.
    stdout: str
    # Decoded standard error; `safe_execute` leaves it as None on a timeout.
    stderr: str
    # True when the command did not finish within its timeout.
    timed_out: bool
|
18 |
+
|
19 |
+
def safe_execute(
    command_to_run: List[str],
    working_dir: pathlib.Path,
    timeout: int = 10,
) -> CommandResult:
    """Run a single command in a subprocess with a timeout.

    Args:
        command_to_run: The command and its arguments.
        working_dir: The directory to run the command in.
        timeout: Maximum number of seconds to wait for the command.

    Returns:
        The result of executing the command. When the command times out,
        ``timed_out`` is True, ``return_code`` is -1, ``runtime`` equals
        ``timeout`` and ``stdout``/``stderr`` are None.
    """
    timed_out = False
    return_code = -1
    runtime = timeout
    stderr = None
    stdout = None
    start_time = datetime.datetime.now()
    execution_process = subprocess.Popen(
        command_to_run,
        cwd=str(working_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        raw_stdout, raw_stderr = execution_process.communicate(timeout=timeout)
        runtime = (datetime.datetime.now() - start_time).total_seconds()

        # The executed programs are untrusted and may print arbitrary bytes;
        # replace undecodable sequences instead of raising in the worker.
        stdout = raw_stdout.decode("utf-8", errors="replace")
        stderr = raw_stderr.decode("utf-8", errors="replace")
        return_code = execution_process.returncode
    except subprocess.TimeoutExpired:
        timed_out = True
        runtime = timeout
        # communicate() does not kill the child on timeout. Kill it and
        # finish the communication so the process is reaped and its pipes
        # are closed; the partial output is deliberately discarded.
        execution_process.kill()
        try:
            execution_process.communicate(timeout=1)
        except subprocess.TimeoutExpired:
            # Something else (e.g. a grandchild) still holds the pipes open.
            # Do not block on it; just reap the killed child.
            execution_process.wait()
    finally:
        # Guarantee the child is not left running if anything above raised.
        execution_process.kill()

    return CommandResult(
        return_code=return_code,
        runtime=runtime,
        stderr=stderr,
        stdout=stdout,
        timed_out=timed_out,
    )
|
67 |
+
|
68 |
+
|
69 |
+
def execute_code(sample: Dict):
    """Run the commands of one prediction in order.

    Execution stops at the first command that times out or exits with a
    non-zero return code.

    Args:
        sample: Dict with the prediction's ``qid``, ``idx``, ``cwd`` (a path)
            and ``commands`` (dicts with ``command`` and ``timeout``).

    Returns:
        A dict with ``qid``, ``idx``, ``file_path``, the list of command
        ``results`` and the ``failed`` and ``timed_out`` flags.
    """
    sample_path = sample["cwd"]

    # When given a file, run from the directory that contains it.
    run_dir = sample_path
    if sample_path.is_file():
        run_dir = sample_path.parent
    run_dir = run_dir.resolve().absolute()

    command_results = []
    failed = False
    timed_out = False
    for spec in sample['commands']:
        outcome = safe_execute(
            spec['command'],
            working_dir=run_dir,
            timeout=spec['timeout'],
        )
        command_results.append(outcome)
        if outcome.timed_out:
            timed_out = True
            break
        if outcome.return_code != 0:
            failed = True
            break

    return {
        "qid": sample['qid'],
        "idx": sample["idx"],
        "file_path": str(sample_path.absolute().resolve()),
        "results": command_results,
        "failed": failed,
        "timed_out": timed_out,
    }
|
103 |
+
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
def execute_predictions(
    predictions: List[Dict],
    num_workers: int = 1,
    max_task_per_child: int = 1,
    garbage_collection_freq: int = 500,
):
    """Execute a list of predictions in a pool of worker processes.

    Args:
        predictions: The prediction dicts to pass to ``execute_code``.
        num_workers: The number of worker processes to use.
        max_task_per_child: The maximum number of tasks a worker runs before
            it is replaced.
        garbage_collection_freq: Run garbage collection every time this many
            predictions have completed.

    Returns:
        The list of raw execution results, in the order they completed.
    """
    total = len(predictions)
    results = []
    with mp.Pool(num_workers, maxtasksperchild=max_task_per_child) as pool:
        # Wrap the unordered result iterator so progress is displayed.
        progress = tqdm(
            pool.imap_unordered(execute_code, predictions),
            total=total,
            desc="Executing",
        )
        for num_completed, result in enumerate(progress, start=1):
            results.append(result)

            if num_completed % garbage_collection_freq == 0:
                gc.collect()

        # Cleanup pool
        pool.close()
        pool.terminate()
    return results
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate@main
|