BC eval

Browse files:
- README.md (+242 -5)
- app.py (+5 -0)
- bc_eval.py (+335 -0)
- execution.py (+145 -0)
- requirements.txt (+1 -0)
README.md (CHANGED)

@@ -1,12 +1,249 @@

Previous front matter:

---
title:
colorFrom: pink
colorTo: red
sdk: gradio
sdk_version: 3.
app_file: app.py
pinned: false
---
New README.md:

---
title: BabelCode Eval
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
  This metric implements the evaluation harness for datasets translated with the
  BabelCode framework as described in the paper "Measuring The Impact Of
  Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
---

# Metric Card for bc_eval


## Metric Description
This metric implements the evaluation harness for datasets translated with the BabelCode framework, as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).

## How to Use
1. Generate predictions for a BabelCode-supported dataset.
2. Aggregate the predictions by their question.
3. For each question, add the `question_info` from the original BabelCode dataset to its aggregated predictions.
4. Run the metric on the `predictions`, `languages`, and `question_infos`.
5. The metric returns a tuple: the first value is the metrics dict and the second is the results for each individual prediction.

```python
import evaluate
from datasets import load_dataset
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

predictions = []
languages = []
question_infos = []
ds = load_dataset("gabeorlanski/bc-humaneval", split="test")

for row in ds:
    languages.append(row['language'])
    question_infos.append(row['question_info'])

    # Replace this with however you generate and postprocess predictions.
    predictions.append(model.generate(row['signature_with_docstring']))


metric = evaluate.load("bc_eval")
metrics, results = metric.compute(
    predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
)
```

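If you sample several candidates per question, keep all of a question's candidates together in one inner list so that `predictions` is a `List[List[str]]`. A minimal sketch, assuming `n_samples` generations per row and the same hypothetical `model.generate` helper as above:

```python
n_samples = 5  # assumed number of candidates sampled per question

predictions, languages, question_infos = [], [], []
for row in ds:
    languages.append(row["language"])
    question_infos.append(row["question_info"])
    # One inner list per question, holding every sampled candidate.
    predictions.append(
        [model.generate(row["signature_with_docstring"]) for _ in range(n_samples)]
    )
```
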
### Inputs
* `predictions` (`List[List[str]]`): The list of predictions for each question to execute.
* `languages` (`List[str]`): The language to use for each question.
* `question_dicts` (`List[Dict]`): The information for each question.
* `k` (`List[int]`): Number of code candidates to consider in the evaluation (Default: `[1, 10, 100]`).
* `num_workers` (`int`): Number of workers used to evaluate the candidate programs (Default: `4`).
* `language_timeout` (`Dict[str, int]`): Timeouts to use for each language. If it is not set, the timeout from the question dict is used (Default: `None`).

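For example, a run that scores the top 1 and top 10 candidates with more workers and a per-language timeout override might look like the sketch below (the language names are assumed to match those used in the dataset, and the timeout values are assumed to be in seconds):

```python
metrics, results = metric.compute(
    predictions=predictions,
    languages=languages,
    question_dicts=question_infos,
    k=[1, 10],
    num_workers=8,
    language_timeout={"Python": 15, "Java": 30},  # assumed per-language timeouts
)
```
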
### Output Values

The `bc_eval` metric outputs two things:

* `metrics`: a dictionary with the pass rates for each k value defined in the arguments and the mean percent of tests passed per question. The keys are formatted as `{LANGUAGE NAME}/{METRIC NAME}`.

* `results`: a list of dictionaries with the results from each individual prediction.

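For instance, with Python-only predictions and `k=[1]`, the two reported scores can be read as in this small sketch (the key names follow the `{LANGUAGE NAME}/{METRIC NAME}` pattern described above):

```python
print(metrics["Python/pass@1"])         # estimated pass@1, averaged over questions
print(metrics["Python/mean_pct_pass"])  # mean fraction of test cases passed per question
```
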
#### Values from Popular Papers
[PaLM-2](https://arxiv.org/pdf/2305.10403.pdf) performance on BC-HumanEval (`pass@1` with greedy decoding):

| Language   | PaLM 2-S* | PaLM 540B | PaLM-Coder-540B |
|------------|-----------|-----------|-----------------|
| C#         | 24.22     | 20.5      | **26.09**       |
| C++        | **34.16** | 21.74     | 24.22           |
| Go         | 19.25     | 13.66     | **21.12**       |
| Haskell    | **8.7**   | 1.86      | 1.86            |
| Java       | **31.06** | 20.5      | 25.47           |
| JavaScript | **32.3**  | 23.6      | 29.81           |
| Julia      | **16.77** | 2.48      | 4.35            |
| Lua        | **26.09** | 19.25     | 24.84           |
| PHP        | **26.09** | 18.63     | 25.47           |
| Python     | **34.16** | 17.39     | 26.71           |
| Rust       | **28.57** | 16.15     | 22.98           |
| TypeScript | **32.3**  | 17.39     | 30.43           |

### Examples
Below are full examples with predictions that pass, fail tests, time out, and raise an error.

#### Passing Example
```python
import evaluate
from datasets import load_dataset
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
example = ds[0]
metric = evaluate.load("bc_eval")
languages = ["Python"]
question_infos = [example["question_info"]]
predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False"""
]]
metrics, results = metric.compute(
    predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
)
```
`metrics` is:
```
{"Python/pass@1": 1.0, "Python/mean_pct_pass": 1.0}
```
`results` is:
```
[{"qid": 0, "idx": "0", "file_path": ".../tmpqt_p3dwn/0", "results": [{"return_code": 0, "runtime": 0.076369, "stdout": "TEST-0...PASSED\r\nTEST-1...PASSED\r\nTEST-2...PASSED\r\nTEST-3...PASSED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...PASSED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "PASSED", "2": "PASSED", "3": "PASSED", "4": "PASSED", "5": "PASSED", "6": "PASSED"}, "outcome": "PASSED"}]
```

#### Fails Test Example

```python
import evaluate
from datasets import load_dataset
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
ds = load_dataset(
    "gabeorlanski/bc-humaneval", "Python", split="test"
)
example = ds[0]
metric = evaluate.load("bc_eval")
languages = ["Python"]
question_infos = [example["question_info"]]
predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = elem - elem2
                if distance < threshold:
                    return True

    return False"""
]]
metrics, results = metric.compute(
    predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
)
```

`metrics` is:
```
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.5714285714285714}
```
`results` is:
```
[{"qid": 0, "idx": "0", "file_path": "/tmp7u587vk5/0", "results": [{"return_code": 0, "runtime": 0.08255, "stdout": "TEST-0...PASSED\r\nTEST-1...FAILED\r\nTEST-2...PASSED\r\nTEST-3...FAILED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...FAILED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "FAILED", "2": "PASSED", "3": "FAILED", "4": "PASSED", "5": "PASSED", "6": "FAILED"}, "outcome": "FAILED"}]
```

Note that the individual test results are located in `results`.

#### Timeout Example

```python
import evaluate
from datasets import load_dataset
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
ds = load_dataset(
    "gabeorlanski/bc-humaneval", "Python", split="test"
)
example = ds[0]
metric = evaluate.load("bc_eval")
languages = ["Python"]
question_infos = [example["question_info"]]
predictions = [["""import time
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    time.sleep(100)
"""
]]
metrics, results = metric.compute(
    predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
)
```

`metrics` is:
```
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
```
`results` is:
```
[{"qid": 0, "idx": "0", "file_path": "/tmp_rz6bhb9/0", "results": [{"return_code": -1, "runtime": 10, "stdout": null, "stderr": null, "timed_out": true}], "failed": false, "timed_out": true, "test_cases": {"0": "MISSING", "1": "MISSING", "2": "MISSING", "3": "MISSING", "4": "MISSING", "5": "MISSING", "6": "MISSING"}, "outcome": "TIMED_OUT"}]
```

#### Error Example

```python
import evaluate
from datasets import load_dataset
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
ds = load_dataset(
    "gabeorlanski/bc-humaneval", "Python", split="test"
)
example = ds[0]
metric = evaluate.load("bc_eval")
languages = ["Python"]
question_infos = [example["question_info"]]
predictions = [["""import time
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    raise ValueError()
""",
"""def add(a, b):
    return a+b"""
]]
metrics, results = metric.compute(
    predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
)
```

`metrics` is:
```
{"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
```
`results` is:
```
[{"qid": 0, "idx": "0", "file_path": "/tmpjdn51aaa/0", "results": [{"return_code": 0, "runtime": 0.102855, "stdout": "TEST-0...ValueError\r\nTEST-1...ValueError\r\nTEST-2...ValueError\r\nTEST-3...ValueError\r\nTEST-4...ValueError\r\nTEST-5...ValueError\r\nTEST-6...ValueError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "ValueError", "1": "ValueError", "2": "ValueError", "3": "ValueError", "4": "ValueError", "5": "ValueError", "6": "ValueError"}, "outcome": "HAD_ERROR"},
{"qid": 0, "idx": "1", "file_path": "/tmpjdn51aaa/1", "results": [{"return_code": 0, "runtime": 0.094347, "stdout": "TEST-0...NameError\r\nTEST-1...NameError\r\nTEST-2...NameError\r\nTEST-3...NameError\r\nTEST-4...NameError\r\nTEST-5...NameError\r\nTEST-6...NameError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "NameError", "1": "NameError", "2": "NameError", "3": "NameError", "4": "NameError", "5": "NameError", "6": "NameError"}, "outcome": "HAD_ERROR"}]
```

## Limitations and Bias
This metric requires that the dataset be BabelCode compatible.

## Citation
```
@article{orlanski2023measuring,
  title={Measuring The Impact Of Programming Language Distribution},
  author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
  journal={arXiv preprint arXiv:2302.01973},
  year={2023}
}
```
app.py (ADDED)

@@ -0,0 +1,5 @@

```python
import evaluate
from evaluate.utils import launch_gradio_widget

module = evaluate.load("gabeorlanski/bc_eval")
launch_gradio_widget(module)
```
bc_eval.py (ADDED)

@@ -0,0 +1,335 @@

```python
import dataclasses
import itertools
import os
import re
import tempfile
from collections import defaultdict
from pathlib import Path

import datasets
import evaluate
import numpy as np
from tqdm import tqdm

from .execution import execute_predictions

STDOUT_PARSE_REGEX = re.compile(r"^TEST-(.+)\.\.\.(.+)$", flags=re.MULTILINE)

_CITATION = """\
@article{orlanski2023measuring,
  title={Measuring The Impact Of Programming Language Distribution},
  author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
  journal={arXiv preprint arXiv:2302.01973},
  year={2023}
}
"""

_DESCRIPTION = """\
This metric implements the evaluation harness for datasets translated with the BabelCode framework as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
"""


_KWARGS_DESCRIPTION = """
Calculates how many predictions per question pass a set of tests for the given problem.

Args:
    predictions: The list of predictions for each question to execute.
    languages: The language to use for each question.
    question_dicts: The information for each question.
    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
    num_workers: number of workers used to evaluate the candidate programs (Default: 4).
    language_timeout: Timeouts to use for each language. If it is not set, will default to the one in the question dict (Default: None).
Returns:
    pass_at_k: dict with pass rates for each k
    results: dict with granular results of each unittest
Examples:
    >>> bc_eval = evaluate.load("bc_eval")
    >>> predictions = [["def add(a,b):\n\treturn a+b", "def add(a,b):\n\treturn a-b"]]
    >>> languages = ["Python"]
    >>> question_dicts = [{"test_code": "...", "entry_fn_name": "add","entry_cls_name":"Solution", "test_case_ids":["0","1"],"test_list":"..."}]
    >>> pass_at_k, results = bc_eval.compute(predictions=predictions,languages=languages, question_dicts=question_dicts, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 0.5, 'pass@2': 1.0}
"""


_WARNING = """
################################################################################
!!!WARNING!!!
################################################################################
The "bc_eval" metric executes untrusted model-generated code in Python.
Although it is highly unlikely that model-generated code will do something
overtly malicious in response to this test suite, model-generated code may act
destructively due to a lack of model capability or alignment.
Users are strongly encouraged to sandbox this evaluation suite so that it
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
Once you have read this disclaimer and taken appropriate precautions,
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
with:
>>> import os
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
################################################################################\
"""

_QUESTION_INFO_KEYS = {
    "entry_fn_name",
    "entry_cls_name",
    "test_code",
    "test_list",
    "test_case_ids",
}


def make_file_and_command(
    qid, idx, pred, question, working_dir, timeout_override=None
):
    # Fill the BabelCode test harness with the prediction and build the
    # command(s) that will run it.
    file_name = f"pred.{question['extension']}"
    pred_dir = working_dir.joinpath(idx)
    pred_dir.mkdir(parents=True)
    pred_file = pred_dir.joinpath(file_name)
    with pred_file.open("w") as f:
        code = question["test_code"].replace("PLACEHOLDER_CODE_BODY", pred)
        code = code.replace("PLACEHOLDER_FN_NAME", question["entry_fn_name"])
        code = code.replace("PLACEHOLDER_CLS_NAME", question["entry_cls_name"])
        f.write(code)

    commands = []
    for cmd, t in zip(question["commands"], question["timeouts"]):
        commands.append(
            {
                "timeout": t if timeout_override is None else timeout_override,
                "command": [
                    c if c != "__FILENAME__" else file_name for c in cmd
                ],
            }
        )

    return {"qid": qid, "idx": idx, "commands": commands, "cwd": pred_dir}


def _write_preds(
    preds,
    languages,
    language_timeout,
    question_dicts,
    tmp_dir,
):
    commands = []
    question_id_to_dict = {}

    for pred_list, l, q_dict in tqdm(
        zip(preds, languages, question_dicts), desc="Setup", total=len(preds)
    ):
        qid = len(question_id_to_dict)
        q_dict['language'] = l
        question_id_to_dict[qid] = q_dict
        for p in pred_list:
            commands.append(
                make_file_and_command(
                    qid=qid,
                    idx=str(len(commands)),
                    pred=p,
                    question=q_dict,
                    timeout_override=language_timeout.get(l),
                    working_dir=tmp_dir,
                )
            )

    return question_id_to_dict, commands


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class BabelCodeEval(evaluate.Metric):
    def _info(self):
        list_keys = ["timeouts", "commands", "test_case_ids"]
        question_info_type = {
            k: datasets.Value(dtype="string")
            for k in _QUESTION_INFO_KEYS
            if k not in list_keys
        }
        question_info_type["test_case_ids"] = datasets.Value("string")
        question_info_type["commands"] = datasets.Sequence(
            datasets.Value("string")
        )
        question_info_type["timeouts"] = datasets.Sequence(
            datasets.Value("int32")
        )

        return evaluate.MetricInfo(
            # This is the description that will appear on the metrics page.
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string")),
                    "languages": datasets.Value("string"),
                }
            ),
            homepage="https://github.com/google-research/babelcode",
            codebase_urls=["https://github.com/google-research/babelcode"],
            reference_urls=["https://github.com/google-research/babelcode"],
        )

    def _compute(
        self,
        predictions,
        languages,
        question_dicts,
        k=[1, 10, 100],
        num_workers=4,
        language_timeout=None,
    ):
        """Returns the scores"""

        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
            raise ValueError(_WARNING)

        language_timeout = language_timeout or {}

        with tempfile.TemporaryDirectory() as tmp_dir:
            working_dir = Path(tmp_dir)
            question_map, pred_commands = _write_preds(
                preds=predictions,
                languages=languages,
                language_timeout=language_timeout,
                question_dicts=question_dicts,
                tmp_dir=working_dir,
            )

            results = execute_predictions(
                pred_commands,
                num_workers=num_workers,
                max_task_per_child=5,
                garbage_collection_freq=500,
            )

        all_results, q_passes, q_pct = _eval_predictions(
            results, question_map
        )

        assert len(q_passes) == len(q_pct)
        metrics = {}
        for lang in q_passes:
            metrics.update(_calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k))
        return metrics, all_results


def _eval_single_pred(result, test_ids, num_expected_commands):
    test_case_results = {k: "MISSING" for k in test_ids}
    if len(result["results"]) != num_expected_commands:
        return "HAD_ERROR", 0, test_case_results

    last_result = result["results"][-1]
    if last_result.timed_out:
        return "TIMED_OUT", 0, test_case_results
    elif last_result.return_code != 0:
        return "HAD_ERROR", 0, test_case_results
    elif not last_result.stdout:
        return "HAD_ERROR", 0, test_case_results

    for match in STDOUT_PARSE_REGEX.findall(last_result.stdout):
        idx, test_result = match
        if idx in test_ids:
            if test_case_results[idx] != "MISSING":
                return "UNKNOWN_ERROR", 0, test_case_results
            test_case_results[idx] = test_result.strip()

    did_test_fail = False
    had_error = False
    num_passed = 0
    for r in test_case_results.values():
        if r == "PASSED":
            num_passed += 1
        elif r == "FAILED":
            did_test_fail = True
        else:
            had_error = True

    if had_error:
        return "HAD_ERROR", num_passed, test_case_results
    if did_test_fail:
        return "FAILED", num_passed, test_case_results

    return "PASSED", num_passed, test_case_results


def _eval_predictions(pred_results, question_map):
    out = []
    question_results = defaultdict(lambda: defaultdict(list))
    question_pct_pass = defaultdict(lambda: defaultdict(list))

    for p in pred_results:
        question = question_map[p["qid"]]
        test_cases = question["test_case_ids"]
        num_expected_commands = len(question["commands"])

        outcome, num_passed, test_case_results = _eval_single_pred(
            p, test_ids=test_cases, num_expected_commands=num_expected_commands
        )

        p["results"] = [dataclasses.asdict(r) for r in p["results"]]
        p["test_cases"] = test_case_results
        p["outcome"] = outcome

        lang = question['language']
        question_results[lang][p["qid"]].append(
            num_passed == len(test_case_results)
        )
        question_pct_pass[lang][p["qid"]].append(
            num_passed / len(test_case_results)
        )

        out.append(p)

    return out, question_results, question_pct_pass


def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
    assert len(q_passed) == len(q_pcts)

    num_samples = np.zeros(len(q_passed))
    num_correct = np.zeros(len(q_passed))
    pcts_passed = np.zeros(len(q_passed))
    for i, (k, v) in enumerate(q_passed.items()):
        num_samples[i] = len(v)
        num_correct[i] = sum(v)
        pcts_passed[i] = np.mean(q_pcts[k])

    out = {f'{lang}/pass@{k}': estimate_pass_at_k(num_samples, num_correct, k).mean() for k in k_vals}
    out[f'{lang}/mean_pct_pass'] = np.mean(pcts_passed)

    return out


def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimates pass@k of each problem and returns them in an array."""

    def estimator(n: int, c: int, k: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array(
        [
            estimator(int(n), int(c), k)
            for n, c in zip(num_samples_it, num_correct)
        ]
    )
```
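For reference, the nested `estimator` helper above implements the unbiased estimate described in its docstring; in closed form, the per-question value that `_calculate_metrics` averages over questions for each language is

$$ \text{pass@}k = 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}}, $$

where \(n\) is the number of candidates evaluated for the question and \(c\) is the number of those candidates that pass every test case.
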
execution.py (ADDED)

@@ -0,0 +1,145 @@

```python
import datetime
import gc
import multiprocessing as mp
import pathlib
import subprocess
from dataclasses import dataclass
from typing import Dict, List

from tqdm import tqdm


@dataclass
class CommandResult:
    return_code: int
    runtime: float
    stdout: str
    stderr: str
    timed_out: bool


def safe_execute(
    command_to_run: List[str],
    working_dir: pathlib.Path,
    timeout: int = 10,
) -> CommandResult:
    """Executes a command safely.

    Args:
        command_to_run: The command to run.
        working_dir: The working directory to run it in.
        timeout: The timeout in seconds.

    Returns:
        The result of executing the command.
    """
    timed_out = False
    return_code = -1
    runtime = timeout
    stderr = None
    stdout = None
    start_time = datetime.datetime.now()
    execution_process = subprocess.Popen(
        command_to_run,
        cwd=str(working_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        outputs = execution_process.communicate(timeout=timeout)

        stdout, stderr = outputs
        stdout = stdout.decode('utf-8')
        stderr = stderr.decode('utf-8')
        runtime = (datetime.datetime.now() - start_time).total_seconds()
        return_code = execution_process.returncode
    except subprocess.TimeoutExpired:
        timed_out = True
        runtime = timeout
    finally:
        execution_process.kill()

    return CommandResult(
        return_code=return_code,
        runtime=runtime,
        stderr=stderr,
        stdout=stdout,
        timed_out=timed_out,
    )


def execute_code(sample: Dict):
    """Execute a file of code.

    Args:
        sample: The sample to run.

    Returns:
        The execution result.
    """
    file_path = sample["cwd"]
    working_dir_for_execution = (
        file_path.parent if file_path.is_file() else file_path
    )
    working_dir_for_execution = working_dir_for_execution.resolve().absolute()
    timed_out = False
    failed = False
    results = []
    for command in sample['commands']:
        res = safe_execute(command['command'], working_dir=working_dir_for_execution, timeout=command['timeout'])
        results.append(res)
        # Stop at the first command that times out or exits with a non-zero code.
        if res.timed_out:
            timed_out = True
            break
        if res.return_code != 0:
            failed = True
            break
    return {
        "qid": sample['qid'],
        "idx": sample["idx"],
        "file_path": str(file_path.absolute().resolve()),
        "results": results,
        "failed": failed,
        "timed_out": timed_out,
    }


def execute_predictions(
    predictions: List[Dict],
    num_workers: int = 1,
    max_task_per_child: int = 1,
    garbage_collection_freq: int = 500,
):
    """Execute a list of predictions in a specific language.

    Args:
        predictions: List of predictions.
        num_workers: The number of workers to use.
        max_task_per_child: The maximum number of tasks run per child before it is killed.
        garbage_collection_freq: How often to run garbage collection.

    Returns:
        The array of raw execution results.
    """

    # Make the arguments to submit to the multiprocessing pool. Do it here so
    # we can have a progress bar as well.
    num_to_complete = len(predictions)
    num_completed = 0
    results = []
    with mp.Pool(num_workers, maxtasksperchild=max_task_per_child) as pool:
        for result in tqdm(
            pool.imap_unordered(execute_code, predictions),
            total=num_to_complete,
            desc="Executing",
        ):
            num_completed += 1

            results.append(result)

            if num_completed % garbage_collection_freq == 0:
                gc.collect()
        # Cleanup pool
        pool.close()
        pool.terminate()
    return results
```
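As a rough illustration of the contract between these pieces: `execute_code` consumes one of the command dicts produced by `make_file_and_command` in `bc_eval.py` and runs each command in that prediction's working directory. A minimal standalone sketch (assuming the module is importable as `execution`; the file name and command here are made up for illustration):

```python
import pathlib
import tempfile

from execution import execute_code

with tempfile.TemporaryDirectory() as tmp:
    work_dir = pathlib.Path(tmp)
    # Stand-in for the test harness file that make_file_and_command would write.
    (work_dir / "pred.py").write_text("print('TEST-0...PASSED')")
    sample = {
        "qid": 0,
        "idx": "0",
        "cwd": work_dir,
        "commands": [{"command": ["python", "pred.py"], "timeout": 10}],
    }
    result = execute_code(sample)
    print(result["timed_out"], result["failed"])  # False False on success
```
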
requirements.txt (ADDED)

@@ -0,0 +1 @@

```
git+https://github.com/huggingface/evaluate@main
```