martinjosifoski committed · Commit 3c7fd6a · Parent(s): 716d007
Add Codeforces Flows.
- .gitignore +149 -0
- CF_Code.py +6 -0
- CF_Code.yaml +82 -0
- CF_CodeCollab.py +6 -0
- CF_CodeCollab.yaml +93 -0
- CF_CodeCritic.py +6 -0
- CF_CodeCritic.yaml +87 -0
- CF_CodeCriticWrongAttempt.py +6 -0
- CF_CodeCriticWrongAttempt.yaml +90 -0
- CF_CodeDebug.py +12 -0
- CF_CodeDebug.yaml +158 -0
- CF_CodeDebugCollab.py +12 -0
- CF_CodeDebugCollab.yaml +89 -0
- CF_CodeDebugCritic.py +13 -0
- CF_CodeDebugCritic.yaml +131 -0
- CF_CodeReflect.py +6 -0
- CF_CodeReflect.yaml +85 -0
- CF_CodeTesting.py +52 -0
- CF_CodeTesting.yaml +12 -0
- CodeTesting.py +30 -0
- FixedReply_CodeReflect.py +6 -0
- FixedReply_CodeReflect.yaml +18 -0
- __init__.py +20 -0
- src/__init__.py +0 -0
- src/data_transformations/__init__.py +2 -0
- src/data_transformations/correctness_flag.py +14 -0
- src/data_transformations/testing_results_summary_generation.py +107 -0
- src/datasets/__init__.py +0 -0
- src/datasets/schema.py +74 -0
- src/evaluation/__init__.py +0 -0
- src/evaluation/testing_utils_codeforces.py +449 -0
- src/evaluation/testing_utils_leetcode.py +258 -0
.gitignore
ADDED
@@ -0,0 +1,149 @@
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit tests / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/build/
+docs/docs/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# datafiles
+.xml
+.pkl
+
+# misc
+.idea/
+.iml
+.dropbox
+
+# media files
+.png
+.jpg
+.pdf
+
+
+# auto-generated by flows, all synced modules will be ignored by default
+FLOW_MODULE_ID
CF_Code.py
ADDED
@@ -0,0 +1,6 @@
+from flows.application_flows import OpenAIChatAtomicFlow
+
+
+class CF_Code(OpenAIChatAtomicFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
CF_Code.yaml
ADDED
@@ -0,0 +1,82 @@
+name: "Code_Flow"
+description: |2-
+  Given a problem description, generate code directly.
+
+# ~~~ Input interface specification ~~~
+input_interface_non_initialized:  # Applied when constructing the first user message.
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+
+input_interface_initialized:  # Applied when constructing all subsequent user messages.
+  - "query"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "api_output"
+
+# ~~~ Flow specification ~~~
+model_name: "gpt-4"
+
+generation_parameters:
+  n: 1
+  max_tokens: 2000
+  temperature: 0.3
+
+  model_kwargs:
+    top_p: 0.2
+    frequency_penalty: 0
+    presence_penalty: 0
+
+system_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    Your goal is to provide executable Python code that solves a competitive programming problem. The code should correctly handle all corner cases in order to pass the hidden test cases, which are used to evaluate the correctness of the solution.
+
+    The user will specify the problem by providing you with:
+    - the problem statement
+    - input description
+    - output description
+    - example test cases
+    - (optional) explanation of the test cases
+
+    The user will provide you with a task and an output format that you will strictly follow.
+  input_variables: []
+  template_format: jinja2
+
+human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: "{{query}}"
+  input_variables:
+    - "query"
+  template_format: jinja2
+
+init_human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    # Problem statement
+    {{problem_description}}
+
+    # Input description
+    {{input_description}}
+
+    # Output description
+    {{output_description}}
+
+    {{io_examples_and_explanation}}
+
+
+    The input should be read from the standard input and the output should be passed to the standard output.
+    Return Python code that solves the problem. Reply in the following format:
+    ```python
+    {{code_placeholder}}
+    ```
+  input_variables:
+    - "problem_description"
+    - "input_description"
+    - "output_description"
+    - "io_examples_and_explanation"
+  partial_variables:
+    code_placeholder: "{{python_code}}"
+  template_format: jinja2
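The `partial_variables` entry above pre-fills `code_placeholder` with the literal string `{{python_code}}`, so the rendered prompt still shows a visible placeholder for the model to fill in. A minimal standalone sketch of that behavior, using only the `jinja2` package rather than the langchain `PromptTemplate` wrapper used in the config:

```python
# Standalone sketch of the partial-variable behavior (jinja2 only, not the
# langchain PromptTemplate wrapper referenced in the YAML above).
from jinja2 import Template

template = Template(
    "Return Python code that solves the problem. Reply in the following format:\n"
    "```python\n{{code_placeholder}}\n```"
)
# The partial variable is itself the literal string "{{python_code}}";
# inserted values are not re-rendered, so it survives as a placeholder.
print(template.render(code_placeholder="{{python_code}}"))
```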
CF_CodeCollab.py
ADDED
@@ -0,0 +1,6 @@
+from flows.base_flows import GeneratorCriticFlow
+
+
+class CF_CodeCollab(GeneratorCriticFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
CF_CodeCollab.yaml
ADDED
@@ -0,0 +1,93 @@
+name: "CodeCollab_Flow"
+description: |2-
+  Given a problem description, alternate between a step in which code is generated, and a step in which the produced code is evaluated and useful feedback is provided.
+
+
+max_rounds: 4
+
+input_interface:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+output_interface:
+  - "code"
+
+subflows_config:
+  CodeGenerator:
+    _target_: .CF_Code.instantiate_from_default_config
+    name: "CodeGenerator"
+    human_message_prompt_template:
+      _target_: langchain.PromptTemplate
+      template: |2-
+        # Feedback on the last proposed solution
+        {{code_feedback}}
+
+
+        Consider the original problem statement, the last proposed solution and the provided feedback. Does the solution need to be updated? If so, provide the corrected version of the code in the following format:
+        ```python
+        {{code_placeholder}}
+        ```
+        otherwise, reply:
+        "Final answer."
+      input_variables:
+        - code_feedback
+      partial_variables:
+        code_placeholder: "{{python_code}}"
+    input_interface_initialized:
+      - "code_feedback"
+  CodeCritic:
+    _target_: .CF_CodeCritic.instantiate_from_default_config
+
+topology:
+  # ~~~ Code Generator ~~~
+  - goal: "Generate/refine a solution."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeGenerator
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+          regex: '(?<=```python)([\s\S]*?)(?=```)'
+          regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+          input_key: "api_output"
+          output_key: "code"
+          strip: True
+          assert_unique: True
+        - _target_: flows.data_transformations.EndOfInteraction
+          end_of_interaction_string: "Final answer"
+          input_key: "api_output"
+          output_key: "end_of_interaction"
+        - _target_: flows.data_transformations.PrintPreviousMessages
+
+    reset: false
+
+  # ~~~ Code Critic ~~~
+  - goal: "Provide feedback for the candidate solution."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeCritic
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      keys_to_rename:
+        api_output: "code_feedback"
+
+    reset: true
+
+early_exit_key: "end_of_interaction"
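The two regular expressions configured for `RegexFirstOccurrenceExtractor` above carve the first ```python fenced block out of the model reply, falling back to any fenced block. A minimal standalone sketch of that extraction logic, using only the standard-library `re` module (the helper name `extract_code` is illustrative, not the library's API):

```python
# Standalone sketch of the regex extraction configured above.
import re
from typing import Optional

PYTHON_FENCE = re.compile(r"(?<=```python)([\s\S]*?)(?=```)")
ANY_FENCE = re.compile(r"(?<=```)([\s\S]*?)(?=```)")

def extract_code(api_output: str) -> Optional[str]:
    # Try the language-tagged fence first, then fall back to any fence.
    match = PYTHON_FENCE.search(api_output) or ANY_FENCE.search(api_output)
    return match.group(1).strip() if match else None

reply = "Here you go:\n```python\nprint(sum(map(int, input().split())))\n```"
print(extract_code(reply))  # print(sum(map(int, input().split())))
```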
CF_CodeCritic.py
ADDED
@@ -0,0 +1,6 @@
+from flows.application_flows import OpenAIChatAtomicFlow
+
+
+class CF_CodeCritic(OpenAIChatAtomicFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
CF_CodeCritic.yaml
ADDED
@@ -0,0 +1,87 @@
+name: "CodeCritic_Flow"
+description: |2-
+  Given a problem description and a solution candidate, provide useful feedback concerning the correctness of the solution candidate.
+
+# ~~~ Input interface specification ~~~
+input_interface_non_initialized:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+  - "code"
+
+input_interface_initialized:
+  - "query"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "api_output"
+
+# ~~~ Flow specification ~~~
+model_name: "gpt-4"
+
+generation_parameters:
+  n: 1
+  max_tokens: 3000
+  temperature: 0.3
+
+  model_kwargs:
+    top_p: 0.2
+    frequency_penalty: 0
+    presence_penalty: 0
+
+system_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    Your goal is to identify potential issues with a competitive programming solution attempt.
+
+    The user will specify the problem by providing you with:
+    - the problem statement
+    - input description
+    - output description
+    - example test cases
+    - (optional) explanation of the test cases
+    - a Python solution attempt
+
+    Crucially, your goal is to correctly identify potential issues with the solution attempt, and not to provide the code implementation yourself.
+    The user will provide you with a task and an output format that you will strictly follow.
+  input_variables: []
+  template_format: jinja2
+
+human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: "{{query}}"
+  input_variables:
+    - "query"
+  template_format: jinja2
+
+init_human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    # Problem statement
+    {{problem_description}}
+
+    # Input description
+    {{input_description}}
+
+    # Output description
+    {{output_description}}
+
+    {{io_examples_and_explanation}}
+
+    # Python solution attempt:
+    ```python
+    {{code}}
+    ```
+
+
+    Consider the problem statement and the solution attempt. Are there any issues with the proposed solution, or is it correct? Explain your reasoning very concisely, and do not provide code.
+  input_variables:
+    - "problem_description"
+    - "input_description"
+    - "output_description"
+    - "io_examples_and_explanation"
+    - "code"
+  template_format: jinja2
CF_CodeCriticWrongAttempt.py
ADDED
@@ -0,0 +1,6 @@
+from flows.application_flows import OpenAIChatAtomicFlow
+
+
+class CF_CodeCriticWrongAttempt(OpenAIChatAtomicFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
CF_CodeCriticWrongAttempt.yaml
ADDED
@@ -0,0 +1,90 @@
+name: "CodeCriticWrongAttempt_Flow"
+description: |2-
+  Given a problem description and an incorrect solution candidate, provide useful feedback for correcting the mistakes in the solution.
+
+# ~~~ Input interface specification ~~~
+input_interface_non_initialized:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+  - "testing_results_summary"
+  - "code"
+
+input_interface_initialized:
+  - "query"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "api_output"
+
+# ~~~ Flow specification ~~~
+model_name: "gpt-4"
+
+generation_parameters:
+  n: 1
+  max_tokens: 3000
+  temperature: 0.3
+
+  model_kwargs:
+    top_p: 0.2
+    frequency_penalty: 0
+    presence_penalty: 0
+
+system_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    Your goal is to identify the issues with an incorrect competitive programming solution attempt.
+
+    The user will specify the problem by providing you with:
+    - the problem statement
+    - input description
+    - output description
+    - example test cases
+    - (optional) explanation of the test cases
+    - an incorrect Python solution attempt and a description of its issue
+
+    Crucially, your goal is to consider all aspects of the problem and pinpoint the issues with the solution attempt, and not to provide the code implementation yourself.
+    Some aspects to consider: Is the input correctly parsed? Is the output correctly formatted? Are the corner cases correctly handled? Is there a logical mistake with the algorithm itself?
+    Use the code execution results provided in the issue description to guide your reasoning/debugging.
+  input_variables: []
+  template_format: jinja2
+
+human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: "{{query}}"
+  input_variables:
+    - "query"
+  template_format: jinja2
+
+init_human_message_prompt_template:
+  _target_: langchain.PromptTemplate
+  template: |2-
+    # Problem statement
+    {{problem_description}}
+
+    # Input description
+    {{input_description}}
+
+    # Output description
+    {{output_description}}
+
+    {{io_examples_and_explanation}}
+
+    # Solution attempt to be fixed
+    ```python
+    {{code}}
+    ```
+
+    {{testing_results_summary}}
+
+
+    Consider the problem statement, the solution attempt and the issue. Why is the solution attempt incorrect? How should it be fixed? Explain your reasoning very concisely, and do not provide code.
+  input_variables:
+    - "problem_description"
+    - "input_description"
+    - "output_description"
+    - "io_examples_and_explanation"
+    - "code"
+    - "testing_results_summary"
+  template_format: jinja2
CF_CodeDebug.py
ADDED
@@ -0,0 +1,12 @@
+from flows.base_flows import GeneratorCriticFlow
+
+
+class CF_CodeDebug(GeneratorCriticFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _early_exit(self):
+        if self.flow_state.get("all_tests_passed", False):
+            return True
+
+        return super()._early_exit()
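The `_early_exit` override above is what stops the debug loop before `max_rounds` is exhausted: once the testing step writes `all_tests_passed` into the flow state, no further rounds run. A schematic sketch of how such a hook is typically consulted between rounds; this is an illustration under assumed semantics, not the actual `GeneratorCriticFlow` implementation:

```python
# Schematic illustration only -- not the GeneratorCriticFlow code -- of how
# an _early_exit-style check short-circuits a bounded generate/test loop.
def debug_loop(generate, test, flow_state, max_rounds=4):
    code = None
    for _ in range(max_rounds):
        code = generate()
        flow_state["all_tests_passed"] = test(code)
        if flow_state.get("all_tests_passed", False):  # the early-exit check
            return code  # stop as soon as every public test passes
    return code  # otherwise return the last attempt after max_rounds

solution = debug_loop(
    generate=lambda: "print(sum(map(int, input().split())))",
    test=lambda code: True,  # stand-in for running the public tests
    flow_state={},
)
```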
CF_CodeDebug.yaml
ADDED
@@ -0,0 +1,158 @@
+name: "CodeDebug_Flow"
+description: |2-
+  Given a problem description, generate code, test and refine it until all tests pass or a maximum number of rounds is reached.
+
+# ~~~ Input interface specification ~~~
+input_interface:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+  - "public_tests_individual_io"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "code"
+
+# ~~~ Flow specification ~~~
+max_rounds: 4
+
+### Subflows specification
+subflows_config:
+  CodeGenerator:
+    _target_: .CF_Code.instantiate_from_default_config
+    name: "CodeGenerator"
+    model_name: "gpt-4"
+    human_message_prompt_template:
+      template: |2-
+        {{testing_results_summary}}
+
+
+        Consider the problem statement, the last proposed solution, and its issue. Provide a corrected version of the code that solves the original problem and resolves the issue, without any explanation, in the following format:
+        ```python
+        {{code_placeholder}}
+        ```
+      input_variables:
+        - testing_results_summary
+      partial_variables:
+        code_placeholder: "{{python_code}}"
+    input_interface_initialized:
+      - "testing_results_summary"
+  CodeTestingCritic:
+    _target_: .CF_CodeTesting.instantiate_from_default_config
+
+### Topology specification (specifies how the sequence of messages will flow from one of the subflows to another)
+topology:
+  # ~~~ Code Generator ~~~
+  - goal: "Generate/refine a solution."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeGenerator
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+          regex: '(?<=```python)([\s\S]*?)(?=```)'
+          regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+          input_key: "api_output"
+          output_key: "code"
+          strip: True
+          assert_unique: True
+        - _target_: flows.data_transformations.PrintPreviousMessages
+      keys_to_select:
+        - "code"
+
+    ### Reset flag
+    reset: false
+
+  # ~~~ Code Testing Critic ~~~
+  - goal: "Test the code on the public tests and provide a results summary."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeTestingCritic
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: martinjosifoski.CC_flows.src.data_transformations.CorrectnessFlag
+          input_key: "public_tests_results"
+          output_key: "all_tests_passed"
+        - _target_: martinjosifoski.CC_flows.src.data_transformations.TestingResultsSummaryGeneration
+          output_key: "testing_results_summary"
+
+          single_test_error_message: True
+
+          no_error_template: |2-
+            ${.issue_title}
+            All of the executed tests passed.
+
+          compilation_error_template: |2-
+            ${.issue_title}
+            The execution resulted in a compilation error.
+            ## Compilation error message:
+            {{error_message}}
+          timeout_error_template: |2-
+            ${.issue_title}
+            The execution timed out, the solution is not efficient enough.
+          runtime_error_template: |2-
+            ${.issue_title}
+            The execution resulted in a runtime error on the following test.
+            ## [Failed test] Input
+            ```
+            {{test_input}}
+            ```
+            ## [Failed test] Runtime error message
+            {{error_message}}
+          single_test_error_template: |2-
+            ${.issue_title}
+            The Python code does not solve the problem in the problem description due to logical errors. It fails the following test:
+            ## [Failed test] Input
+            ```
+            {{test_input}}
+            ```
+            ## [Failed test] Expected output
+            ```
+            {{expected_output}}
+            ```
+            ## [Failed test] Generated output
+            ```
+            {{generated_output}}
+            ```
+          all_tests_header: |2-
+            ${.issue_title}
+            The Python code does not solve the problem in the problem description due to logical errors. It fails on the following tests.
+          test_error_template: |2-
+            ## [Failed test {{idx}}]
+            ### [Failed test {{idx}}] Input
+            ```
+            {{test_input}}
+            ```
+            ### [Failed test {{idx}}] Expected output
+            ```
+            {{expected_output}}
+            ```
+            ### [Failed test {{idx}}] Generated output
+            ```
+            {{generated_output}}
+            ```
+          tests_separator: "\n\n"
+
+          issue_title: "# Issue with the last proposed solution"
+
+    ### Reset flag
+    reset: true
CF_CodeDebugCollab.py
ADDED
@@ -0,0 +1,12 @@
+from flows.base_flows import GeneratorCriticFlow
+
+
+class CF_CodeDebugCollab(GeneratorCriticFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _early_exit(self):
+        if self.flow_state.get("all_tests_passed", False):
+            return True
+
+        return super()._early_exit()
CF_CodeDebugCollab.yaml
ADDED
@@ -0,0 +1,89 @@
+name: "CodeDebugCollab_Flow"
+description: |2-
+  Given a problem description, alternate between a step in which code is generated, and a step in which the produced code is evaluated and useful feedback is provided.
+
+# ~~~ Input interface specification ~~~
+input_interface:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+  - "public_tests_individual_io"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "code"
+
+# ~~~ Flow specification ~~~
+max_rounds: 4
+
+subflows_config:
+  CodeGenerator:
+    _target_: .CF_Code.instantiate_from_default_config
+    name: "CodeGenerator"
+    model_name: "gpt-4"
+    human_message_prompt_template:
+      _target_: langchain.PromptTemplate
+      template: |2-
+        {{testing_results_summary}}
+
+        {{code_feedback}}
+
+
+        Consider the problem statement, the last proposed solution, its issue and the provided feedback. Return a corrected version of the code that solves the original problem and resolves the issue, without any explanation, in the following format:
+        ```python
+        {{code_placeholder}}
+        ```
+      input_variables:
+        - code_feedback
+        - testing_results_summary
+      partial_variables:
+        code_placeholder: "{{python_code}}"
+    input_interface_initialized:
+      - "code_feedback"
+      - "testing_results_summary"
+  CodeDebugCritic:
+    _target_: .CF_CodeDebugCritic.instantiate_from_default_config
+
+topology:
+  # ~~~ Code Generator ~~~
+  - goal: "Generate/refine a solution."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeGenerator
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+          regex: '(?<=```python)([\s\S]*?)(?=```)'
+          regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+          input_key: "api_output"
+          output_key: "code"
+          strip: True
+          assert_unique: True
+      keys_to_select:
+        - "code"
+
+    reset: false
+
+  # ~~~ Code Critic Grounded in Tests ~~~
+  - goal: "Provide feedback for the candidate solution that is grounded in test results."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeDebugCritic
+
+    reset: true
CF_CodeDebugCritic.py
ADDED
@@ -0,0 +1,13 @@
+from flows.base_flows import SequentialFlow
+
+
+class CF_CodeDebugCritic(SequentialFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _early_exit(self):
+        if self.flow_state.get("all_tests_passed", False):
+            self.flow_state["code_feedback"] = None
+            return True
+
+        return super()._early_exit()
CF_CodeDebugCritic.yaml
ADDED
@@ -0,0 +1,131 @@
+name: "CodeDebugCritic_Flow"
+description: "Given a problem description and a candidate solution, test the code and provide useful feedback concerning the correctness of the solution and the potential mistakes."
+
+# ~~~ Input interface specification ~~~
+input_interface:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+  - "public_tests_individual_io"
+  - "code"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "testing_results_summary"
+  - "all_tests_passed"
+  - "code_feedback"
+
+# ~~~ Flow specification ~~~
+public_tests_key: "public_tests_individual_io"
+
+subflows_config:
+  CodeTestingCritic:
+    _target_: .CF_CodeTesting.instantiate_from_default_config
+  CodeCriticWrongAttempt:
+    _target_: .CF_CodeCriticWrongAttempt.instantiate_from_default_config
+
+topology:
+  # ~~~ Code Testing Critic ~~~
+  - goal: "Test the code on the public tests and provide a results summary."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeTestingCritic
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: martinjosifoski.CC_flows.src.data_transformations.CorrectnessFlag
+          input_key: "public_tests_results"
+          output_key: "all_tests_passed"
+        - _target_: martinjosifoski.CC_flows.src.data_transformations.TestingResultsSummaryGeneration
+          output_key: "testing_results_summary"
+
+          single_test_error_message: True
+
+          no_error_template: |2-
+            ${.issue_title}
+            All of the executed tests passed.
+
+          compilation_error_template: |2-
+            ${.issue_title}
+            The execution resulted in a compilation error.
+            ## Compilation error message:
+            {{error_message}}
+          timeout_error_template: |2-
+            ${.issue_title}
+            The execution timed out, the solution is not efficient enough.
+          runtime_error_template: |2-
+            ${.issue_title}
+            The execution resulted in a runtime error on the following test.
+            ## [Failed test] Input
+            ```
+            {{test_input}}
+            ```
+            ## [Failed test] Runtime error message
+            {{error_message}}
+          single_test_error_template: |2-
+            ${.issue_title}
+            The Python code does not solve the problem in the problem description due to logical errors. It fails the following test:
+            ## [Failed test] Input
+            ```
+            {{test_input}}
+            ```
+            ## [Failed test] Expected output
+            ```
+            {{expected_output}}
+            ```
+            ## [Failed test] Generated output
+            ```
+            {{generated_output}}
+            ```
+          all_tests_header: |2-
+            ${.issue_title}
+            The Python code does not solve the problem in the problem description due to logical errors. It fails on the following tests.
+          test_error_template: |2-
+            ## [Failed test {{idx}}]
+            ### [Failed test {{idx}}] Input
+            ```
+            {{test_input}}
+            ```
+            ### [Failed test {{idx}}] Expected output
+            ```
+            {{expected_output}}
+            ```
+            ### [Failed test {{idx}}] Generated output
+            ```
+            {{generated_output}}
+            ```
+          tests_separator: "\n\n"
+
+          issue_title: "# Issue with the last proposed solution"
+
+  # ~~~ Feedback Generator ~~~
+  - goal: "Generate feedback grounded in the test results summary."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+    ### Flow Specification
+    flow: CodeCriticWrongAttempt
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyRename
+          old_key2new_key:
+            api_output: "code_feedback"
+
+    reset: true
CF_CodeReflect.py
ADDED
@@ -0,0 +1,6 @@
+from flows.base_flows import GeneratorCriticFlow
+
+
+class CF_CodeReflect(GeneratorCriticFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
CF_CodeReflect.yaml
ADDED
@@ -0,0 +1,85 @@
+name: "CodeReflect_Flow"
+description: "Given a problem description, generate code, then reflect on it and improve it until a message suggests that the code is correct or a maximum number of rounds is reached."
+
+# ~~~ Input interface specification ~~~
+input_interface:
+  - "problem_description"
+  - "input_description"
+  - "output_description"
+  - "io_examples_and_explanation"
+
+# ~~~ Output interface specification ~~~
+output_interface:
+  - "code"
+
+# ~~~ Flow specification ~~~
+max_rounds: 4
+
+### Subflows specification
+subflows_config:
+  CodeGenerator:
+    _target_: .CF_Code.instantiate_from_default_config
+  CodeReflectCritic:
+    _target_: .FixedReply_CodeReflect.instantiate_from_default_config
+
+### Topology specification (specifies how the sequence of messages will flow from one of the subflows to another)
+topology:
+  # ~~~ Code Generator ~~~
+  - goal: "Generate/refine a solution."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+      keys_to_rename:
+        code_reflect_message: "query"
+
+    ### Flow Specification
+    flow: CodeGenerator
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.RegexFirstOccurrenceExtractor
+          regex: '(?<=```python)([\s\S]*?)(?=```)'
+          regex_fallback: '(?<=```)([\s\S]*?)(?=```)'
+          input_key: "api_output"
+          output_key: "code"
+          strip: True
+          assert_unique: True
+        - _target_: flows.data_transformations.EndOfInteraction
+          end_of_interaction_string: "Final answer"
+          input_key: "api_output"
+          output_key: "end_of_interaction"
+        - _target_: flows.data_transformations.PrintPreviousMessages
+      keys_to_select:
+        - "code"
+        - "end_of_interaction"
+
+    ### Reset flag
+    reset: false
+
+  - goal: "Generate a message that encourages reflection."
+
+    ### Input Interface
+    input_interface:
+      _target_: flows.interfaces.KeyInterface
+      additional_transformations:
+        - _target_: flows.data_transformations.KeyMatchInput
+
+
+    ### Flow Specification
+    flow: CodeReflectCritic
+
+    ### Output Interface
+    output_interface:
+      _target_: flows.interfaces.KeyInterface
+      keys_to_rename:
+        fixed_reply: "code_reflect_message"
+
+    ### Reset flag
+    reset: true
+
+early_exit_key: "end_of_interaction"
CF_CodeTesting.py
ADDED
@@ -0,0 +1,52 @@
+from typing import Any, Dict
+
+from flows import logging
+from flows.utils.general_helpers import validate_parameters
+from .src.evaluation import testing_utils_codeforces
+from .CodeTesting import CodeTesting
+
+log = logging.get_logger(__name__)
+
+
+class CF_CodeTesting(CodeTesting):
+    REQUIRED_KEYS_CONFIG = []
+    REQUIRED_KEYS_CONSTRUCTOR = []
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @classmethod
+    def _validate_parameters(cls, kwargs):
+        validate_parameters(cls, kwargs)
+
+        if "public_tests_key" not in kwargs["flow_config"] and "hidden_tests_key" not in kwargs["flow_config"]:
+            raise ValueError("At least one of 'public_tests_key' "
+                             "and 'hidden_tests_key' must be specified in the config.")
+
+    def _get_test_data(self, input_data: Dict):
+        """This function retrieves (or generates) input-output pairs that will be used to test the implementation."""
+        test_data = {"public_tests_io": None, "hidden_tests_io": None}
+
+        if "public_tests_key" in self.flow_config:
+            test_data["public_tests_io"] = input_data[self.flow_config["public_tests_key"]]
+
+        if "hidden_tests_key" in self.flow_config:
+            test_data["hidden_tests_io"] = input_data[self.flow_config["hidden_tests_key"]]
+
+        return test_data
+
+    def _run_tests(self, input_data: Dict, test_data: Dict) -> Dict[str, Any]:
+        testing_results = testing_utils_codeforces.evaluate_solution_for_problem(
+            candidate_solution=input_data["code"],
+            **test_data
+        )
+
+        if "public_tests_results" in testing_results:
+            for test_output in testing_results["public_tests_results"]:
+                test_output["input"] = "\n".join(test_output["input"])
+
+        if "hidden_tests_results" in testing_results:
+            for test_output in testing_results["hidden_tests_results"]:
+                test_output["input"] = "\n".join(test_output["input"])
+
+        return testing_results
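Note the post-processing in `_run_tests`: Codeforces stores each test input as a list of stdin lines (see `src/datasets/schema.py` below), and the join collapses it into the single string that the summary templates expect. A tiny illustration with an invented sample record:

```python
# Illustration of the join above: a per-test input stored as stdin lines
# becomes the single string used in the testing-results summaries.
test_output = {"input": ["3", "1 2 3"], "expected_output": "6"}
test_output["input"] = "\n".join(test_output["input"])
print(repr(test_output["input"]))  # '3\n1 2 3'
```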
CF_CodeTesting.yaml
ADDED
@@ -0,0 +1,12 @@
+name: "CF_CodeTesting"
+description: "Given code and tests specified by input-output pairs, executes the code on each input, compares the result with the expected output, and returns an informative message."
+
+input_interface:
+  - "code"
+  - "public_tests_individual_io"
+
+output_interface:
+  - "all_tests_passed"
+  - "testing_results_summary"
+
+public_tests_key: "public_tests_individual_io"
CodeTesting.py
ADDED
@@ -0,0 +1,30 @@
+from typing import Any, Dict
+
+from flows import logging
+from flows.base_flows import AtomicFlow
+
+log = logging.get_logger(__name__)
+
+
+class CodeTesting(AtomicFlow):
+    REQUIRED_KEYS_CONFIG = []
+    REQUIRED_KEYS_CONSTRUCTOR = []
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def _get_test_data(self, input_data: Dict):
+        """This function retrieves (or generates) input-output pairs that will be used to test the implementation."""
+        raise NotImplementedError()
+
+    def _run_tests(self, input_data: Dict, test_data: Dict) -> Dict[str, Any]:
+        raise NotImplementedError()
+
+    def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+        # ~~~ Retrieve the test data ~~~
+        test_data = self._get_test_data(input_data)
+
+        # ~~~ Run tests ~~~
+        response: Dict[str, Any] = self._run_tests(input_data, test_data)
+
+        return response
FixedReply_CodeReflect.py
ADDED
@@ -0,0 +1,6 @@
+from flows.base_flows import FixedReplyFlow
+
+
+class FixedReply_CodeReflect(FixedReplyFlow):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
FixedReply_CodeReflect.yaml
ADDED
@@ -0,0 +1,18 @@
+name: "CodeReflectCritic"
+description: "A flow that prompts the caller to reflect on the generated code and provide a corrected version if necessary."
+
+input_interface: []
+output_interface:
+  - fixed_reply
+
+fixed_reply: |2-
+  Consider the problem statement and the last proposed solution. Are you sure that the solution is provided in the requested format, and crucially, solves the problem?
+  If that is not the case, provide the corrected version of the code in the following format:
+  ```python
+  {{python_code}}
+  ```
+  otherwise, reply:
+  "Final answer."
ADDED
@@ -0,0 +1,20 @@
|
+# ~~~ Codeforces ~~~
+# code
+from .CF_Code import CF_Code
+
+# code_reflect
+from .FixedReply_CodeReflect import FixedReply_CodeReflect
+from .CF_CodeReflect import CF_CodeReflect
+
+# code_collab
+from .CF_CodeCritic import CF_CodeCritic
+from .CF_CodeCollab import CF_CodeCollab
+
+# code_debug
+from .CF_CodeTesting import CF_CodeTesting
+from .CF_CodeDebug import CF_CodeDebug
+
+# cf-code_debug_collab
+from .CF_CodeCriticWrongAttempt import CF_CodeCriticWrongAttempt
+from .CF_CodeDebugCritic import CF_CodeDebugCritic
+from .CF_CodeDebugCollab import CF_CodeDebugCollab
src/__init__.py
ADDED
File without changes
src/data_transformations/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .correctness_flag import CorrectnessFlag
+from .testing_results_summary_generation import TestingResultsSummaryGeneration
src/data_transformations/correctness_flag.py
ADDED
@@ -0,0 +1,14 @@
+from typing import Dict, Any
+
+from flows.data_transformations.abstract import DataTransformation
+
+
+class CorrectnessFlag(DataTransformation):
+    def __init__(self, output_key, input_key):
+        super().__init__(output_key)
+        self.input_key = input_key
+
+    def __call__(self, data_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        all_tests_passed = all([test_result["status"] for test_result in data_dict[self.input_key]])
+        data_dict[self.output_key] = all_tests_passed
+        return data_dict
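A usage sketch for `CorrectnessFlag`: it reduces a list of per-test results to a single boolean under the configured output key. The sample `data_dict` is invented for illustration:

```python
# Usage sketch; the sample data_dict is invented for illustration.
flag = CorrectnessFlag(output_key="all_tests_passed", input_key="public_tests_results")
data = {"public_tests_results": [{"status": True}, {"status": False}]}
print(flag(data)["all_tests_passed"])  # False: one public test failed
```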
src/data_transformations/testing_results_summary_generation.py
ADDED
@@ -0,0 +1,107 @@
+from typing import Dict, Any
+
+import jinja2
+
+from flows.data_transformations.abstract import DataTransformation
+from flows.utils.general_helpers import unflatten_dict
+
+
+class TestingResultsSummaryGeneration(DataTransformation):
+    def __init__(self, output_key, **kwargs):
+        super().__init__(output_key)
+        self.params = kwargs
+        if "test_results_key" not in self.params:
+            self.params["test_results_key"] = "public_tests_results"
+        if "tests_passed_key" not in self.params:
+            self.params["tests_passed_key"] = "all_tests_passed"
+
+    def __call__(self, data_dict: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        if data_dict[self.params["tests_passed_key"]]:
+            # the execution did not result in any errors
+            data_dict[self.output_key] = self.params["no_error_template"]
+            return data_dict
+
+        test_data = unflatten_dict(data_dict)
+
+        if not test_data["compilation_status"]:
+            # compilation error occurred
+            kwargs = {
+                "error_message": test_data["compilation_error_message"].strip(),
+            }
+
+            message_content = (
+                jinja2.Environment(loader=jinja2.BaseLoader())
+                .from_string(self.params["compilation_error_template"])
+                .render(**kwargs)
+            )
+        elif test_data["timeout_error"]:
+            # timeout error occurred
+
+            message_content = self.params["timeout_error_template"]
+        else:
+            # code compiled successfully without timeouts
+
+            # retrieve the failed tests
+            failed_tests = [
+                test_result
+                for test_result in test_data[self.params["test_results_key"]]
+                if not test_result["status"]
+            ]
+
+            runtime_error_test = None
+            for test_result in failed_tests:
+                if test_result["generated_output"] is None:
+                    # runtime error occurred
+                    runtime_error_test = test_result
+
+            if runtime_error_test:
+                # construct the error message for the runtime error
+                kwargs = {
+                    "test_input": runtime_error_test["input"],
+                    "error_message": runtime_error_test["error_message"].strip(),
+                }
+
+                message_content = (
+                    jinja2.Environment(loader=jinja2.BaseLoader())
+                    .from_string(self.params["runtime_error_template"])
+                    .render(**kwargs)
+                )
+            else:
+                # construct the error message corresponding to a logical error
+
+                if self.params["single_test_error_message"]:
+                    # construct the error message for a single (the first) failed test
+                    first_failed_test = failed_tests[0]
+
+                    kwargs = {
+                        "test_input": first_failed_test["input"],
+                        "expected_output": first_failed_test["expected_output"],
+                        "generated_output": first_failed_test["generated_output"],
+                    }
+
+                    message_content = (
+                        jinja2.Environment(loader=jinja2.BaseLoader())
+                        .from_string(self.params["single_test_error_template"])
+                        .render(**kwargs)
+                    )
+                else:
+                    # construct the error message covering all failed tests
+                    parts = [self.params["all_tests_header"]]
+
+                    for idx, test_result in enumerate(failed_tests):
+                        kwargs = {
+                            "idx": idx + 1,
+                            "test_input": test_result["input"],
+                            "expected_output": test_result["expected_output"],
+                            "generated_output": test_result["generated_output"],
+                        }
+
+                        parts.append(
+                            jinja2.Environment(loader=jinja2.BaseLoader())
+                            .from_string(self.params["test_error_template"])
+                            .render(**kwargs)
+                        )
+
+                    message_content = self.params["tests_separator"].join(parts)
+        data_dict[self.output_key] = message_content
+        return data_dict
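A usage sketch of `TestingResultsSummaryGeneration` on the logical-error path, with a pared-down template invented for illustration (the real templates come from the YAML configs above); it also assumes flat keys pass through `unflatten_dict` unchanged:

```python
# Usage sketch; the template and sample data are invented for illustration,
# and flat keys are assumed to pass through unflatten_dict unchanged.
summary_gen = TestingResultsSummaryGeneration(
    output_key="testing_results_summary",
    single_test_error_message=True,
    no_error_template="All of the executed tests passed.",
    single_test_error_template=(
        "Failed on input {{test_input}}: expected {{expected_output}}, got {{generated_output}}"
    ),
)
data = {
    "all_tests_passed": False,
    "compilation_status": True,
    "timeout_error": False,
    "public_tests_results": [
        {"status": False, "input": "3\n1 2 3", "expected_output": "6", "generated_output": "5"},
    ],
}
print(summary_gen(data)["testing_results_summary"])
# Failed on input 3
# 1 2 3: expected 6, got 5
```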
src/datasets/__init__.py
ADDED
File without changes
src/datasets/schema.py
ADDED
@@ -0,0 +1,74 @@
+from typing import List, Tuple, Dict
+
+
+def assert_test_format_codeforces(tests: List[Tuple[List[str], str]]):
+    assert isinstance(tests, list) or tests is None
+    if tests is None:
+        return
+    for test in tests:
+        assert isinstance(test, list)
+        assert len(test) == 2
+        inputs, outputs = test
+        assert isinstance(inputs, list)
+        assert isinstance(outputs, str)
+        for input in inputs:
+            assert isinstance(input, str)
+
+
+def assert_entry_format_codeforces(obj: Dict):
+    # each data point must follow the same schema
+    assert isinstance(obj["id"], str)  # contest + problem_name = id, will not change when formatting changes
+    assert isinstance(obj["id_hash"], str)  # hashsum of all entries, any change to obj will change this
+    assert isinstance(obj["contest"], int)
+    assert isinstance(obj["problem_name"], str)
+    assert isinstance(obj["problem_url"], str)
+    assert isinstance(obj["solution_url"], str)
+
+    assert isinstance(obj["header"], str)
+    assert isinstance(obj["problem_description"], str)
+    assert isinstance(obj["input_description"], str)
+    assert isinstance(obj["output_description"], str)
+    assert isinstance(obj["note"], str) or obj["note"] is None
+
+    assert isinstance(obj["difficulty"], int)
+    assert isinstance(obj["tags"], list)
+    assert isinstance(obj["working_solution"], str)  # can be empty
+
+    assert_test_format_codeforces(obj["public_tests_io"])
+    assert_test_format_codeforces(obj["public_tests_individual_io"])
+    assert_test_format_codeforces(obj["hidden_tests_io"])
+
+
+def assert_test_format_leetcode(tests: List[Tuple[List[str], str]]):
+    pass
+    # ToDo: Uncomment after the test format is updated
+    # assert isinstance(tests, list)
+    # for test in tests:
+    #     assert isinstance(test, tuple)
+    #     assert len(test) == 2
+    #     x, y = test
+    #     assert isinstance(x, str)
+    #     assert isinstance(y, str)
+
+
+def assert_entry_format_leetcode(obj: Dict):
+    # each data point must follow the same schema
+    assert isinstance(obj["id"], str)  # contest + problem_name = id, will not change when formatting changes
+    assert isinstance(obj["id_hash"], str)  # hashsum of all entries, any change to obj will change this
+    assert isinstance(obj["index"], int)
+    assert isinstance(obj["problem_name"], str)
+    assert isinstance(obj["problem_url"], str)
+
+    assert isinstance(obj["problem_description"], str)
+    assert isinstance(obj["constraints"], str)
+    assert isinstance(obj["python_stub"], str)
+    assert isinstance(obj["difficulty"], str) and obj["difficulty"] in {"easy", "medium", "hard"}
+
+    # ToDo: Should be added
+    # assert isinstance(obj['tags'], list)
+    # assert isinstance(obj['solution_url'], str)
+    # assert isinstance(obj['working_solution'], str)  # can be empty
+
+    # ToDo: Uncomment after the test format is updated
+    # assert_test_format_leetcode(obj['public_tests_io'])
+    # assert_test_format_leetcode(obj['hidden_tests_io'])
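For reference, the test shape that `assert_test_format_codeforces` accepts (read directly off the asserts above) is a list of `[input_lines, expected_output]` pairs; the concrete values below are invented for illustration:

```python
# Example of the accepted shape: a list of [input_lines, expected_output] pairs.
tests = [
    [["3", "1 2 3"], "6"],   # stdin lines as a list of str, stdout as one str
    [["2", "10 20"], "30"],
]
assert_test_format_codeforces(tests)  # passes silently
assert_test_format_codeforces(None)   # None is explicitly allowed
```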
src/evaluation/__init__.py
ADDED
File without changes
src/evaluation/testing_utils_codeforces.py
ADDED
@@ -0,0 +1,449 @@
# This is based heavily on the huggingface APPS metric
import re

# to run the solution files we're using a timing based approach
import signal
import sys

# for capturing the stdout
from io import StringIO
from typing import List, Tuple

# used for testing the code that reads from input
from unittest.mock import patch, mock_open

import numpy as np
from pyext import RuntimeModule
from wrapt_timeout_decorator import timeout as wrapt_timeout
import threading

from ..datasets.schema import assert_test_format_codeforces

from flows import logging

log = logging.get_logger(__name__)
lock = threading.Lock()


def evaluate_solution_for_problem(
    candidate_solution,
    hidden_tests_io=None,
    public_tests_io=None,
    timeout=10,
    debug=False,
    add_extra_imports=False,
    allow_truncated_io=False,
):
    """See the readme for the output format of this function."""
    if hidden_tests_io is None:
        hidden_tests_io = []
    if public_tests_io is None:
        public_tests_io = []

    if candidate_solution is None:
        results_dict = {
            "compilation_status": False,
            "compilation_error_message": "No code was provided.",
            "timeout_error": False,
            "hidden_tests_results": [
                {
                    "status": False,
                    "error_message": "No code was provided.",
                    "generated_output": None,
                    "input": test[0],
                    "expected_output": test[1],
                }
                for test in hidden_tests_io
            ],
            "public_tests_results": [
                {
                    "status": False,
                    "error_message": "No code was provided.",
                    "generated_output": None,
                    "input": test[0],
                    "expected_output": test[1],
                }
                for test in public_tests_io
            ],
        }
        return results_dict

    @wrapt_timeout(timeout, use_signals=False)
    def run_tests():
        hidden_tests_results = check_correctness(
            candidate_solution, hidden_tests_io, timeout, debug, add_extra_imports, allow_truncated_io
        )
        public_tests_results = check_correctness(
            candidate_solution, public_tests_io, timeout, debug, add_extra_imports, allow_truncated_io
        )

        return hidden_tests_results, public_tests_results

    lock.acquire()
    try:
        hidden_tests_results, public_tests_results = run_tests()
        timeout_error_occurred = False
    except BaseException as e:
        log.info(e)
        hidden_tests_results = {}
        public_tests_results = {}

        # a timeout is not treated as a compilation failure
        hidden_tests_results["compilation_status"] = True
        public_tests_results["compilation_status"] = True
        timeout_error_occurred = True
        hidden_tests_results["error_message"] = "Timeout error."

        hidden_tests_results["results"] = [
            {
                "status": False,
                "error_message": hidden_tests_results["error_message"],
                "generated_output": None,
                "input": test[0],
                "expected_output": test[1],
            }
            for test in hidden_tests_io
        ]
        public_tests_results["results"] = [
            {
                "status": False,
                "error_message": hidden_tests_results["error_message"],
                "generated_output": None,
                "input": test[0],
                "expected_output": test[1],
            }
            for test in public_tests_io
        ]
    finally:
        lock.release()

    # the compilation status shouldn't depend on the tests
    assert hidden_tests_results["compilation_status"] == public_tests_results["compilation_status"]

    results_dict = {
        "compilation_status": hidden_tests_results["compilation_status"],
        "compilation_error_message": hidden_tests_results["error_message"],
        "timeout_error": timeout_error_occurred,
        "hidden_tests_results": hidden_tests_results["results"],
        "public_tests_results": public_tests_results["results"],
    }

    return results_dict

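
# For orientation, the returned results_dict has this shape (values illustrative):
# {
#     "compilation_status": True,
#     "compilation_error_message": None,
#     "timeout_error": False,
#     "hidden_tests_results": [...],
#     "public_tests_results": [
#         {"status": True, "error_message": None, "generated_output": "6",
#          "input": ["3", "1 2 3"], "expected_output": "6"},
#     ],
# }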

def check_correctness(
    candidate_solution: str,
    tests: List[Tuple[List[str], str]],
    timeout: int = 6000,
    debug=True,
    add_extra_imports=False,
    allow_truncated_io=True,
):
    """
    Wraps the testing code in a global timeout; based on the huggingface code.
    """

    assert_test_format_codeforces(tests)
    inputs, outputs = [], []
    if len(tests) > 0:
        inputs, outputs = zip(*tests)

    compilation_error, results = run_test(
        candidate_solution, inputs, outputs, timeout, debug, add_extra_imports, allow_truncated_io
    )

    assert len(results) == len(inputs)

    for result in results:
        assert isinstance(result["generated_output"], str) or result["generated_output"] is None
        assert isinstance(result["status"], bool)
        assert isinstance(result["error_message"], str) or result["error_message"] is None
        assert isinstance(result["input"], list)
        assert isinstance(result["expected_output"], str)

    compilation_status = compilation_error == ""
    if compilation_status:
        compilation_error = None

    return {"compilation_status": compilation_status, "error_message": compilation_error, "results": results}


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    log.info("Alarm went off")
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)


# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative: use redirect_stdout() from contextlib
class Capturing(list):
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the StringIO a no-op (the tested code may call sys.stdout.close())
        self._stringio.close = lambda *args: None
        return self

    def __exit__(self, *args):
        self.extend(self._stringio.getvalue().splitlines())
        del self._stringio  # free up some memory
        sys.stdout = self._stdout

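
# Illustrative use of Capturing: collects everything printed inside the block as a list of lines
# with Capturing() as output_lines:
#     print("hello")
# output_lines == ["hello"]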

def run_test(code, inputs, outputs, timeout: int = 10, debug=True, add_extra_imports=False, allow_truncated_io=True):
    """
    Runs the code and tries to match inputs and outputs.

    The scraped testcases may be incomplete: if allow_truncated_io==True,
    an EOF exception at the end of the generated output is ignored.
    """
    # Disable functionalities that can make destructive changes to the test.

    results = []

    if isinstance(code, list):
        tmp_test = code
    elif isinstance(code, str):
        tmp_test = code.split("\n")
    else:
        raise AssertionError("code must be provided as a list of lines or a string with \\n linebreaks.")

    # parse the code into code and imports
    import_lines = []
    future_import_lines = []
    code_lines = []
    for x in tmp_test:
        if (not x.startswith("from ")) and (not x.startswith("import ")):
            code_lines.append("\t" + x + "\n")
        else:
            if "__future__" in x:
                future_import_lines.append(x + "\n")
            else:
                import_lines.append(x + "\n")

    # assemble a new solution snippet which wraps the generated solution in a function code()
    new_test = "stdin = sys.stdin\nstdout = sys.stdout\n"
    new_test += '__name__="__main__"\n'
    new_test += "def code():\n"
    new_test += "\tstdin = sys.stdin\n\tstdout = sys.stdout\n"

    for line in code_lines:
        new_test += line

    sol = "\n".join(future_import_lines)
    sol += "import sys\n"
    if add_extra_imports:
        sol += (
            "import time\n"
            "import itertools\n"
            "from itertools import accumulate, product, permutations, combinations\n"
            "import collections\n"
            "from collections import Counter, OrderedDict, deque, defaultdict, ChainMap\n"
            "from functools import lru_cache\n"
            "import math\n"
            "from math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\n"
            "import fractions\n"
            "from typing import List, Tuple\n"
            "import numpy as np\n"
            "import random\n"
            "import heapq\n"
            "from heapq import *\n"
        )
    sol += "\n".join(import_lines) + "\n" + new_test

    if debug:
        log.info(f"sol = {sol}")
    method_name = "code"
    signal.alarm(timeout)

    # convert the solution snippet into a pyext runtime module
    sol_module = None
    try:
        sol_module = RuntimeModule.from_string("tmp_sol", "", sol)
        signal.alarm(0)
    except Exception as e:
        signal.alarm(0)
        if debug:
            log.info(f"type 1 compilation error = {e}")
        for inp, out in zip(inputs, outputs):
            # consider all inputs failed
            results.append(
                {
                    "status": False,
                    "input": inp,
                    "expected_output": out,
                    "generated_output": None,
                    "error_message": repr(e),
                }
            )
        return repr(e), results

    assert sol_module is not None
    signal.alarm(0)

    try:
        method = getattr(sol_module, method_name)  # getattr's second arg must be str
    except BaseException:
        signal.alarm(0)
        e = sys.exc_info()
        log.info(f"unable to get function error = {e}")

        for inp, out in zip(inputs, outputs):
            # consider all inputs failed
            results.append(
                {
                    "status": False,
                    "input": inp,
                    "expected_output": out,
                    "generated_output": None,
                    "error_message": repr(e),
                }
            )
        return repr(e), results

    # go through all tests, call our runtime module with the inputs
    # then compare with the reference output
    for index, (test_input, reference_output) in enumerate(zip(inputs, outputs)):

        result_object = {
            "input": test_input,
            "expected_output": reference_output,
        }

        # if the last token of the input is truncated and marked with "..." we delete it
        input_truncated = False
        if "".join(test_input).strip().endswith("...") and allow_truncated_io:
            test_input = test_input[:-1]
            input_truncated = True

        # sometimes the last input token is ""
        # if len(test_input) > 0:
        #     if test_input[-1] == "":
        #         test_input = test_input[:-1]

        error_code = None
        with Capturing() as generated_output:
            try:
                call_method(method, test_input)
                # reset the alarm
                signal.alarm(0)
            except Exception as e:
                # runtime error or took too long
                signal.alarm(0)
                error_code = e
                if debug:
                    log.info(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}")
            signal.alarm(0)

        # in some cases we run into truncated tests
        # in such cases we expect the error code to be None, EOFError or ValueError
        if (
            (input_truncated or reference_output.strip().endswith("..."))
            and allow_truncated_io
            and (error_code is None or isinstance(error_code, (EOFError, ValueError)))
        ):
            generated_output = generated_output[:-1]
            reference_output = reference_output.rstrip(".")  # strip the trailing "..." marker
            if len(generated_output) == 0:
                # no output left, we pass by default
                result_object.update(
                    **{
                        "status": True,
                        "generated_output": "\n".join(generated_output),
                        "error_message": None,
                    }
                )
                results.append(result_object)
            else:
                result_object.update(
                    **{
                        "status": string_compare(generated_output, reference_output, True),
                        "generated_output": "\n".join(generated_output),
                        "error_message": None,
                    }
                )
                results.append(result_object)

        # if the input and output are not truncated, we don't allow any errors
        elif error_code is not None:
            result_object.update(**{"status": False, "generated_output": None, "error_message": repr(error_code)})
            results.append(result_object)
        # finally, if there are no errors, we expect the output to match the reference output
        else:
            # the execution went well, let's compare the outputs
            result_object.update(
                **{
                    "status": string_compare(generated_output, reference_output, False),
                    "generated_output": "\n".join(generated_output),
                    "error_message": None,
                }
            )
            results.append(result_object)

    return "", results


def string_compare(candidate, correct, truncate_output=False, floating_point_accuracy=0.01):
    candidate = [o.strip().lower() for o in candidate]
    correct = correct.strip().lower()

    # normalize whitespace
    candidate = "\n".join(candidate)
    candidate = re.sub(r"\s+", " ", candidate).strip()
    correct = re.sub(r"\s+", " ", correct).strip()

    # split into individual tokens
    candidate = candidate.split(" ")
    correct = correct.split(" ")

    # some tests may be truncated; if we allow this, we don't enforce equal length of inputs/outputs
    if not truncate_output:
        if not len(candidate) == len(correct):
            return False

    # if we allow truncated io, the last token of the output may have been corrupted
    if truncate_output:
        correct = correct[:-1]

    # zip over lists of unequal length yields as many pairs as there are items in the shorter list
    for left, right in zip(candidate, correct):
        if left == right:
            continue

        try:
            int_left = int(left)
            int_right = int(right)
            if int_left == int_right:
                continue
        except ValueError:
            pass

        try:
            float_left = float(left)
            float_right = float(right)
            if np.abs(float_left - float_right) < floating_point_accuracy:
                continue
        except ValueError:
            pass

        return False

    return True

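
# Illustrative behaviour of string_compare (tokens are compared as strings, then ints, then floats):
#   string_compare(["1.0 2"], "1 2.005")  -> True   (numeric tokens within floating_point_accuracy=0.01)
#   string_compare(["1 2 3"], "1 2")      -> False  (token-count mismatch when truncate_output=False)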

def call_method(method, inputs):
    if isinstance(inputs, list):
        inputs = "\n".join(inputs)

    inputs_line_iterator = iter(inputs.split("\n"))

    # sys.setrecursionlimit(10000)

    # @patch('builtins.input', side_effect=inputs.split("\n"))
    @patch("builtins.open", mock_open(read_data=inputs))
    @patch("sys.stdin", StringIO(inputs))
    @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
    @patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
    @patch("sys.stdin.read", lambda *args: inputs)
    # @patch('sys.stdout.write', print)
    def _inner_call_method(_method):
        try:
            return _method()
        except SystemExit:
            pass
        finally:
            pass

    return _inner_call_method(method)
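
A minimal usage sketch of this evaluator (assumes the repository root is on the import path and that pyext, wrapt-timeout-decorator, numpy, and flows are installed; the toy problem is illustrative):

from src.evaluation.testing_utils_codeforces import evaluate_solution_for_problem

# toy task: read n, then n integers, and print their sum
solution = "n = int(input())\nxs = list(map(int, input().split()))\nprint(sum(xs))"
# each test is an (input_lines, expected_output) pair, matching assert_test_format_codeforces
public_tests = [(["3", "1 2 3"], "6")]

report = evaluate_solution_for_problem(solution, public_tests_io=public_tests)
print(report["compilation_status"], report["public_tests_results"][0]["status"])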
src/evaluation/testing_utils_leetcode.py
ADDED
@@ -0,0 +1,258 @@
# This is based heavily on the huggingface APPS metric
# to run the solution files we're using a timing based approach
# for capturing the stdout
# used for testing the code that reads from input
import logging
import re
from subprocess import Popen, PIPE, TimeoutExpired
from typing import List, Tuple
import threading

log = logging.getLogger(__name__)
lock = threading.Lock()


def evaluate_solution_for_problem(
    candidate_solution,
    python_stub,
    hidden_tests_io=None,
    public_tests_io=None,
    timeout=10,
    debug=False,
    add_extra_imports=False,
):
    """See the readme for the output format of this function."""
    with lock:
        if hidden_tests_io is None:
            hidden_tests_io = []
        if public_tests_io is None:
            public_tests_io = []

        if candidate_solution is None:
            results_dict = {
                "compilation_status": False,
                "compilation_error_message": "No code was provided.",
                "timeout_error": False,
                "hidden_tests_results": [
                    {
                        "status": False,
                        "error_message": "No code was provided.",
                        "generated_output": None,
                        "input": test[0],
                        "expected_output": test[1],
                    }
                    for test in hidden_tests_io
                ],
                "public_tests_results": [
                    {
                        "status": False,
                        "error_message": "No code was provided.",
                        "generated_output": None,
                        "input": test[0],
                        "expected_output": test[1],
                    }
                    for test in public_tests_io
                ],
            }
            return results_dict

        hidden_tests_results = check_correctness(
            candidate_solution, python_stub, hidden_tests_io, timeout, debug, add_extra_imports
        )
        public_tests_results = check_correctness(
            candidate_solution, python_stub, public_tests_io, timeout, debug, add_extra_imports
        )

        # the compilation status shouldn't depend on the tests
        if len(hidden_tests_io) > 0 and len(public_tests_io) > 0:
            assert hidden_tests_results["compilation_status"] == public_tests_results["compilation_status"]

        compilation_status = True
        error_message = None
        timeout_error = False

        if len(hidden_tests_io) > 0:
            compilation_status = compilation_status and hidden_tests_results["compilation_status"]
            error_message = hidden_tests_results["error_message"]
            timeout_error = timeout_error or hidden_tests_results["timeout_error"]

        if len(public_tests_io) > 0:
            compilation_status = compilation_status and public_tests_results["compilation_status"]
            error_message = public_tests_results["error_message"]
            timeout_error = timeout_error or public_tests_results["timeout_error"]

        results_dict = {
            "compilation_status": compilation_status,
            "compilation_error_message": error_message,
            "timeout_error": timeout_error,
            "hidden_tests_results": hidden_tests_results["results"],
            "public_tests_results": public_tests_results["results"],
        }

        return results_dict


def check_correctness(
    candidate_solution: str,
    python_stub: str,
    tests: List[Tuple[List[str], str]],
    timeout: int = 6000,
    debug=True,
    add_extra_imports=False,
):
    compilation_status = True
    compilation_error = None
    results = []
    timeout_occurred = False

    for idx, test in enumerate(tests):
        inp, out, expl = test  # (input, expected_output, explanation)
        result = one_test(
            candidate_solution, python_stub, inp, out, timeout=timeout, debug=debug, add_extra_imports=add_extra_imports
        )
        error_message = result["error_message"]

        if error_message is not None:
            if "syntaxerror" in error_message.lower():
                compilation_status = False
                compilation_error = error_message
            if "timeout" in error_message.lower():
                timeout_occurred = True
        results.append(result)

        if timeout_occurred:
            break

    if timeout_occurred:
        return {
            "compilation_status": True,
            "timeout_error": True,
            "error_message": "Timeout error.",
            "results": results,
        }

    return {
        "compilation_status": compilation_status,
        "timeout_error": False,
        "error_message": compilation_error,
        "results": results,
    }

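
# Note: each test consumed above is an (input, expected_output, explanation) triple, e.g. (illustrative):
#   ("nums = [2, 7, 11, 15], target = 9", "[0, 1]", "because nums[0] + nums[1] == 9")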

def one_test(candidate_solution, python_stub, inp, out, timeout=10, debug=False, add_extra_imports=False):
    python_stub = python_stub.strip()
    candidate_solution = candidate_solution.strip()

    out = out.replace("null", "None").replace("true", "True").replace("false", "False")

    # reformat the solution and parse class and method name
    class_def, signature = python_stub.split(" def ")
    class_name = class_def.split("class ")[1].strip().rstrip(":")
    func_name, _ = signature.split("(")

    # reformatting the input
    first_param = r"^\w+\s\=\s"
    later_params = r",\s\w+\s\=\s"

    inp = re.sub(first_param, "", inp)
    inp = re.sub(later_params, ", ", inp)

    # we add custom code to invoke the solution
    before_output = "AFTER THIS COMES OUR OWN GENERATED OUTPUT !@#!@!"
    after_output = "AFTER THIS COMES OUR VERDICT !@#!@!"

    if add_extra_imports:
        sol = f"""
from collections import *
from math import *
import math
from functools import *
from heapq import *
import heapq
import itertools
from itertools import *
import bisect
from bisect import *
"""
    else:
        sol = ""

    sol += f"""
from typing import List, Tuple, Optional
{candidate_solution}
sfohsdfdsfjhsdkfjhsdkjfh = {class_name}()
res = sfohsdfdsfjhsdkfjhsdkjfh.{func_name}({inp})

def nested_list_convert(inp):
    try:
        try:
            inp = list(inp)
        except BaseException as e:
            return inp
        out = []
        for i in inp:
            out.append(nested_list_convert(i))
    except BaseException as e:
        return inp
    return out

matching = False
matching = matching or res == {out}
matching = matching or nested_list_convert(res) == {out}
matching = matching or nested_list_convert(res) == nested_list_convert({out})
matching = matching or str({out})==str(res).replace("{{","[").replace("(","[").replace("}}","]").replace(")","]")
print("res: ", res)
print("out: ", {out})
print("{before_output}")
print(res)
print("{after_output}")
print(matching)
"""

    cmd = "python3"

    proc = Popen([cmd, "-c", sol], stdin=PIPE, stdout=PIPE, stderr=PIPE)

    result_object = {"input": inp, "expected_output": out.strip('"')}

    try:
        # the pipes are byte streams, so stdin input must be bytes
        stdout, stderr = proc.communicate(b"", timeout=timeout)
    except TimeoutExpired:
        if debug:
            log.info(f"Timeout error, timeout={timeout}")
        result_object.update({"status": False, "error_message": "Timeout error.", "generated_output": None})
        return result_object

    finally:
        proc.kill()

    stdout = stdout.decode()
    stderr = stderr.decode().lower()

    if stderr == "":
        # No compilation or runtime error
        stderr = None
    else:
        # Runtime or compilation error (the distinction is made by the presence of "syntaxerror" in the message)
        result_object.update(**{"status": False, "error_message": stderr, "generated_output": None})
        return result_object

    try:
        generated_output = stdout.split(before_output)[1]
        generated_output, verdict = generated_output.split(after_output)
        result_object.update(
            **{
                "status": verdict.strip() == "True",
                "error_message": stderr,
                "generated_output": generated_output.strip(),
            }
        )
        return result_object
    except IndexError:
        raise Exception(f"An unexpected error has occurred while parsing the following generated output: {stdout}")
        # Used in debugging:
        # log.info(e)
        # result_object.update(
        #     **{"status": False, "error_message": "The output couldn't be parsed", "generated_output": None}
        # )
        # return result_object