forked from aorwall/SWE-bench-docker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_instance_tests.py
146 lines (120 loc) · 5.51 KB
/
run_instance_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""Run evaluation"""
import argparse
import asyncio
import logging
import os
import re
import tempfile
from swebench.metrics.getters import get_eval_refs
from swebench_docker.constants import (
KEY_INSTANCE_ID,
KEY_MODEL,
KEY_PREDICTION, MAP_REPO_TO_TEST_FRAMEWORK, )
from swebench_docker.run_docker import run_docker_evaluation
from swebench_docker.utils import get_instances
# Root logging setup: records at INFO and above, each prefixed with a
# timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-wide logger; the name is a fixed string rather than __name__.
logger = logging.getLogger("run_evaluation")
def parse_test_output(log_str: str, test_cmd: str, instance_id: str) -> str:
    """Extract the captured test output that follows ``test_cmd`` in a log.

    The returned slice starts at whichever of the markers ``"Std. Output"``
    or ``"Std. Error"`` occurs first after the first occurrence of
    ``test_cmd``. It ends just before the newline that introduces the next
    log entry for this instance, i.e. a line beginning with
    ``[<anything>] [<instance_id>]``. If no such entry follows, the slice
    runs to the end of the log.

    Args:
        log_str: Full text of the evaluation log.
        test_cmd: Test command to locate; matched as a literal substring.
        instance_id: Instance identifier used in log-entry prefixes; matched
            literally.

    Returns:
        The extracted output, or one of the diagnostic strings
        ``"Test command not found in log"`` / ``"No stdout or stderr found"``
        when the corresponding text is missing from the log.
    """
    test_cmd_idx = log_str.find(test_cmd)
    if test_cmd_idx == -1:
        return "Test command not found in log"

    # Collect the positions of the markers that are actually present and
    # start at the earliest one, so a leading stderr section is not dropped
    # when stdout happens to come later.
    marker_positions = [
        idx
        for idx in (
            log_str.find("Std. Output", test_cmd_idx),
            log_str.find("Std. Error", test_cmd_idx),
        )
        if idx != -1
    ]
    if not marker_positions:
        return "No stdout or stderr found"
    start_idx = min(marker_positions)

    # End on the next log entry ("[instance_version] [instance_id]").
    # re.escape keeps regex metacharacters in the ID (".", "(", "+", ...)
    # from being interpreted as pattern syntax.
    pattern = re.compile(r"\n\[[^\]]*\] \[" + re.escape(instance_id) + r"\]")
    match = pattern.search(log_str, start_idx)
    end_idx = None if match is None else match.start()
    return log_str[start_idx:end_idx]
async def run_instance_tests(
    instance_id: str,
    patch: str,
    # Quoted so the union syntax is not evaluated on interpreters older
    # than Python 3.10.
    test_directives: "list[str] | None" = None,
    model: str = "diff",
    swe_bench_tasks: str = "princeton-nlp/SWE-bench_Lite",
    namespace: str = "aorwall",
    log_output: bool = False,
) -> str:
    """Run specified tests on an instance/patch and return the stdout and stderr.

    Args:
        instance_id: Key used to look up the task in the loaded task references.
        patch: Patch to evaluate; must be non-empty.
        test_directives: Directives appended to the test command. ``None``
            (the default) means no directives.
        model: Model label stored with the prediction; it is also part of the
            evaluation log file name.
        swe_bench_tasks: Passed to ``get_eval_refs`` to load the task references.
        namespace: Passed through to ``run_docker_evaluation``.
        log_output: When true, the full evaluation log is logged at INFO level.

    Returns:
        The section of the evaluation log selected by ``parse_test_output``.

    Raises:
        ValueError: If ``patch`` is empty.
    """
    if not patch:
        # TODO: Add support for running tests with empty patch
        raise ValueError("Must provide a valid patch to evaluate")

    # Work on a private copy: the list is stored in the instance dict and
    # handed to the evaluation, so neither a shared default nor the caller's
    # list should be exposed to mutation.
    test_directives = [] if test_directives is None else list(test_directives)

    tasks = get_eval_refs(swe_bench_tasks)
    task = tasks[instance_id]

    # Show more detailed test output for troubleshooting.
    # str.replace is a no-op when the flag is absent, so no guard is needed.
    test_type = MAP_REPO_TO_TEST_FRAMEWORK[task["repo"]].replace("--tb=no", "")
    test_cmd = f"{test_type} {' '.join(test_directives)}"

    instance = {
        KEY_INSTANCE_ID: instance_id,
        KEY_PREDICTION: patch,
        KEY_MODEL: model,
        "repo": task["repo"],
        "version": task["version"],
        "base_commit": task["base_commit"],
        "test_patch": task["test_patch"],
        "test_directives": test_directives,
        "test_cmd": test_cmd,
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        await run_docker_evaluation(instance, namespace, temp_dir)

        # The evaluation is expected to leave its log in temp_dir under this
        # name; read it before the temporary directory is removed.
        eval_log = os.path.join(temp_dir, f"{instance_id}.{instance[KEY_MODEL]}.eval.log")
        with open(eval_log, "r") as f:
            log_str = f.read()

    if log_output:
        logger.info("Instance %s evaluation logs:", instance_id)
        logger.info("%s", log_str)

    return parse_test_output(log_str, test_cmd, instance_id)
async def main(
    instance_id: str,
    predictions_path: str,
    test_directives: list[str],
    swe_bench_tasks: str,
    namespace: str,
    test_output_dir: str,
) -> str:
    """
    Runs arbitrary tests on a single instance's prediction and returns the stdout and stderr

    Args:
        instance_id (str): ID of the instance to run the tests on.
        predictions_path (str): Path to the predictions file. If not specified the golden patch will be run.
        test_directives (list[str]): A list of directives to pass to the test.
        swe_bench_tasks (str): Path to the SWE-bench tasks file OR HF dataset name.
        namespace (str): Docker repository namespace.
        test_output_dir (str): Optional directory to write the test output to,
            as "<instance_id>.test_output.txt". Created if it does not exist.

    Returns:
        str: The test output returned by run_instance_tests.

    Raises:
        IndexError: If predictions_path is given but holds no prediction for instance_id.
    """
    if predictions_path:
        predictions_path = os.path.abspath(predictions_path)
        predictions = get_instances(predictions_path)
        prediction = next(
            (p for p in predictions if p[KEY_INSTANCE_ID] == instance_id), None
        )
        if prediction is None:
            # Same exception type as indexing an empty match list, but with
            # a message that says what was missing.
            raise IndexError(
                f"No prediction for instance {instance_id!r} found in {predictions_path}"
            )
        patch = prediction[KEY_PREDICTION]
        model = prediction[KEY_MODEL]
    else:
        # No predictions file: fall back to the task's own "patch" entry.
        tasks = get_eval_refs(swe_bench_tasks)
        task = tasks[instance_id]
        patch = task["patch"]
        model = "golden"

    output = await run_instance_tests(
        instance_id,
        patch,
        test_directives,
        model,
        swe_bench_tasks,
        namespace,
        log_output=True,
    )

    if test_output_dir:
        test_output_dir = os.path.abspath(test_output_dir)
        # Create the output directory itself (not just its parent) so the
        # write below cannot fail on a missing directory.
        os.makedirs(test_output_dir, exist_ok=True)
        test_output_path = os.path.join(test_output_dir, f"{instance_id}.test_output.txt")
        with open(test_output_path, "w") as f:
            f.write(output)

    return output
if __name__ == "__main__":
    # Command-line options as (flag, keyword arguments) pairs, listed in the
    # order they are registered. Every option is parsed as a string; only
    # --instance_id is mandatory.
    cli_options = [
        ("--instance_id", {"help": "Instance ID", "required": True}),
        (
            "--swe_bench_tasks",
            {
                "help": "Path to dataset file or HF datasets name",
                "default": "princeton-nlp/SWE-bench_Lite",
            },
        ),
        ("--namespace", {"help": "Docker repository namespace", "default": "aorwall"}),
        ("--predictions_path", {"help": "Path to predictions file (must be .json)"}),
        (
            "--test_directives",
            {"help": "Directives to pass to the test", "nargs": "*", "default": []},
        ),
        ("--test_output_dir", {"help": "Path to write the test output"}),
    ]

    parser = argparse.ArgumentParser(description=__doc__)
    for flag, options in cli_options:
        parser.add_argument(flag, type=str, **options)
    args = parser.parse_args()

    # The parsed option names match main()'s parameter names one-to-one, so
    # the namespace can be expanded directly into keyword arguments.
    output = asyncio.run(main(**vars(args)))