diff --git a/swebench/harness/dockerfiles.py b/swebench/harness/dockerfiles.py index 6543476e..846ce45c 100644 --- a/swebench/harness/dockerfiles.py +++ b/swebench/harness/dockerfiles.py @@ -1,5 +1,3 @@ -from functools import partial - # IF you change the base image, you need to rebuild all images (run with --force_rebuild) _DOCKERFILE_BASE = r""" FROM --platform={platform} ubuntu:22.04 diff --git a/swebench/harness/grading.py b/swebench/harness/grading.py index b41c1a81..9f2ddee7 100644 --- a/swebench/harness/grading.py +++ b/swebench/harness/grading.py @@ -6,6 +6,7 @@ APPLY_PATCH_PASS, FAIL_TO_FAIL, FAIL_TO_PASS, + KEY_INSTANCE_ID, PASS_TO_FAIL, PASS_TO_PASS, RESET_FAILED, @@ -225,7 +226,7 @@ def get_eval_report( """ report_map = {} - instance_id = prediction["instance_id"] + instance_id = prediction[KEY_INSTANCE_ID] if instance_id not in report_map: report_map[instance_id] = { "patch_is_None": False, @@ -248,13 +249,13 @@ def get_eval_report( report_map[instance_id]["patch_successfully_applied"] = True eval_ref = { - "instance_id": test_spec.instance_id, - "FAIL_TO_PASS": test_spec.FAIL_TO_PASS, - "PASS_TO_PASS": test_spec.PASS_TO_PASS, + KEY_INSTANCE_ID: test_spec.instance_id, + FAIL_TO_PASS: test_spec.FAIL_TO_PASS, + PASS_TO_PASS: test_spec.PASS_TO_PASS, } report = get_eval_tests_report(eval_sm, eval_ref) - if get_resolution_status(report) == "RESOLVED_FULL": + if get_resolution_status(report) == ResolvedStatus.FULL.value: report_map[instance_id]["resolved"] = True if include_tests_status: diff --git a/swebench/harness/log_parsers.py b/swebench/harness/log_parsers.py index e3e249be..fcf274c1 100644 --- a/swebench/harness/log_parsers.py +++ b/swebench/harness/log_parsers.py @@ -1,12 +1,6 @@ import re from enum import Enum - - -class TestStatus(Enum): - FAILED = "FAILED" - PASSED = "PASSED" - SKIPPED = "SKIPPED" - ERROR = "ERROR" +from swebench.harness.constants import TestStatus def parse_log_pytest(log: str) -> dict[str, str]: diff --git a/swebench/harness/prepare_images.py b/swebench/harness/prepare_images.py index ea11c302..d6ad710d 100644 --- a/swebench/harness/prepare_images.py +++ b/swebench/harness/prepare_images.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser +from swebench.harness.constants import KEY_INSTANCE_ID from swebench.harness.docker_build import build_instance_images from swebench.harness.docker_utils import list_images from swebench.harness.test_spec import make_test_spec @@ -29,12 +30,12 @@ def filter_dataset_to_build( data_to_build = [] # Check if all instance IDs are in the dataset - not_in_dataset = set(instance_ids).difference(set([instance["instance_id"] for instance in dataset])) + not_in_dataset = set(instance_ids).difference(set([instance[KEY_INSTANCE_ID] for instance in dataset])) if not_in_dataset: raise ValueError(f"Instance IDs not found in dataset: {not_in_dataset}") for instance in dataset: - if instance["instance_id"] not in instance_ids: + if instance[KEY_INSTANCE_ID] not in instance_ids: # Skip instances not in the list continue diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py index a3c589bf..0cd2dd1a 100644 --- a/swebench/harness/run_evaluation.py +++ b/swebench/harness/run_evaluation.py @@ -14,6 +14,7 @@ APPLY_PATCH_FAIL, APPLY_PATCH_PASS, INSTANCE_IMAGE_BUILD_DIR, + KEY_INSTANCE_ID, RUN_EVALUATION_LOG_DIR, ) from swebench.harness.docker_utils import ( @@ -302,7 +303,7 @@ def get_dataset_from_preds( """ # load dataset dataset = load_swebench_dataset(dataset_name, split) - dataset_ids = {i["instance_id"] for i in dataset} + dataset_ids = {i[KEY_INSTANCE_ID] for i in dataset} if instance_ids: # check that all instance IDs are in the dataset @@ -331,34 +332,34 @@ def get_dataset_from_preds( if instance_ids: # filter dataset to just the instance IDs - dataset = [i for i in dataset if i["instance_id"] in instance_ids] + dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids] # check which instance IDs have already been run completed_ids = set() for instance in dataset: - if instance["instance_id"] not in prediction_ids: + if instance[KEY_INSTANCE_ID] not in prediction_ids: # skip instances without predictions continue - prediction = predictions[instance["instance_id"]] + prediction = predictions[instance[KEY_INSTANCE_ID]] report_file = ( RUN_EVALUATION_LOG_DIR / run_id / prediction["model_name_or_path"].replace("/", "__") - / prediction["instance_id"] + / prediction[KEY_INSTANCE_ID] / "report.json" ) if report_file.exists(): - completed_ids.add(instance["instance_id"]) + completed_ids.add(instance[KEY_INSTANCE_ID]) if completed_ids and exclude_completed: # filter dataset to only instances that have not been run print(f"{len(completed_ids)} instances already run, skipping...") - dataset = [i for i in dataset if i["instance_id"] not in completed_ids] + dataset = [i for i in dataset if i[KEY_INSTANCE_ID] not in completed_ids] empty_patch_ids = {k for k, v in predictions.items() if v["model_patch"] == "" or v["model_patch"] is None} # filter dataset to only instances with predictions - dataset = [i for i in dataset if i["instance_id"] in prediction_ids and i["instance_id"] not in empty_patch_ids] + dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in prediction_ids and i[KEY_INSTANCE_ID] not in empty_patch_ids] return dataset @@ -394,7 +395,7 @@ def make_run_report( # iterate through dataset and check if the instance has been run for instance in full_dataset: - instance_id = instance["instance_id"] + instance_id = instance[KEY_INSTANCE_ID] if instance_id not in predictions: # skip instances without incomplete_ids.add(instance_id) @@ -407,7 +408,7 @@ def make_run_report( RUN_EVALUATION_LOG_DIR / run_id / prediction["model_name_or_path"].replace("/", "__") - / prediction["instance_id"] + / prediction[KEY_INSTANCE_ID] / "report.json" ) if report_file.exists(): @@ -486,7 +487,7 @@ def get_gold_predictions(dataset_name: str, split: str): dataset = load_swebench_dataset(dataset_name, split) return [ { - "instance_id": datum["instance_id"], + KEY_INSTANCE_ID: datum[KEY_INSTANCE_ID], "model_patch": datum["patch"], "model_name_or_path": "gold", } for datum in dataset @@ -527,7 +528,7 @@ def main( predictions = [json.loads(line) for line in f] else: raise ValueError("Predictions path must be \"gold\", .json, or .jsonl") - predictions = {pred["instance_id"]: pred for pred in predictions} + predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions} # get dataset from predictions dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id) diff --git a/swebench/harness/test_spec.py b/swebench/harness/test_spec.py index 93913916..b59db676 100644 --- a/swebench/harness/test_spec.py +++ b/swebench/harness/test_spec.py @@ -8,6 +8,9 @@ from swebench.harness.constants import ( SWEbenchInstance, + KEY_INSTANCE_ID, + FAIL_TO_PASS, + PASS_TO_PASS, MAP_REPO_TO_INSTALL, MAP_REPO_VERSION_TO_SPECS, USE_X86, @@ -255,7 +258,7 @@ def make_eval_script_list(instance, specs, env_name, repo_directory, base_commit def make_test_spec(instance: SWEbenchInstance) -> TestSpec: if isinstance(instance, TestSpec): return instance - instance_id = instance["instance_id"] + instance_id = instance[KEY_INSTANCE_ID] repo = instance["repo"] version = instance["version"] base_commit = instance["base_commit"] @@ -269,8 +272,8 @@ def _from_json_or_obj(key: str) -> Any: return json.loads(instance[key]) return instance[key] - pass_to_pass = _from_json_or_obj("PASS_TO_PASS") - fail_to_pass = _from_json_or_obj("FAIL_TO_PASS") + pass_to_pass = _from_json_or_obj(PASS_TO_PASS) + fail_to_pass = _from_json_or_obj(FAIL_TO_PASS) env_name = "testbed" repo_directory = f"/{env_name}" diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py index 41005ad1..e3160ee5 100644 --- a/swebench/harness/utils.py +++ b/swebench/harness/utils.py @@ -6,10 +6,8 @@ from argparse import ArgumentTypeError from datasets import Dataset, load_dataset -from datetime import datetime from dotenv import load_dotenv from functools import cache -from git import Repo from typing import cast from swebench.harness.constants import ( diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index eab3f660..7dff357e 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -2,19 +2,30 @@ import json import docker +from swebench.harness.constants import ( + FAIL_TO_PASS, + PASS_TO_PASS, + KEY_INSTANCE_ID, + KEY_MODEL, +) from swebench.harness.run_evaluation import make_run_report TEST_INSTANCE = collections.defaultdict(lambda: "test") -TEST_INSTANCE["PASS_TO_PASS"] = '[]' +TEST_INSTANCE[PASS_TO_PASS] = '[]' TEST_INSTANCE["repo"] = 'pvlib/pvlib-python' TEST_INSTANCE["version"] = '0.1' -TEST_INSTANCE["FAIL_TO_PASS"] = '[]' +TEST_INSTANCE[FAIL_TO_PASS] = '[]' def test_make_run_report(tmpdir) -> None: client = docker.from_env() with tmpdir.as_cwd(): output_path = make_run_report( - {"test": {"instance_id": "test", "model_name_or_path": "test"}}, + { + "test": { + KEY_INSTANCE_ID: "test", + KEY_MODEL: "test" + } + }, [TEST_INSTANCE], client, "test"