Replace strings with constants
john-b-yang committed Jul 15, 2024
1 parent 55a5dd3 commit 5836826
Showing 8 changed files with 43 additions and 36 deletions.
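
The diffs below lean on names imported from swebench.harness.constants that this commit does not show. A minimal sketch of the assumed definitions, with values inferred from the string literals they replace (not confirmed by this diff):

# Hypothetical sketch of the relevant swebench/harness/constants.py entries;
# values are inferred from the literals replaced in this commit.
KEY_INSTANCE_ID = "instance_id"      # dict key identifying a benchmark instance
KEY_MODEL = "model_name_or_path"     # dict key identifying the predicting model
FAIL_TO_PASS = "FAIL_TO_PASS"        # dataset column of tests expected to flip to passing
PASS_TO_PASS = "PASS_TO_PASS"        # dataset column of tests expected to keep passing
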
2 changes: 0 additions & 2 deletions swebench/harness/dockerfiles.py
@@ -1,5 +1,3 @@
-from functools import partial
-
# IF you change the base image, you need to rebuild all images (run with --force_rebuild)
_DOCKERFILE_BASE = r"""
FROM --platform={platform} ubuntu:22.04
11 changes: 6 additions & 5 deletions swebench/harness/grading.py
@@ -6,6 +6,7 @@
APPLY_PATCH_PASS,
FAIL_TO_FAIL,
FAIL_TO_PASS,
+    KEY_INSTANCE_ID,
PASS_TO_FAIL,
PASS_TO_PASS,
RESET_FAILED,
@@ -225,7 +226,7 @@ def get_eval_report(
"""
report_map = {}

-    instance_id = prediction["instance_id"]
+    instance_id = prediction[KEY_INSTANCE_ID]
if instance_id not in report_map:
report_map[instance_id] = {
"patch_is_None": False,
@@ -248,13 +249,13 @@ def get_eval_report(
report_map[instance_id]["patch_successfully_applied"] = True

    eval_ref = {
-        "instance_id": test_spec.instance_id,
-        "FAIL_TO_PASS": test_spec.FAIL_TO_PASS,
-        "PASS_TO_PASS": test_spec.PASS_TO_PASS,
+        KEY_INSTANCE_ID: test_spec.instance_id,
+        FAIL_TO_PASS: test_spec.FAIL_TO_PASS,
+        PASS_TO_PASS: test_spec.PASS_TO_PASS,
}

report = get_eval_tests_report(eval_sm, eval_ref)
-    if get_resolution_status(report) == "RESOLVED_FULL":
+    if get_resolution_status(report) == ResolvedStatus.FULL.value:
report_map[instance_id]["resolved"] = True

if include_tests_status:
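
The new comparison reads ResolvedStatus.FULL.value instead of the bare "RESOLVED_FULL" string. The enum itself is not part of this diff; a minimal sketch, assuming it also lives in swebench.harness.constants:

from enum import Enum

class ResolvedStatus(Enum):
    # Only FULL ("RESOLVED_FULL") is confirmed by the replaced literal above;
    # the other members and their values are assumptions.
    NO = "RESOLVED_NO"
    PARTIAL = "RESOLVED_PARTIAL"
    FULL = "RESOLVED_FULL"

# The rewritten check is behavior-preserving:
assert ResolvedStatus.FULL.value == "RESOLVED_FULL"
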
8 changes: 1 addition & 7 deletions swebench/harness/log_parsers.py
@@ -1,12 +1,6 @@
import re
-from enum import Enum
-

-class TestStatus(Enum):
-    FAILED = "FAILED"
-    PASSED = "PASSED"
-    SKIPPED = "SKIPPED"
-    ERROR = "ERROR"
+from swebench.harness.constants import TestStatus


def parse_log_pytest(log: str) -> dict[str, str]:
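
TestStatus keeps the same members; only its home moves to swebench.harness.constants. A short usage sketch of how a parser's output is assumed to relate to the enum (the test names and outcomes below are invented):

from swebench.harness.constants import TestStatus  # post-commit import path

# Hypothetical parsed result: parse_log_pytest maps test name -> status string.
log_status = {
    "tests/test_api.py::test_get": TestStatus.PASSED.value,
    "tests/test_api.py::test_post": TestStatus.FAILED.value,
}
assert log_status["tests/test_api.py::test_get"] == "PASSED"
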
5 changes: 3 additions & 2 deletions swebench/harness/prepare_images.py
@@ -3,6 +3,7 @@

from argparse import ArgumentParser

+from swebench.harness.constants import KEY_INSTANCE_ID
from swebench.harness.docker_build import build_instance_images
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec import make_test_spec
@@ -29,12 +30,12 @@ def filter_dataset_to_build(
data_to_build = []

# Check if all instance IDs are in the dataset
-    not_in_dataset = set(instance_ids).difference(set([instance["instance_id"] for instance in dataset]))
+    not_in_dataset = set(instance_ids).difference(set([instance[KEY_INSTANCE_ID] for instance in dataset]))
if not_in_dataset:
raise ValueError(f"Instance IDs not found in dataset: {not_in_dataset}")

for instance in dataset:
-        if instance["instance_id"] not in instance_ids:
+        if instance[KEY_INSTANCE_ID] not in instance_ids:
# Skip instances not in the list
continue

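
A toy illustration of the KEY_INSTANCE_ID-based membership check above; the instance IDs are invented and the constant's value is assumed:

KEY_INSTANCE_ID = "instance_id"  # assumed value, standing in for the real constant

dataset = [{KEY_INSTANCE_ID: "repo__1"}, {KEY_INSTANCE_ID: "repo__2"}]
instance_ids = ["repo__2", "repo__3"]

not_in_dataset = set(instance_ids).difference({d[KEY_INSTANCE_ID] for d in dataset})
if not_in_dataset:
    # filter_dataset_to_build raises ValueError at this point; "repo__3" is unknown here
    print(f"Instance IDs not found in dataset: {not_in_dataset}")
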
25 changes: 13 additions & 12 deletions swebench/harness/run_evaluation.py
@@ -14,6 +14,7 @@
APPLY_PATCH_FAIL,
APPLY_PATCH_PASS,
INSTANCE_IMAGE_BUILD_DIR,
+    KEY_INSTANCE_ID,
RUN_EVALUATION_LOG_DIR,
)
from swebench.harness.docker_utils import (
@@ -302,7 +303,7 @@ def get_dataset_from_preds(
"""
# load dataset
dataset = load_swebench_dataset(dataset_name, split)
-    dataset_ids = {i["instance_id"] for i in dataset}
+    dataset_ids = {i[KEY_INSTANCE_ID] for i in dataset}

if instance_ids:
# check that all instance IDs are in the dataset
@@ -331,34 +332,34 @@ def get_dataset_from_preds(

if instance_ids:
# filter dataset to just the instance IDs
-        dataset = [i for i in dataset if i["instance_id"] in instance_ids]
+        dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids]

# check which instance IDs have already been run
completed_ids = set()
for instance in dataset:
-        if instance["instance_id"] not in prediction_ids:
+        if instance[KEY_INSTANCE_ID] not in prediction_ids:
# skip instances without predictions
continue
-        prediction = predictions[instance["instance_id"]]
+        prediction = predictions[instance[KEY_INSTANCE_ID]]
report_file = (
RUN_EVALUATION_LOG_DIR
/ run_id
/ prediction["model_name_or_path"].replace("/", "__")
-            / prediction["instance_id"]
+            / prediction[KEY_INSTANCE_ID]
/ "report.json"
)
if report_file.exists():
-            completed_ids.add(instance["instance_id"])
+            completed_ids.add(instance[KEY_INSTANCE_ID])

if completed_ids and exclude_completed:
# filter dataset to only instances that have not been run
print(f"{len(completed_ids)} instances already run, skipping...")
-        dataset = [i for i in dataset if i["instance_id"] not in completed_ids]
+        dataset = [i for i in dataset if i[KEY_INSTANCE_ID] not in completed_ids]

empty_patch_ids = {k for k, v in predictions.items() if v["model_patch"] == "" or v["model_patch"] is None}

# filter dataset to only instances with predictions
-    dataset = [i for i in dataset if i["instance_id"] in prediction_ids and i["instance_id"] not in empty_patch_ids]
+    dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in prediction_ids and i[KEY_INSTANCE_ID] not in empty_patch_ids]
return dataset


@@ -394,7 +395,7 @@ def make_run_report(

# iterate through dataset and check if the instance has been run
for instance in full_dataset:
-        instance_id = instance["instance_id"]
+        instance_id = instance[KEY_INSTANCE_ID]
if instance_id not in predictions:
# skip instances without
incomplete_ids.add(instance_id)
@@ -407,7 +408,7 @@ def make_run_report(
RUN_EVALUATION_LOG_DIR
/ run_id
/ prediction["model_name_or_path"].replace("/", "__")
-            / prediction["instance_id"]
+            / prediction[KEY_INSTANCE_ID]
/ "report.json"
)
if report_file.exists():
@@ -486,7 +487,7 @@ def get_gold_predictions(dataset_name: str, split: str):
dataset = load_swebench_dataset(dataset_name, split)
return [
        {
-            "instance_id": datum["instance_id"],
+            KEY_INSTANCE_ID: datum[KEY_INSTANCE_ID],
"model_patch": datum["patch"],
"model_name_or_path": "gold",
} for datum in dataset
@@ -527,7 +528,7 @@ def main(
predictions = [json.loads(line) for line in f]
else:
raise ValueError("Predictions path must be \"gold\", .json, or .jsonl")
-    predictions = {pred["instance_id"]: pred for pred in predictions}
+    predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions}

# get dataset from predictions
dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
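
For reference, these are the keys run_evaluation.py reads from each prediction after this change. A hedged sketch of one predictions.jsonl entry, with an invented instance ID, model name, and patch:

import json

# Hypothetical predictions.jsonl line; only the three keys read above are shown.
line = json.dumps({
    "instance_id": "example__repo-123",   # looked up via KEY_INSTANCE_ID
    "model_name_or_path": "my-model",     # used to build the per-model log directory
    "model_patch": "diff --git a/f.py b/f.py\n...",
})
pred = json.loads(line)
predictions = {pred["instance_id"]: pred}  # same keying as main() above
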
9 changes: 6 additions & 3 deletions swebench/harness/test_spec.py
@@ -8,6 +8,9 @@

from swebench.harness.constants import (
SWEbenchInstance,
+    KEY_INSTANCE_ID,
+    FAIL_TO_PASS,
+    PASS_TO_PASS,
MAP_REPO_TO_INSTALL,
MAP_REPO_VERSION_TO_SPECS,
USE_X86,
@@ -255,7 +258,7 @@ def make_eval_script_list(instance, specs, env_name, repo_directory, base_commit
def make_test_spec(instance: SWEbenchInstance) -> TestSpec:
if isinstance(instance, TestSpec):
return instance
-    instance_id = instance["instance_id"]
+    instance_id = instance[KEY_INSTANCE_ID]
repo = instance["repo"]
version = instance["version"]
base_commit = instance["base_commit"]
@@ -269,8 +272,8 @@ def _from_json_or_obj(key: str) -> Any:
return json.loads(instance[key])
return instance[key]

-    pass_to_pass = _from_json_or_obj("PASS_TO_PASS")
-    fail_to_pass = _from_json_or_obj("FAIL_TO_PASS")
+    pass_to_pass = _from_json_or_obj(PASS_TO_PASS)
+    fail_to_pass = _from_json_or_obj(FAIL_TO_PASS)

env_name = "testbed"
repo_directory = f"/{env_name}"
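
Passing the constants to _from_json_or_obj only keeps working if they equal the literal dataset column names. A minimal sketch of that lookup, assuming FAIL_TO_PASS == "FAIL_TO_PASS" and an invented, JSON-encoded column value:

import json

FAIL_TO_PASS = "FAIL_TO_PASS"  # assumed constant value, matching the dataset column name
instance = {FAIL_TO_PASS: '["tests/test_mod.py::test_one"]'}  # invented instance

# Mirrors _from_json_or_obj: decode JSON text, pass through anything already decoded.
value = instance[FAIL_TO_PASS]
fail_to_pass = json.loads(value) if isinstance(value, str) else value
assert fail_to_pass == ["tests/test_mod.py::test_one"]
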
2 changes: 0 additions & 2 deletions swebench/harness/utils.py
@@ -6,10 +6,8 @@

from argparse import ArgumentTypeError
from datasets import Dataset, load_dataset
-from datetime import datetime
from dotenv import load_dotenv
from functools import cache
-from git import Repo
from typing import cast

from swebench.harness.constants import (
17 changes: 14 additions & 3 deletions tests/test_evaluation.py
@@ -2,19 +2,30 @@
import json
import docker

+from swebench.harness.constants import (
+    FAIL_TO_PASS,
+    PASS_TO_PASS,
+    KEY_INSTANCE_ID,
+    KEY_MODEL,
+)
from swebench.harness.run_evaluation import make_run_report

TEST_INSTANCE = collections.defaultdict(lambda: "test")
-TEST_INSTANCE["PASS_TO_PASS"] = '[]'
+TEST_INSTANCE[PASS_TO_PASS] = '[]'
TEST_INSTANCE["repo"] = 'pvlib/pvlib-python'
TEST_INSTANCE["version"] = '0.1'
-TEST_INSTANCE["FAIL_TO_PASS"] = '[]'
+TEST_INSTANCE[FAIL_TO_PASS] = '[]'

def test_make_run_report(tmpdir) -> None:
client = docker.from_env()
with tmpdir.as_cwd():
output_path = make_run_report(
-            {"test": {"instance_id": "test", "model_name_or_path": "test"}},
+            {
+                "test": {
+                    KEY_INSTANCE_ID: "test",
+                    KEY_MODEL: "test"
+                }
+            },
[TEST_INSTANCE],
client,
"test"
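
Assuming the constant values sketched earlier, the rewritten prediction fixture is the same dict the test built before, so the test's behavior should be unchanged:

KEY_INSTANCE_ID = "instance_id"   # assumed values; stand-ins for the real constants
KEY_MODEL = "model_name_or_path"

assert {KEY_INSTANCE_ID: "test", KEY_MODEL: "test"} == {
    "instance_id": "test",
    "model_name_or_path": "test",
}
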
