iskng
diff --git a/‎run.py
Lines changed: 30 additions & 13 deletions b/‎run.py
Lines changed: 30 additions & 13 deletions
diff --git a/‎run_replay.py
Lines changed: 6 additions & 11 deletions b/‎run_replay.py
Lines changed: 6 additions & 11 deletions
diff --git a/‎sweagent/environment/swe_env.py
Lines changed: 60 additions & 29 deletions b/‎sweagent/environment/swe_env.py
Lines changed: 60 additions & 29 deletions
@@ -2,6 +2,7 @@
 import logging
 import os
 import re
+import subprocess
 import traceback
 from typing import Any, Dict, Optional
 import rich.console
@@ -42,22 +43,20 @@
 @dataclass(frozen=True)
 class ActionsArguments(FlattenedAccess, FrozenSerializable):
     """Run real-life actions (opening PRs, etc.) if we can solve the issue."""
-    open_pr: bool = False  # Open a PR with the patch if we can solve the issue
-    # Skip action if there are already commits claiming to fix the issue. Please only
-    # set this to False if you are sure the commits are not fixes or if this is your
-    # own repository!
+    # Open a PR with the patch if we can solve the issue
+    open_pr: bool = False  
+    # Apply the saved patch to the local repository. Only takes effect when the
+    # instance's repository is a local one and a patch file was written.
+    apply_patch_locally: bool = False
+    # Option to be used with open_pr: Skip action if there are already commits claiming 
+    # to fix the issue. Please only set this to False if you are sure the commits are 
+    # not fixes or if this is your own repository!
     skip_if_commits_reference_issue: bool = True  
-    # For PRs: If you want to push the branch to a fork (e.g., because you lack
-    # permissions to push to the main repo), set this to the URL of the fork.
+    # OBSOLETE. Do not use: any non-empty value raises ValueError in __post_init__.
     push_gh_repo_url: str = ""
 
     def __post_init__(self):
-        if not self.skip_if_commits_reference_issue and self.push_gh_repo_url:
-            raise ValueError(
-                "Overriding `skip_if_commits_reference_issue` when you are "
-                "pushing to a fork is not supported. You should manually "
-                "apply the patch to the forked repository."
-            )
+        # Reject the obsolete option at construction time instead of ignoring it.
+        if self.push_gh_repo_url:
+            raise ValueError("push_gh_repo_url is obsolete. Use repo_path instead")
 
 @dataclass(frozen=True)
 class ScriptArguments(FlattenedAccess, FrozenSerializable):
@@ -118,6 +117,7 @@ def main(args: ScriptArguments):
             # Get info, patch information
             issue = getattr(env, "query", None)
             files = []
+            assert env.record is not None  # mypy
             if "patch" in env.record:
                 files = "\n".join(
                     [f"- {x.path}" for x in PatchSet(env.record["patch"]).modified_files]
@@ -147,9 +147,11 @@ def main(args: ScriptArguments):
                 return_type="info_trajectory",
             )
             save_predictions(traj_dir, instance_id, info)
-            save_patch(traj_dir, instance_id, info)
+            patch_path = save_patch(traj_dir, instance_id, info)
             if args.actions.open_pr and should_open_pr(args, info, token=env._github_token):
                 env.open_pr(trajectory=trajectory, push_gh_repo_url=args.actions.push_gh_repo_url)
+            if args.actions.apply_patch_locally and patch_path is not None and env.record["repo_type"] == "local":
+                apply_patch(Path(args.environment.repo_path), patch_file=patch_path)
 
         except KeyboardInterrupt:
             logger.info("Exiting InterCode environment...")
@@ -281,6 +283,21 @@ def save_patch(traj_dir: Path, instance_id: str, info) -> Optional[Path]:
     return patch_output_file
 
 
+def apply_patch(local_dir: Path, patch_file: Path) -> None:
+    """Apply a patch to a local directory.
+
+    Runs ``git apply <patch_file>`` with ``local_dir`` as the working
+    directory. If ``git apply`` exits with a non-zero status, the failure is
+    logged as an error and the function returns normally (the exception is
+    not re-raised).
+
+    Args:
+        local_dir: Existing directory in which ``git apply`` is executed.
+        patch_file: Existing patch file to apply.
+
+    Raises:
+        AssertionError: If ``local_dir`` is not a directory or ``patch_file``
+            does not exist. These checks are plain ``assert`` statements, so
+            they are skipped when Python runs with ``-O``.
+    """
+    assert local_dir.is_dir()
+    assert patch_file.exists()
+    # The patch path must be made absolute with resolve(): the command runs
+    # with cwd=local_dir, so a relative path would be looked up there instead.
+    cmd = ["git", "apply", str(patch_file.resolve())]
+    try:
+        subprocess.run(cmd, cwd=local_dir, check=True)
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to apply patch {patch_file} to {local_dir}: {e}")
+        return
+    logger.info(f"Applied patch {patch_file} to {local_dir}")
+
+    
 def _print_patch_message(patch_output_file: Path):
     console = rich.console.Console()
     msg = [
 
@@ -5,7 +5,6 @@
 import yaml
 
 from argparse import ArgumentParser
-from sweagent.environment.utils import is_from_github_url
 from typing import Any, Dict, List
 import run as runscript
 
@@ -66,16 +65,15 @@ def create_task_instances_tmp_file(data: List[Dict[str, Any]]) -> str:
                 print(json.dumps(d), file=f, end="\n", flush=True)
         return tmp_path
 
-    is_github = False
+    is_other = False
     if data_path.endswith(".jsonl"):
         replay_task_instances_path = create_task_instances_tmp_file([json.loads(x) for x in open(data_path, "r").readlines()])
     elif data_path.endswith(".json"):
         replay_task_instances_path = create_task_instances_tmp_file(json.load(open(data_path)))
-    elif is_from_github_url(data_path):
-        is_github = True
-        replay_task_instances_path = data_path
     else:
-        raise ValueError("--data_path must be a .json or .jsonl")
+        # Assume data_path is a github url or local url
+        is_other = True
+        replay_task_instances_path = data_path
 
     # Call run.py via subprocess
     run_args = [
@@ -86,7 +84,7 @@ def create_task_instances_tmp_file(data: List[Dict[str, Any]]) -> str:
         "--replay_path", replay_action_trajs_path,
         *forward_args,
     ]
-    if is_github:
+    if is_other:
         # Not sure if this only applies to github urls for data_path
         run_args.extend(["--skip_existing", "False"])
     if suffix is not None:
@@ -95,11 +93,8 @@ def create_task_instances_tmp_file(data: List[Dict[str, Any]]) -> str:
     runscript.main(script_args)
 
     os.remove(replay_action_trajs_path)
-    try:
+    if not is_other:
         os.remove(replay_task_instances_path)
-    except FileNotFoundError:
-        pass
-
 
 def main(
     traj_path: str,
 
@@ -19,12 +19,12 @@
 from simple_parsing.helpers.serialization.serializable import FrozenSerializable
 import yaml
 from sweagent.environment.utils import (
+    copy_anything_to_container,
     copy_file_to_container,
     format_trajectory_markdown,
     get_container,
     get_gh_issue_data,
     get_instances,
-    is_from_github_url,
     parse_gh_issue_url,
     parse_gh_repo_url,
     read_with_timeout,
@@ -53,6 +53,9 @@
 class EnvironmentArguments(FrozenSerializable):
     """Configure data sources and setup instructions for th environment in which we solve the tasks.
     """
+    # Source of the issue statement/problem statement. To run over a batch of issues: path to a data file
+    # (`json`, `jsonl`) or directory. To run over a single issue: GitHub issue URL or path to a markdown file
+    # with the problem statement.
     data_path: str
     image_name: str
     split: str = "dev"
@@ -62,11 +65,13 @@ class EnvironmentArguments(FrozenSerializable):
     timeout: int = 35
     verbose: bool = False
     no_mirror: bool = False
-    # Custom environment setup. Currently only used when data_path is a GitHub URL.
+    # Custom environment setup. Currently only used when data_path points to a single issue.
     # This needs to be either a string pointing to a yaml file (with yaml, yml file extension)
     # or a shell script (with sh extension).
     # See https://github.com/princeton-nlp/SWE-agent/pull/153 for more information
     environment_setup: Optional[str] = None
+    # Only used when running on a single issue. Path to a local repository or a GitHub repository.
+    repo_path: str = ""
 
 
 class SWEEnv(gym.Env):
@@ -84,7 +89,6 @@ def __init__(self, args: EnvironmentArguments):
         self.logger = logger
         self.persistent = args.container_name is not None
         self.returncode = None
-        self.is_from_github_url = is_from_github_url(args.data_path)
         if not self.args.verbose:
             self.logger.disabled = True
 
@@ -107,7 +111,9 @@ def __init__(self, args: EnvironmentArguments):
 
         # Load Task Instances
         self.data_path = self.args.data_path
-        self.data = get_instances(self.data_path, self.args.base_commit, self.args.split, token=self._github_token)
+        self.data = get_instances(self.data_path, self.args.base_commit, self.args.split, token=self._github_token, repo_path=self.args.repo_path)
+        #: Instance we're currently processing. Gets set in self.reset.
+        self.record = None
         self.logger.info(f"💽 Loaded dataset from {self.data_path}")
 
         # Establish connection with execution container
@@ -119,7 +125,48 @@ def __init__(self, args: EnvironmentArguments):
         self.idx = 0
         self.clean_multi_line_functions = lambda x: x
 
-    def reset(self, index: int = None, apply_test_patch: bool = False) -> Tuple[str, dict]:
+    @property
+    def _repo_name(self) -> str:
+        """Name of the local copy of the repository.
+
+        This is the current record's ``repo`` value with every ``/`` replaced
+        by ``__``. ``self.record`` must already be set.
+        """
+        assert self.record is not None
+        return self.record["repo"].replace("/", "__")
+    
+    def _copy_repo(self) -> str:
+        """Clone/copy the repository/codebase of the current record into the container.
+
+        Depending on the record:
+
+        * ``repo_type == "local"``: the path in ``record["repo"]`` (without its
+          ``local://`` prefix) is copied to ``/<repo name>`` and ownership is
+          changed to ``root:root``.
+        * ``repo_type == "github"`` with mirrors enabled and a ``swe-bench``
+          problem statement source: clone ``swe-bench/<repo name>`` from GitHub.
+        * otherwise: clone ``record["repo"]`` from GitHub into ``<repo name>``.
+
+        ``self.record`` must already be set.
+
+        Returns:
+            folder name of clone
+        """
+        assert self.record is not None  # mypy
+        if self.record["repo_type"] == "local":
+            copy_anything_to_container(self.container_obj, self.record["repo"].removeprefix("local://"), "/"+self._repo_name)
+            # NOTE(review): the copy target is absolute, but this chown uses a relative
+            # path, so it depends on the shell's working directory being `/`
+            # (reset() runs `cd /` before calling this method).
+            self.communicate_with_handling(
+                input=f"chown -R root:root {self._repo_name}",
+                error_msg="Failed to change permissions on copied repository",
+            )
+            return self._repo_name
+        assert self.record["repo_type"] == "github"
+        # NOTE(review): when a token is set it is embedded in the clone URL below.
+        token_prefix = ""
+        if self._github_token:
+            token_prefix = f"{self._github_token}@"
+        # fixme: This if statement is brittle and should probably be replaced with better logic
+        if not self.args.no_mirror and self.record["problem_statement_source"] == "swe-bench":
+            self.logger.info(f"{self._repo_name} not found in container, cloning...")
+            # No target folder is given: the clone lands in a folder named after the
+            # mirror repository, which is the same string as self._repo_name.
+            self.communicate_with_handling(
+                input=f"git clone https://{token_prefix}github.com/swe-bench/{self._repo_name}.git",
+                error_msg="Failed to clone repository from mirror",
+                timeout_duration=LONG_TIMEOUT,
+            )
+            return self._repo_name
+        else:
+            logger.info(f"Trying to clone from non-mirror...")
+            # Explicit target folder so the clone ends up in self._repo_name.
+            self.communicate_with_handling(
+                input=f"git clone https://{token_prefix}github.com/{self.record['repo']}.git {self._repo_name}",
+                error_msg="Failed to clone repository from non-mirror",
+                timeout_duration=LONG_TIMEOUT,
+            )
+            return self._repo_name
+
+    def reset(self, index: Optional[int] = None, apply_test_patch: bool = False) -> Tuple[Optional[str], dict]:
         """
         Function to reset container between each task instance.
         * Clones instance's repository
@@ -151,30 +198,13 @@ def reset(self, index: int = None, apply_test_patch: bool = False) -> Tuple[str,
         # Clone repository if not already cloned
         self.communicate(input="cd /")
         folders = self.communicate(input="ls").split("\n")
-        repo_name = self.record["repo"].replace("/", "__")
-        if repo_name not in folders:
-            token_prefix = ""
-            if self._github_token:
-                token_prefix = f"{self._github_token}@"
-            if not self.args.no_mirror and not self.is_from_github_url:
-                self.logger.info(f"{repo_name} not found in container, cloning...")
-                self.communicate_with_handling(
-                    input=f"git clone https://{token_prefix}github.com/swe-bench/{repo_name}.git",
-                    error_msg="Failed to clone repository from mirror",
-                    timeout_duration=LONG_TIMEOUT,
-                )
-            else:
-                logger.info(f"Trying to clone from non-mirror...")
-                self.communicate_with_handling(
-                    input=f"git clone https://{token_prefix}github.com/{self.record['repo']}.git {repo_name}",
-                    error_msg="Failed to clone repository from non-mirror",
-                    timeout_duration=LONG_TIMEOUT,
-                )
+        if self._repo_name not in folders:
+            self._copy_repo()
 
         # Clean repository of any modifications + Checkout base commit
         for cmd in [
             "echo -n > /root/files_to_edit.txt",
-            f"cd {repo_name}",
+            f"cd {self._repo_name}",
             "export ROOT=$(pwd -P)",
             "git status",
             "git restore .",
@@ -559,14 +589,15 @@ def install_env(self) -> None:
         """
         Creates conda environment and installs third party dependencies to allow code execution
         """
-        if self.is_from_github_url and self.args.environment_setup is None:
+        assert self.record is not None  # mypy
+        if (self.record["problem_statement_source"] != "swe-bench" or \
+            self.record["repo_type"] == "local") and self.args.environment_setup is None:
             logger.warning((
                 "install_environment is set to True, but the data path is a GitHub URL "
                 "without an environment config file (environment_config key/flag). "
                 "Skipping conda environment installation."
                 ))
             return
-        repo_name = self.record["repo"].replace("/", "__")
         if self.args.environment_setup is not None:
             assert isinstance(self.args.environment_setup, (str, os.PathLike))
             if Path(self.args.environment_setup).suffix in [".yml", ".yaml"]:
@@ -592,7 +623,7 @@ def install_env(self) -> None:
                 )
                 raise ValueError(msg) from e
         # Create environment if does not exist yet
-        env_name = f"{repo_name}__{self.record['version']}"
+        env_name = f"{self._repo_name}__{self.record['version']}"
         env_check = self.communicate(
             f"conda env list | grep {env_name}", timeout_duration=LONG_TIMEOUT
         )
@@ -676,7 +707,7 @@ def install_env(self) -> None:
                     pre_install_cmd,
                     error_msg="Pre-install commands failed to execute successfully",
                 )
-        self.logger.info(f"Installing {repo_name} at base commit...")
+        self.logger.info(f"Installing {self._repo_name} at base commit...")
         if "install" in install_configs:
             install_cmd = install_configs["install"]
             self.communicate_with_handling(