From 10f789d35fb1278c1874e872d5730bd0a315bdce Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Thu, 26 Jun 2025 21:21:13 +0100 Subject: [PATCH 1/8] Initial version of a git_bdiff module A simple module that wraps around git diff and generates a complete list of files changed on a branch since it diverged from the parent. This is intended to replace the functionality of fcm_bdiff. --- bdiff/git_bdiff.py | 134 +++++++++++++++++++++++++++++ bdiff/tests/test_git_bdiff.py | 154 ++++++++++++++++++++++++++++++++++ 2 files changed, 288 insertions(+) create mode 100644 bdiff/git_bdiff.py create mode 100644 bdiff/tests/test_git_bdiff.py diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py new file mode 100644 index 0000000..1ec843e --- /dev/null +++ b/bdiff/git_bdiff.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# *********************************COPYRIGHT************************************ +# (C) Crown copyright Met Office. All rights reserved. +# For further details please refer to the file COPYRIGHT.txt +# which you should have received as part of this distribution. +# *********************************COPYRIGHT************************************ +""" +Module to obtain a list of all altered files on a git branch from +point where it diverged from the parent branch to the most recent +commit. + +Usage is as follows: + +>>> bdiff = GitBDiff() + +And then: + +>>> for change in bdiff.files(): +... print(change) +""" + +import re +import subprocess + + +class GitBDiffError(Exception): + """Base bdiff error class.""" + + +class GitBDiffNotGit(GitBDiffError): + """Error if the target not part of a git repository.""" + + def __init__(self, cmd): + super().__init__("not a repository (cmd:" + " ".join(cmd) + ")") + + +class GitBDiff: + """Class which generates a branch diff.""" + + # Name of primary branch - default is main + primary_branch = "main" + + # Match hex commit IDs + _hash_pattern = re.compile(r"^([0-9a-f]+)$") + + # Match branch names + _branch_pattern = re.compile(r"^(\S+)$") + + def __init__(self, parent=None): + self.parent = parent or self.primary_branch + self.ancestor = self.get_branch_point() + self.current = self.get_latest_commit() + self.branch = self.get_branch_name() + + def get_branch_point(self): + """Get the branch point from the parent repo. + + Find the commit which marks the point of divergence from the + parent repository. If there are no changes or this is the + trunk, the branch point will be the same as the most recent + commit. + """ + + result = None + for line in self.run_git(["merge-base", self.parent, "HEAD"]): + if m := self._hash_pattern.match(line): + result = m.group(1) + break + else: + raise GitBDiffError("branch point not found") + return result + + def get_latest_commit(self): + """Get the last commit ID on the branch.""" + + result = None + for line in self.run_git(["show", "--pretty=%H", "--no-patch"]): + if m := self._hash_pattern.match(line): + result = m.group(1) + break + else: + raise GitBDiffError("current revision not found") + return result + + def get_branch_name(self): + """Get the name of the current branch.""" + result = None + for line in self.run_git(["branch", "--show-current"]): + if m := self._branch_pattern.match(line): + result = m.group(1) + break + else: + raise GitBDiffError("unable to get branch name") + return result + + @property + def is_branch(self): + """Whether this is a branch or main.""" + return self.branch != self.primary_branch + + @property + def has_diverged(self): + """Whether the branch has diverged from its parent.""" + return self.ancestor != self.current + + def files(self): + """Iterate over files changed on the branch.""" + + for line in self.run_git( + ["diff", "--name-only", "--diff-filter=AMX", self.ancestor] + ): + if line != "": + yield line + + @staticmethod + def run_git(args): + """Run a git command and yield the output.""" + + if isinstance(args, str): + args = args.split() + cmd = ["git"] + args + + proc = subprocess.run(cmd, capture_output=True, check=False) + + for line in proc.stderr.decode("utf-8").split("\n"): + if line.startswith("fatal: not a git repository"): + raise GitBDiffNotGit(cmd) + if line.startswith("fatal: "): + raise GitBDiffError(line[7:].rstrip()) + + if proc.returncode != 0: + raise GitBDiffError(f"command returned {proc.returncode}") + + yield from proc.stdout.decode("utf-8").split("\n") diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py new file mode 100644 index 0000000..f13d8a1 --- /dev/null +++ b/bdiff/tests/test_git_bdiff.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# *********************************COPYRIGHT************************************ +# (C) Crown copyright Met Office. All rights reserved. +# For further details please refer to the file COPYRIGHT.txt +# which you should have received as part of this distribution. +# *********************************COPYRIGHT************************************ +""" +Test suite for git_bdiff module. +""" + +import os +import subprocess +import pytest + +from git_bdiff import GitBDiff, GitBDiffError, GitBDiffNotGit + + +# Disable warnings caused by the use of pytest fixtures +# pylint: disable=redefined-outer-name + + +def add_to_repo(start, end, message): + """Add and commit dummy files to a repo.""" + + for i in range(start, end): + with open(f"file{i}", "wt", encoding="utf-8") as fd: + print(f"Hello {i}", file=fd) + + subprocess.run(["git", "add", "-A"], check=True) + subprocess.run(["git", "commit", "--no-gpg-sign", "-m", message], check=True) + + +@pytest.fixture(scope="session") +def git_repo(tmpdir_factory): + """Create and populate a test git repo.""" + + location = tmpdir_factory.mktemp("data") + os.chdir(location) + + # Create the repo and add some files + subprocess.run(["git", "init"], check=True) + add_to_repo(0, 10, "Testing") + + # Create a branch and add some files + subprocess.run(["git", "checkout", "-b", "mybranch"], check=True) + add_to_repo(20, 30, "Commit to mybranch") + + # Create a branch-of-branch and add more files + subprocess.run(["git", "checkout", "-b", "subbranch"], check=True) + add_to_repo(40, 50, "Commit to subbranch") + + # Create an branch from main without any changes + subprocess.run(["git", "checkout", "main"], check=True) + subprocess.run(["git", "checkout", "-b", "unchanged"], check=True) + + # Switch back to the main branch ready for testing + subprocess.run(["git", "checkout", "main"], check=True) + + return location + + +def test_init(git_repo): + """Test creation of a new GitBDiff instance""" + + os.chdir(git_repo) + bdiff = GitBDiff() + + assert bdiff.branch is not None + assert bdiff.branch == "main" + assert not bdiff.is_branch + assert not bdiff.has_diverged + + +def test_branch_diff(git_repo): + """Test a simple branch diff.""" + + os.chdir(git_repo) + subprocess.run(["git", "checkout", "mybranch"], check=True) + + try: + bdiff = GitBDiff() + changes = list(bdiff.files()) + finally: + subprocess.run(["git", "checkout", "main"], check=True) + + assert bdiff.branch == "mybranch" + assert bdiff.is_branch + assert bdiff.has_diverged + assert len(changes) == 10 + assert changes[0] == "file20" + + +def test_branch_of_branch_diff(git_repo): + """Test a branch of branch diff. + + This effectively tests whether all the commits since the branch + point with main are picked up correctly. + """ + + os.chdir(git_repo) + subprocess.run(["git", "checkout", "subbranch"], check=True) + + try: + bdiff = GitBDiff() + changes = list(bdiff.files()) + finally: + subprocess.run(["git", "checkout", "main"], check=True) + + assert bdiff.branch == "subbranch" + assert bdiff.is_branch + assert bdiff.has_diverged + assert len(changes) == 20 + assert changes[0] == "file20" + assert changes[-1] == "file49" + + +def test_unchanged_branch(git_repo): + """Test a branch with no commits.""" + + os.chdir(git_repo) + subprocess.run(["git", "checkout", "unchanged"], check=True) + + try: + bdiff = GitBDiff() + changes = list(bdiff.files()) + finally: + subprocess.run(["git", "checkout", "main"], check=True) + + assert bdiff.branch == "unchanged" + assert bdiff.is_branch + assert not bdiff.has_diverged + assert not changes + + +def test_non_repo(tmpdir): + """Test exception if working directory is not a git repo.""" + + os.chdir(tmpdir) + + with pytest.raises(GitBDiffNotGit): + GitBDiff() + + +def test_unknown_parent(git_repo): + """Test exception if parent branch does not exist. + + This is a proxy test for the detection of all sorts of git + errors. + """ + + os.chdir(git_repo) + + with pytest.raises(GitBDiffError): + GitBDiff(parent="nosuch") From 56813be948ab4c84a7add9517d48c49745d0525b Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Fri, 27 Jun 2025 07:53:38 +0100 Subject: [PATCH 2/8] Add the ability to specify a repository directory Allow the constructor to specify the path to a repository and update the run_git method to change to the directory before running each git command. --- bdiff/git_bdiff.py | 24 +++++++++++++++++++----- bdiff/tests/test_git_bdiff.py | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py index 1ec843e..0834912 100644 --- a/bdiff/git_bdiff.py +++ b/bdiff/git_bdiff.py @@ -21,6 +21,7 @@ import re import subprocess +from pathlib import Path class GitBDiffError(Exception): @@ -31,7 +32,9 @@ class GitBDiffNotGit(GitBDiffError): """Error if the target not part of a git repository.""" def __init__(self, cmd): - super().__init__("not a repository (cmd:" + " ".join(cmd) + ")") + super().__init__( + "not a repository (cmd:" + " ".join([str(i) for i in cmd]) + ")" + ) class GitBDiff: @@ -46,8 +49,16 @@ class GitBDiff: # Match branch names _branch_pattern = re.compile(r"^(\S+)$") - def __init__(self, parent=None): + def __init__(self, parent=None, repo=None): self.parent = parent or self.primary_branch + + if repo is None: + self._repo = None + else: + self._repo = Path(repo) + if not self._repo.is_dir(): + raise GitBDiffError(f"{repo} is not a directory") + self.ancestor = self.get_branch_point() self.current = self.get_latest_commit() self.branch = self.get_branch_name() @@ -112,15 +123,18 @@ def files(self): if line != "": yield line - @staticmethod - def run_git(args): + def run_git(self, args): """Run a git command and yield the output.""" if isinstance(args, str): args = args.split() cmd = ["git"] + args - proc = subprocess.run(cmd, capture_output=True, check=False) + # Run the the command in the repo directory, capture the + # output, and check for errors. The build in error check is + # not used to allow specific git errors to be treated more + # precisely + proc = subprocess.run(cmd, capture_output=True, check=False, cwd=self._repo) for line in proc.stderr.decode("utf-8").split("\n"): if line.startswith("fatal: not a git repository"): diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py index f13d8a1..7aefc0e 100644 --- a/bdiff/tests/test_git_bdiff.py +++ b/bdiff/tests/test_git_bdiff.py @@ -70,6 +70,28 @@ def test_init(git_repo): assert not bdiff.is_branch assert not bdiff.has_diverged + +def test_repo_selection(git_repo): + """Test selection of repository directory.""" + + os.chdir("/") + bdiff = GitBDiff(repo=git_repo) + + assert bdiff.branch is not None + assert bdiff.branch == "main" + assert not bdiff.is_branch + assert not bdiff.has_diverged + + +def test_invalid_repo_selection(git_repo): + """Test non-existent repo or plain file raises an error""" + + with pytest.raises(GitBDiffError): + GitBDiff(repo="/nosuch") + + with pytest.raises(GitBDiffError): + GitBDiff(repo="/etc/hosts") + def test_branch_diff(git_repo): """Test a simple branch diff.""" From 447dd57c2a27e19f97cbf2e783bc7563538987c1 Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Tue, 15 Jul 2025 16:58:21 +0100 Subject: [PATCH 3/8] Include review changes from @ericaneininger --- bdiff/git_bdiff.py | 6 +++--- bdiff/tests/test_git_bdiff.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py index 0834912..74acd1f 100644 --- a/bdiff/git_bdiff.py +++ b/bdiff/git_bdiff.py @@ -44,10 +44,10 @@ class GitBDiff: primary_branch = "main" # Match hex commit IDs - _hash_pattern = re.compile(r"^([0-9a-f]+)$") + _hash_pattern = re.compile(r"^\s*([0-9a-f]+)\s*$") # Match branch names - _branch_pattern = re.compile(r"^(\S+)$") + _branch_pattern = re.compile(r"^\s*(\S+)\s*$") def __init__(self, parent=None, repo=None): self.parent = parent or self.primary_branch @@ -140,7 +140,7 @@ def run_git(self, args): if line.startswith("fatal: not a git repository"): raise GitBDiffNotGit(cmd) if line.startswith("fatal: "): - raise GitBDiffError(line[7:].rstrip()) + raise GitBDiffError(line[7:]) if proc.returncode != 0: raise GitBDiffError(f"command returned {proc.returncode}") diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py index 7aefc0e..0ed149c 100644 --- a/bdiff/tests/test_git_bdiff.py +++ b/bdiff/tests/test_git_bdiff.py @@ -49,7 +49,7 @@ def git_repo(tmpdir_factory): subprocess.run(["git", "checkout", "-b", "subbranch"], check=True) add_to_repo(40, 50, "Commit to subbranch") - # Create an branch from main without any changes + # Create a branch from main without any changes subprocess.run(["git", "checkout", "main"], check=True) subprocess.run(["git", "checkout", "-b", "unchanged"], check=True) @@ -70,10 +70,10 @@ def test_init(git_repo): assert not bdiff.is_branch assert not bdiff.has_diverged - + def test_repo_selection(git_repo): """Test selection of repository directory.""" - + os.chdir("/") bdiff = GitBDiff(repo=git_repo) @@ -82,7 +82,7 @@ def test_repo_selection(git_repo): assert not bdiff.is_branch assert not bdiff.has_diverged - + def test_invalid_repo_selection(git_repo): """Test non-existent repo or plain file raises an error""" From 4f9791104040644621063416cf8c32289701ddd1 Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Tue, 15 Jul 2025 20:36:29 +0100 Subject: [PATCH 4/8] Include review changes from @r-sharp --- bdiff/git_bdiff.py | 8 +++++--- bdiff/tests/test_git_bdiff.py | 28 +++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py index 74acd1f..c0bf6c7 100644 --- a/bdiff/git_bdiff.py +++ b/bdiff/git_bdiff.py @@ -126,15 +126,17 @@ def files(self): def run_git(self, args): """Run a git command and yield the output.""" - if isinstance(args, str): - args = args.split() + if not isinstance(args, list): + raise TypeError("args must be a list") cmd = ["git"] + args # Run the the command in the repo directory, capture the # output, and check for errors. The build in error check is # not used to allow specific git errors to be treated more # precisely - proc = subprocess.run(cmd, capture_output=True, check=False, cwd=self._repo) + proc = subprocess.run( + cmd, capture_output=True, check=False, shell=False, cwd=self._repo + ) for line in proc.stderr.decode("utf-8").split("\n"): if line.startswith("fatal: not a git repository"): diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py index 0ed149c..17649da 100644 --- a/bdiff/tests/test_git_bdiff.py +++ b/bdiff/tests/test_git_bdiff.py @@ -19,12 +19,12 @@ # pylint: disable=redefined-outer-name -def add_to_repo(start, end, message): +def add_to_repo(start, end, message, mode="wt"): """Add and commit dummy files to a repo.""" for i in range(start, end): - with open(f"file{i}", "wt", encoding="utf-8") as fd: - print(f"Hello {i}", file=fd) + with open(f"file{i}", mode, encoding="utf-8") as fd: + print(f"Lorem ipsum dolor sit amet {i}", file=fd) subprocess.run(["git", "add", "-A"], check=True) subprocess.run(["git", "commit", "--no-gpg-sign", "-m", message], check=True) @@ -53,6 +53,11 @@ def git_repo(tmpdir_factory): subprocess.run(["git", "checkout", "main"], check=True) subprocess.run(["git", "checkout", "-b", "unchanged"], check=True) + # Create a branch from main and overwrite some things + subprocess.run(["git", "checkout", "main"], check=True) + subprocess.run(["git", "checkout", "-b", "overwrite"], check=True) + add_to_repo(0, 10, "Overwriting", "at") + # Switch back to the main branch ready for testing subprocess.run(["git", "checkout", "main"], check=True) @@ -136,6 +141,23 @@ def test_branch_of_branch_diff(git_repo): assert changes[-1] == "file49" +def test_overwritten_branch(git_repo): + """Test a diff of a branch with changed files.""" + + os.chdir(git_repo) + subprocess.run(["git", "checkout", "overwrite"], check=True) + try: + bdiff = GitBDiff() + changes = list(bdiff.files()) + finally: + subprocess.run(["git", "checkout", "main"], check=True) + + assert bdiff.branch == "overwrite" + assert bdiff.is_branch + assert bdiff.has_diverged + assert len(changes) == 10 + + def test_unchanged_branch(git_repo): """Test a branch with no commits.""" From 35f43115ecfa8080c9520608673bd44b7dbbd320 Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Tue, 15 Jul 2025 20:56:18 +0100 Subject: [PATCH 5/8] Improve tests of error handling Add string comparisons to existing exception tests and add a couple of exception handling tests for the git_run method to improve testing coverage. --- bdiff/tests/test_git_bdiff.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py index 17649da..8c73aea 100644 --- a/bdiff/tests/test_git_bdiff.py +++ b/bdiff/tests/test_git_bdiff.py @@ -181,11 +181,12 @@ def test_non_repo(tmpdir): os.chdir(tmpdir) - with pytest.raises(GitBDiffNotGit): + with pytest.raises(GitBDiffNotGit) as exc: GitBDiff() + assert "not a repository" in str(exc.value) -def test_unknown_parent(git_repo): +def test_nonexistent_parent(git_repo): """Test exception if parent branch does not exist. This is a proxy test for the detection of all sorts of git @@ -194,5 +195,24 @@ def test_unknown_parent(git_repo): os.chdir(git_repo) - with pytest.raises(GitBDiffError): + with pytest.raises(GitBDiffError) as exc: GitBDiff(parent="nosuch") + assert "Not a valid object name nosuch" in str(exc.value) + + +def test_git_run(git_repo): + """Test git interface and error handling.""" + + bdiff = GitBDiff() + + with pytest.raises(TypeError) as exc: + # Use a string in place of a list + for i in bdiff.run_git("commit -m ''"): + pass + assert "args must be a list" in str(exc.value) + + with pytest.raises(GitBDiffError) as exc: + # Run a command that should return non-zero + for i in bdiff.run_git(["commit", "-m", "''"]): + pass + assert "command returned 1" in str(exc.value) From 538323a94bfe9df4694d63b9253eebfd4254f0b0 Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Wed, 16 Jul 2025 16:15:33 +0100 Subject: [PATCH 6/8] Improve hash pattern match Reviewer has recommended changing the pattern match to require a full 40 character hex string. --- bdiff/git_bdiff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py index c0bf6c7..dc0897a 100644 --- a/bdiff/git_bdiff.py +++ b/bdiff/git_bdiff.py @@ -44,7 +44,7 @@ class GitBDiff: primary_branch = "main" # Match hex commit IDs - _hash_pattern = re.compile(r"^\s*([0-9a-f]+)\s*$") + _hash_pattern = re.compile(r"^\s*([0-9a-f]{40})\s*$") # Match branch names _branch_pattern = re.compile(r"^\s*(\S+)\s*$") From 2ee2454d94962c1e79a80720d2b8803690a84b0f Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Wed, 16 Jul 2025 16:16:47 +0100 Subject: [PATCH 7/8] Minor change to remove some dead test code --- bdiff/tests/test_git_bdiff.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bdiff/tests/test_git_bdiff.py b/bdiff/tests/test_git_bdiff.py index 8c73aea..6170a38 100644 --- a/bdiff/tests/test_git_bdiff.py +++ b/bdiff/tests/test_git_bdiff.py @@ -207,12 +207,10 @@ def test_git_run(git_repo): with pytest.raises(TypeError) as exc: # Use a string in place of a list - for i in bdiff.run_git("commit -m ''"): - pass + list(i for i in bdiff.run_git("commit -m ''")) assert "args must be a list" in str(exc.value) with pytest.raises(GitBDiffError) as exc: # Run a command that should return non-zero - for i in bdiff.run_git(["commit", "-m", "''"]): - pass + list(i for i in bdiff.run_git(["commit", "-m", "''"])) assert "command returned 1" in str(exc.value) From 521adf712c0586e72f88da62655a2dc519c90c87 Mon Sep 17 00:00:00 2001 From: Sam Clarke-Green Date: Thu, 17 Jul 2025 09:00:49 +0100 Subject: [PATCH 8/8] Update branch pattern somewhat This improves the pattern used to match the branch to make it closer to the official git definition. The pattern does not need to include all the defined by git check-ref-format because the branch name must already be compliant with these in order to be created in the first place. --- bdiff/git_bdiff.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bdiff/git_bdiff.py b/bdiff/git_bdiff.py index dc0897a..77f4cf0 100644 --- a/bdiff/git_bdiff.py +++ b/bdiff/git_bdiff.py @@ -46,8 +46,11 @@ class GitBDiff: # Match hex commit IDs _hash_pattern = re.compile(r"^\s*([0-9a-f]{40})\s*$") - # Match branch names - _branch_pattern = re.compile(r"^\s*(\S+)\s*$") + # Match branch names. This should catch all valid names but may + # also some invalid names through. This should matter given that + # it is being used to match git command output. For a complete + # overview of the naming scheme, see man git check-ref-format + _branch_pattern = re.compile(r"^\s*([^\s~\^\:\?\*\[]+[^.])\s*$") def __init__(self, parent=None, repo=None): self.parent = parent or self.primary_branch