From 89d56863237946a844bcb1f081d836d00c11b983 Mon Sep 17 00:00:00 2001 From: John Stanford <787382+jxstanford@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:21:14 -0800 Subject: [PATCH 1/5] feat: add .cgrignore file support for custom exclude patterns Add support for a `.cgrignore` file that allows users to specify additional directories to exclude from parsing. Patterns from this file are merged with `--exclude` CLI flags and auto-detected directories. - Add `load_cgrignore_patterns()` function in config.py - Integrate with `prompt_for_included_directories()` in main.py - Add INTERACTIVE_STATUS_CGRIGNORE constant for UI display - Add tests for .cgrignore loading - Update README with documentation --- README.md | 17 +++++++ codebase_rag/config.py | 26 +++++++++++ codebase_rag/constants.py | 1 + codebase_rag/main.py | 6 ++- codebase_rag/tests/test_cgrignore.py | 70 ++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 codebase_rag/tests/test_cgrignore.py diff --git a/README.md b/README.md index aebbcb4b4..423769ae8 100644 --- a/README.md +++ b/README.md @@ -640,6 +640,23 @@ Configuration is managed through environment variables in `.env` file: - `TARGET_REPO_PATH`: Default repository path (default: `.`) - `LOCAL_MODEL_ENDPOINT`: Fallback endpoint for Ollama (default: `http://localhost:11434/v1`) +### Custom Ignore Patterns + +You can specify additional directories to exclude by creating a `.cgrignore` file in your repository root: + +``` +# Comments start with # +vendor +.custom_cache +my_build_output +``` + +- One directory name per line +- Lines starting with `#` are comments +- Blank lines are ignored +- Patterns are exact directory name matches (not globs) +- Patterns from `.cgrignore` are merged with `--exclude` flags and auto-detected directories + ### Key Dependencies diff --git a/codebase_rag/config.py b/codebase_rag/config.py index 20a8909ae..884d73c1d 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ 
-1,6 +1,7 @@ from __future__ import annotations from dataclasses import asdict, dataclass +from pathlib import Path from typing import Unpack from dotenv import load_dotenv @@ -227,3 +228,28 @@ def resolve_batch_size(self, batch_size: int | None) -> int: settings = AppConfig() + +CGRIGNORE_FILENAME = ".cgrignore" + + +def load_cgrignore_patterns(repo_path: Path) -> frozenset[str]: + from loguru import logger + + ignore_file = repo_path / CGRIGNORE_FILENAME + if not ignore_file.exists(): + return frozenset() + + patterns: set[str] = set() + try: + with ignore_file.open(encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + patterns.add(line) + if patterns: + logger.info(f"Loaded {len(patterns)} patterns from {ignore_file}") + return frozenset(patterns) + except OSError as e: + logger.warning(f"Failed to read {ignore_file}: {e}") + return frozenset() diff --git a/codebase_rag/constants.py b/codebase_rag/constants.py index 64cdca39e..f0d70de9b 100644 --- a/codebase_rag/constants.py +++ b/codebase_rag/constants.py @@ -632,6 +632,7 @@ class DiffMarker: INTERACTIVE_STYLE_DIM = "dim" INTERACTIVE_STATUS_DETECTED = "auto-detected" INTERACTIVE_STATUS_CLI = "--exclude" +INTERACTIVE_STATUS_CGRIGNORE = ".cgrignore" INTERACTIVE_NESTED_SINGULAR = "{count} dir" INTERACTIVE_NESTED_PLURAL = "{count} dirs" INTERACTIVE_INSTRUCTIONS_GROUPED = ( diff --git a/codebase_rag/main.py b/codebase_rag/main.py index 67a5593ef..d91c246c7 100644 --- a/codebase_rag/main.py +++ b/codebase_rag/main.py @@ -787,8 +787,12 @@ def prompt_for_included_directories( repo_path: Path, cli_excludes: list[str] | None = None, ) -> frozenset[str]: + from .config import load_cgrignore_patterns + detected = detect_excludable_directories(repo_path) - pre_excluded = frozenset(cli_excludes) if cli_excludes else frozenset() + cgrignore_patterns = load_cgrignore_patterns(repo_path) + cli_patterns = frozenset(cli_excludes) if cli_excludes else frozenset() + 
pre_excluded = cli_patterns | cgrignore_patterns if not detected and not pre_excluded: return frozenset() diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py new file mode 100644 index 000000000..75c29323c --- /dev/null +++ b/codebase_rag/tests/test_cgrignore.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from codebase_rag.config import CGRIGNORE_FILENAME, load_cgrignore_patterns + + +def test_returns_empty_when_no_file(temp_repo: Path) -> None: + result = load_cgrignore_patterns(temp_repo) + assert result == frozenset() + + +def test_loads_patterns_from_file(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\nmy_build\n") + + result = load_cgrignore_patterns(temp_repo) + + assert "vendor" in result + assert "my_build" in result + assert len(result) == 2 + + +def test_ignores_comments_and_blank_lines(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("# Comment\n\nvendor\n # Indented comment\n") + + result = load_cgrignore_patterns(temp_repo) + + assert result == frozenset({"vendor"}) + + +def test_strips_whitespace(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text(" vendor \n\ttemp\t\n") + + result = load_cgrignore_patterns(temp_repo) + + assert "vendor" in result + assert "temp" in result + + +def test_returns_empty_on_read_error( + temp_repo: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor") + + original_open = Path.open + + def mock_open(self: Path, *args, **kwargs): # noqa: ANN002, ANN003 + if self.name == CGRIGNORE_FILENAME: + raise PermissionError("Cannot read") + return original_open(self, *args, **kwargs) + + monkeypatch.setattr(Path, "open", mock_open) + + result = load_cgrignore_patterns(temp_repo) + assert result == frozenset() + + +def 
test_handles_duplicates(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\nvendor\ntemp\n") + + result = load_cgrignore_patterns(temp_repo) + + assert len(result) == 2 From 2dec557fa2497b02f0fb38d1acac9c9c7b8975a1 Mon Sep 17 00:00:00 2001 From: John Stanford <787382+jxstanford@users.noreply.github.com> Date: Sun, 4 Jan 2026 15:32:21 -0800 Subject: [PATCH 2/5] style: move loguru import to module level and use log constants Addresses PR review feedback: - Move loguru import to module level in config.py - Use log constants from logs.py instead of f-strings --- codebase_rag/config.py | 10 ++++++---- codebase_rag/logs.py | 4 ++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/codebase_rag/config.py b/codebase_rag/config.py index 884d73c1d..c2e486138 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ -5,11 +5,13 @@ from typing import Unpack from dotenv import load_dotenv +from loguru import logger from pydantic import AnyHttpUrl from pydantic_settings import BaseSettings, SettingsConfigDict from . import constants as cs from . import exceptions as ex +from . 
import logs from .types_defs import ModelConfigKwargs load_dotenv() @@ -233,8 +235,6 @@ def resolve_batch_size(self, batch_size: int | None) -> int: def load_cgrignore_patterns(repo_path: Path) -> frozenset[str]: - from loguru import logger - ignore_file = repo_path / CGRIGNORE_FILENAME if not ignore_file.exists(): return frozenset() @@ -248,8 +248,10 @@ def load_cgrignore_patterns(repo_path: Path) -> frozenset[str]: continue patterns.add(line) if patterns: - logger.info(f"Loaded {len(patterns)} patterns from {ignore_file}") + logger.info( + logs.CGRIGNORE_LOADED.format(count=len(patterns), path=ignore_file) + ) return frozenset(patterns) except OSError as e: - logger.warning(f"Failed to read {ignore_file}: {e}") + logger.warning(logs.CGRIGNORE_READ_FAILED.format(path=ignore_file, error=e)) return frozenset() diff --git a/codebase_rag/logs.py b/codebase_rag/logs.py index 378dfe76f..246c1f77d 100644 --- a/codebase_rag/logs.py +++ b/codebase_rag/logs.py @@ -89,6 +89,10 @@ GRAMMAR_LOAD_FAILED = "Failed to load {lang} grammar: {error}" INITIALIZED_PARSERS = "Initialized parsers for: {languages}" +# (H) Ignore pattern logs +CGRIGNORE_LOADED = "Loaded {count} patterns from {path}" +CGRIGNORE_READ_FAILED = "Failed to read {path}: {error}" + # (H) File watcher logs WATCHER_ACTIVE = "File watcher is now active." WATCHER_SKIP_NO_QUERY = "Ingestor does not support querying, skipping real-time update." 
From f9db441d6db0fb68e2b551443f1deb9671f5b0ea Mon Sep 17 00:00:00 2001 From: vitali87 Date: Tue, 6 Jan 2026 15:40:53 +0400 Subject: [PATCH 3/5] test: add integration tests for cgrignore with prompt_for_included_directories --- codebase_rag/tests/test_cgrignore.py | 73 ++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py index 75c29323c..ae410fc70 100644 --- a/codebase_rag/tests/test_cgrignore.py +++ b/codebase_rag/tests/test_cgrignore.py @@ -1,10 +1,12 @@ from __future__ import annotations from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from codebase_rag.config import CGRIGNORE_FILENAME, load_cgrignore_patterns +from codebase_rag.main import prompt_for_included_directories def test_returns_empty_when_no_file(temp_repo: Path) -> None: @@ -68,3 +70,74 @@ def test_handles_duplicates(temp_repo: Path) -> None: result = load_cgrignore_patterns(temp_repo) assert len(result) == 2 + + +class TestCgrignoreIntegration: + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_patterns_included_in_candidates( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / ".git").mkdir() + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\ncustom_cache\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path) + + assert ".git" in result + assert "vendor" in result + assert "custom_cache" in result + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_merged_with_cli_excludes( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("from_cgrignore\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path, cli_excludes=["from_cli"]) + + assert 
"from_cgrignore" in result + assert "from_cli" in result + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_only_returns_without_prompt_when_empty( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + result = prompt_for_included_directories(tmp_path) + + assert result == frozenset() + mock_ask.assert_not_called() + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_alone_triggers_prompt( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("my_custom_dir\n") + mock_ask.return_value = "none" + + prompt_for_included_directories(tmp_path) + + mock_ask.assert_called_once() + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_deduplicates_with_detected( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / ".git").mkdir() + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text(".git\nvendor\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path) + + assert ".git" in result + assert "vendor" in result + assert len([x for x in result if x == ".git"]) == 1 From 6d3bc56c030f20bd3f00919019934542469a0782 Mon Sep 17 00:00:00 2001 From: vitali87 Date: Tue, 6 Jan 2026 22:26:48 +0400 Subject: [PATCH 4/5] fix: use is_file() instead of exists() for cgrignore check, add directory edge case test --- codebase_rag/config.py | 2 +- codebase_rag/tests/test_cgrignore.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/codebase_rag/config.py b/codebase_rag/config.py index c2e486138..32f0ce6a4 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ -236,7 +236,7 @@ def resolve_batch_size(self, batch_size: int | None) -> int: def load_cgrignore_patterns(repo_path: Path) -> 
frozenset[str]: ignore_file = repo_path / CGRIGNORE_FILENAME - if not ignore_file.exists(): + if not ignore_file.is_file(): return frozenset() patterns: set[str] = set() diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py index ae410fc70..293e04b62 100644 --- a/codebase_rag/tests/test_cgrignore.py +++ b/codebase_rag/tests/test_cgrignore.py @@ -72,6 +72,15 @@ def test_handles_duplicates(temp_repo: Path) -> None: assert len(result) == 2 +def test_returns_empty_if_cgrignore_is_a_directory(temp_repo: Path) -> None: + cgrignore_path = temp_repo / CGRIGNORE_FILENAME + cgrignore_path.mkdir() + + result = load_cgrignore_patterns(temp_repo) + + assert result == frozenset() + + class TestCgrignoreIntegration: @patch("codebase_rag.main.Prompt.ask") @patch("codebase_rag.main.app_context") From 7db0b0aeeb690fe8e808d4606a1c141837b56bf7 Mon Sep 17 00:00:00 2001 From: vitali87 Date: Tue, 6 Jan 2026 22:33:39 +0400 Subject: [PATCH 5/5] refactor: move load_cgrignore_patterns import to module level --- codebase_rag/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/codebase_rag/main.py b/codebase_rag/main.py index d91c246c7..5ab097588 100644 --- a/codebase_rag/main.py +++ b/codebase_rag/main.py @@ -27,7 +27,7 @@ from . import constants as cs from . import exceptions as ex from . import logs as ls -from .config import settings +from .config import load_cgrignore_patterns, settings from .models import AppContext from .prompts import OPTIMIZATION_PROMPT, OPTIMIZATION_PROMPT_WITH_REFERENCE from .services import QueryProtocol @@ -787,8 +787,6 @@ def prompt_for_included_directories( repo_path: Path, cli_excludes: list[str] | None = None, ) -> frozenset[str]: - from .config import load_cgrignore_patterns - detected = detect_excludable_directories(repo_path) cgrignore_patterns = load_cgrignore_patterns(repo_path) cli_patterns = frozenset(cli_excludes) if cli_excludes else frozenset()