diff --git a/README.md b/README.md index aebbcb4b4..423769ae8 100644 --- a/README.md +++ b/README.md @@ -640,6 +640,23 @@ Configuration is managed through environment variables in `.env` file: - `TARGET_REPO_PATH`: Default repository path (default: `.`) - `LOCAL_MODEL_ENDPOINT`: Fallback endpoint for Ollama (default: `http://localhost:11434/v1`) +### Custom Ignore Patterns + +You can specify additional directories to exclude by creating a `.cgrignore` file in your repository root: + +``` +# Comments start with # +vendor +.custom_cache +my_build_output +``` + +- One directory name per line +- Lines starting with `#` are comments +- Blank lines are ignored +- Patterns are exact directory name matches (not globs) +- Patterns from `.cgrignore` are merged with `--exclude` flags and auto-detected directories + ### Key Dependencies diff --git a/codebase_rag/config.py b/codebase_rag/config.py index 20a8909ae..32f0ce6a4 100644 --- a/codebase_rag/config.py +++ b/codebase_rag/config.py @@ -1,14 +1,17 @@ from __future__ import annotations from dataclasses import asdict, dataclass +from pathlib import Path from typing import Unpack from dotenv import load_dotenv +from loguru import logger from pydantic import AnyHttpUrl from pydantic_settings import BaseSettings, SettingsConfigDict from . import constants as cs from . import exceptions as ex +from . 
CGRIGNORE_FILENAME = ".cgrignore"


def load_cgrignore_patterns(repo_path: Path) -> frozenset[str]:
    """Load extra directory-exclusion patterns from ``<repo_path>/.cgrignore``.

    Each line is stripped of surrounding whitespace; blank lines and lines
    whose first non-blank character is ``#`` are skipped. Every remaining
    line is kept verbatim as one pattern, and duplicates collapse.

    Args:
        repo_path: Directory expected to contain the ``.cgrignore`` file.

    Returns:
        The set of patterns found. Empty when the file is missing, is not a
        regular file, contains no patterns, or cannot be read or decoded.
        This function never raises for a bad ignore file; it logs a warning
        and returns an empty set instead.
    """
    ignore_file = repo_path / CGRIGNORE_FILENAME
    if not ignore_file.is_file():
        return frozenset()

    try:
        # "utf-8-sig" transparently drops a leading BOM (written by some
        # Windows editors). With plain "utf-8" the BOM would stick to the
        # first line and turn a comment or a directory name into a pattern
        # that can never match.
        with ignore_file.open(encoding="utf-8-sig") as f:
            stripped_lines = (raw_line.strip() for raw_line in f)
            patterns = frozenset(
                entry
                for entry in stripped_lines
                if entry and not entry.startswith("#")
            )
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed explicitly: a non-UTF-8 ignore file must degrade to
        # "no extra patterns" rather than abort the caller.
        logger.warning(logs.CGRIGNORE_READ_FAILED.format(path=ignore_file, error=e))
        return frozenset()

    if patterns:
        logger.info(logs.CGRIGNORE_LOADED.format(count=len(patterns), path=ignore_file))
    return patterns
diff --git a/codebase_rag/main.py b/codebase_rag/main.py index 67a5593ef..5ab097588 100644 --- a/codebase_rag/main.py +++ b/codebase_rag/main.py @@ -27,7 +27,7 @@ from . import constants as cs from . import exceptions as ex from . import logs as ls -from .config import settings +from .config import load_cgrignore_patterns, settings from .models import AppContext from .prompts import OPTIMIZATION_PROMPT, OPTIMIZATION_PROMPT_WITH_REFERENCE from .services import QueryProtocol @@ -788,7 +788,9 @@ def prompt_for_included_directories( cli_excludes: list[str] | None = None, ) -> frozenset[str]: detected = detect_excludable_directories(repo_path) - pre_excluded = frozenset(cli_excludes) if cli_excludes else frozenset() + cgrignore_patterns = load_cgrignore_patterns(repo_path) + cli_patterns = frozenset(cli_excludes) if cli_excludes else frozenset() + pre_excluded = cli_patterns | cgrignore_patterns if not detected and not pre_excluded: return frozenset() diff --git a/codebase_rag/tests/test_cgrignore.py b/codebase_rag/tests/test_cgrignore.py new file mode 100644 index 000000000..293e04b62 --- /dev/null +++ b/codebase_rag/tests/test_cgrignore.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from codebase_rag.config import CGRIGNORE_FILENAME, load_cgrignore_patterns +from codebase_rag.main import prompt_for_included_directories + + +def test_returns_empty_when_no_file(temp_repo: Path) -> None: + result = load_cgrignore_patterns(temp_repo) + assert result == frozenset() + + +def test_loads_patterns_from_file(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\nmy_build\n") + + result = load_cgrignore_patterns(temp_repo) + + assert "vendor" in result + assert "my_build" in result + assert len(result) == 2 + + +def test_ignores_comments_and_blank_lines(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + 
cgrignore.write_text("# Comment\n\nvendor\n # Indented comment\n") + + result = load_cgrignore_patterns(temp_repo) + + assert result == frozenset({"vendor"}) + + +def test_strips_whitespace(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text(" vendor \n\ttemp\t\n") + + result = load_cgrignore_patterns(temp_repo) + + assert "vendor" in result + assert "temp" in result + + +def test_returns_empty_on_read_error( + temp_repo: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor") + + original_open = Path.open + + def mock_open(self: Path, *args, **kwargs): # noqa: ANN002, ANN003 + if self.name == CGRIGNORE_FILENAME: + raise PermissionError("Cannot read") + return original_open(self, *args, **kwargs) + + monkeypatch.setattr(Path, "open", mock_open) + + result = load_cgrignore_patterns(temp_repo) + assert result == frozenset() + + +def test_handles_duplicates(temp_repo: Path) -> None: + cgrignore = temp_repo / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\nvendor\ntemp\n") + + result = load_cgrignore_patterns(temp_repo) + + assert len(result) == 2 + + +def test_returns_empty_if_cgrignore_is_a_directory(temp_repo: Path) -> None: + cgrignore_path = temp_repo / CGRIGNORE_FILENAME + cgrignore_path.mkdir() + + result = load_cgrignore_patterns(temp_repo) + + assert result == frozenset() + + +class TestCgrignoreIntegration: + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_patterns_included_in_candidates( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / ".git").mkdir() + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("vendor\ncustom_cache\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path) + + assert ".git" in result + assert "vendor" in result + assert "custom_cache" in result + + 
@patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_merged_with_cli_excludes( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("from_cgrignore\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path, cli_excludes=["from_cli"]) + + assert "from_cgrignore" in result + assert "from_cli" in result + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_only_returns_without_prompt_when_empty( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + result = prompt_for_included_directories(tmp_path) + + assert result == frozenset() + mock_ask.assert_not_called() + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_alone_triggers_prompt( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text("my_custom_dir\n") + mock_ask.return_value = "none" + + prompt_for_included_directories(tmp_path) + + mock_ask.assert_called_once() + + @patch("codebase_rag.main.Prompt.ask") + @patch("codebase_rag.main.app_context") + def test_cgrignore_deduplicates_with_detected( + self, mock_context: MagicMock, mock_ask: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / ".git").mkdir() + cgrignore = tmp_path / CGRIGNORE_FILENAME + cgrignore.write_text(".git\nvendor\n") + mock_ask.return_value = "all" + + result = prompt_for_included_directories(tmp_path) + + assert ".git" in result + assert "vendor" in result + assert len([x for x in result if x == ".git"]) == 1