diff --git a/.vscode/launch.json b/.vscode/launch.json index 4382cbb8..05e5d984 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,7 +5,7 @@ "type": "debugpy", "request": "launch", "module": "server", - "args": [], + "args": ["--reload"], "cwd": "${workspaceFolder}/src" } ] diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 01a2c8f3..da88c053 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -7,7 +7,8 @@ from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node -from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemNode, FileSystemStats, FileSystemSymlink +from gitingest.utils.compat_func import readlink from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -70,9 +71,8 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: relative_path = path.relative_to(query.local_path) - file_node = FileSystemNode( + file_node = FileSystemFile( name=path.name, - type=FileSystemNodeType.FILE, size=path.stat().st_size, file_count=1, path_str=str(relative_path), @@ -95,9 +95,8 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: logger.info("Processing directory", extra={"directory_path": str(path)}) - root_node = FileSystemNode( + root_node = FileSystemDirectory( name=path.name, - type=FileSystemNodeType.DIRECTORY, path_str=str(path.relative_to(query.local_path)), path=path, ) @@ -161,9 +160,8 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem continue _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): - child_directory_node = FileSystemNode( + child_directory_node = FileSystemDirectory( name=sub_path.name, - type=FileSystemNodeType.DIRECTORY, path_str=str(sub_path.relative_to(query.local_path)), path=sub_path, depth=node.depth + 1, @@ -201,11 +199,11 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS The base path of the repository or directory being processed. """ - child = FileSystemNode( + child = FileSystemSymlink( name=path.name, - type=FileSystemNodeType.SYMLINK, path_str=str(path.relative_to(local_path)), path=path, + target=str(readlink(path)), depth=parent_node.depth + 1, ) stats.total_files += 1 @@ -258,9 +256,8 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat stats.total_files += 1 stats.total_size += file_size - child = FileSystemNode( + child = FileSystemFile( name=path.name, - type=FileSystemNodeType.FILE, size=file_size, file_count=1, path_str=str(path.relative_to(local_path)), diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 5c2b59ae..05fa182a 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -8,8 +8,7 @@ import requests.exceptions import tiktoken -from gitingest.schemas import FileSystemNode, FileSystemNodeType -from gitingest.utils.compat_func import readlink +from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemNode, FileSystemSymlink from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: @@ -42,18 +41,19 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, A tuple containing the summary, directory structure, and file contents. """ - is_single_file = node.type == FileSystemNodeType.FILE - summary = _create_summary_prefix(query, single_file=is_single_file) - - if node.type == FileSystemNodeType.DIRECTORY: + # Use polymorphic properties - much cleaner! + summary = _create_summary_prefix(query, single_file=node.is_single_file) + + # Add type-specific summary info + if isinstance(node, FileSystemDirectory): summary += f"Files analyzed: {node.file_count}\n" - elif node.type == FileSystemNodeType.FILE: - summary += f"File: {node.name}\n" - summary += f"Lines: {len(node.content.splitlines()):,}\n" + elif isinstance(node, FileSystemFile): + summary += f"File: {node.name or ''}\nLines: {len(node.content.splitlines()):,}\n" tree = "Directory structure:\n" + _create_tree_structure(query, node=node) - - content = _gather_file_contents(node) + + # Use polymorphic content gathering + content = node.gather_contents() token_estimate = _format_token_count(tree + content) if token_estimate: @@ -102,30 +102,6 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) return "\n".join(parts) + "\n" -def _gather_file_contents(node: FileSystemNode) -> str: - """Recursively gather contents of all files under the given node. - - This function recursively processes a directory node and gathers the contents of all files - under that node. It returns the concatenated content of all files as a single string. - - Parameters - ---------- - node : FileSystemNode - The current directory or file node being processed. - - Returns - ------- - str - The concatenated content of all files under the given node. - - """ - if node.type != FileSystemNodeType.DIRECTORY: - return node.content_string - - # Recursively gather contents of all files under the current directory - return "\n".join(_gather_file_contents(child) for child in node.children) - - def _create_tree_structure( query: IngestionQuery, *, @@ -162,16 +138,10 @@ def _create_tree_structure( tree_str = "" current_prefix = "└── " if is_last else "├── " - # Indicate directories with a trailing slash - display_name = node.name - if node.type == FileSystemNodeType.DIRECTORY: - display_name += "/" - elif node.type == FileSystemNodeType.SYMLINK: - display_name += " -> " + readlink(node.path).name - - tree_str += f"{prefix}{current_prefix}{display_name}\n" + # Use polymorphic display name - handles files, dirs, symlinks automatically! + tree_str += f"{prefix}{current_prefix}{node.display_name}\n" - if node.type == FileSystemNodeType.DIRECTORY and node.children: + if node.children: prefix += " " if is_last else "│ " for i, child in enumerate(node.children): tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index db5cb12f..1a6ae22b 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,25 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas.filesystem import ( + FileSystemDirectory, + FileSystemFile, + FileSystemNode, + FileSystemStats, + FileSystemSymlink, + GitRepository, +) from gitingest.schemas.ingestion import IngestionQuery +from gitingest.schemas.source import Source -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemNodeType", "FileSystemStats", "IngestionQuery"] +__all__ = [ + "CloneConfig", + "FileSystemDirectory", + "FileSystemFile", + "FileSystemNode", + "FileSystemStats", + "FileSystemSymlink", + "GitRepository", + "IngestionQuery", + "Source", +] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index cc66e7b1..30a18bd0 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -3,13 +3,11 @@ from __future__ import annotations import os +from abc import ABC, abstractmethod from dataclasses import dataclass, field -from enum import Enum, auto from typing import TYPE_CHECKING -from gitingest.utils.compat_func import readlink -from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk -from gitingest.utils.notebook import process_notebook +from gitingest.schemas.source import Source if TYPE_CHECKING: from pathlib import Path @@ -17,14 +15,6 @@ SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 -class FileSystemNodeType(Enum): - """Enum representing the type of a file system node (directory or file).""" - - DIRECTORY = auto() - FILE = auto() - SYMLINK = auto() - - @dataclass class FileSystemStats: """Class for tracking statistics during file system traversal.""" @@ -34,123 +24,108 @@ class FileSystemStats: @dataclass -class FileSystemNode: # pylint: disable=too-many-instance-attributes - """Class representing a node in the file system (either a file or directory). - - Tracks properties of files/directories for comprehensive analysis. - """ - - name: str - type: FileSystemNodeType - path_str: str - path: Path +class FileSystemNode(Source, ABC): # pylint: disable=too-many-instance-attributes + """Abstract base class for filesystem nodes (files, directories, symlinks).""" + + # Required fields - use None defaults and validate in __post_init__ + name: str | None = None + path_str: str | None = None + path: "Path | None" = None + + # Optional fields with sensible defaults size: int = 0 file_count: int = 0 dir_count: int = 0 depth: int = 0 children: list[FileSystemNode] = field(default_factory=list) - def sort_children(self) -> None: - """Sort the children nodes of a directory according to a specific order. - - Order of sorting: - 2. Regular files (not starting with dot) - 3. Hidden files (starting with dot) - 4. Regular directories (not starting with dot) - 5. Hidden directories (starting with dot) - - All groups are sorted alphanumerically within themselves. - - Raises - ------ - ValueError - If the node is not a directory. - - """ - if self.type != FileSystemNodeType.DIRECTORY: - msg = "Cannot sort children of a non-directory node" - raise ValueError(msg) + def __post_init__(self) -> None: + """Validate required fields after initialization.""" + if self.name is None: + raise ValueError("FileSystemNode requires 'name' field") + if self.path_str is None: + raise ValueError("FileSystemNode requires 'path_str' field") + if self.path is None: + raise ValueError("FileSystemNode requires 'path' field") + # Abstract methods - must be implemented by subclasses + @property + @abstractmethod + def display_name(self) -> str: + """Display name for tree view (e.g., file.py, dir/, symlink -> target).""" + + @property + @abstractmethod + def node_type(self) -> str: + """Type name for content string header (FILE, DIRECTORY, SYMLINK).""" + + @property + @abstractmethod + def is_single_file(self) -> bool: + """True if this node represents a single file.""" + + @abstractmethod + def gather_contents(self) -> str: + """Gather all file contents under this node.""" + + # Concrete methods with default implementations + def sort_children(self) -> None: + """Sort children: README first, then files, then dirs, hidden last.""" def _sort_key(child: FileSystemNode) -> tuple[int, str]: - # returns the priority order for the sort function, 0 is first - # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir - name = child.name.lower() - if child.type == FileSystemNodeType.FILE: - if name == "readme" or name.startswith("readme."): - return (0, name) + name = (child.name or "").lower() + + # README files get highest priority + if name == "readme" or name.startswith("readme."): + return (0, name) + + # Then sort by type and visibility + if isinstance(child, FileSystemFile): return (1 if not name.startswith(".") else 2, name) - return (3 if not name.startswith(".") else 4, name) + else: # Directories, symlinks + return (3 if not name.startswith(".") else 4, name) self.children.sort(key=_sort_key) @property def content_string(self) -> str: - """Return the content of the node as a string, including path and content. - - Returns - ------- - str - A string representation of the node's content. - - """ + """Content with header for output format.""" parts = [ SEPARATOR, - f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" - + (f" -> {readlink(self.path).name}" if self.type == FileSystemNodeType.SYMLINK else ""), + f"{self.node_type}: {str(self.path_str or '').replace(os.sep, '/')}", SEPARATOR, f"{self.content}", ] - return "\n".join(parts) + "\n\n" - @property - def content(self) -> str: # pylint: disable=too-many-return-statements - """Return file content (if text / notebook) or an explanatory placeholder. - - Heuristically decides whether the file is text or binary by decoding a small chunk of the file - with multiple encodings and checking for common binary markers. - - Returns - ------- - str - The content of the file, or an error message if the file could not be read. - - Raises - ------ - ValueError - If the node is a directory. + def get_content(self) -> str: + """Default content reading with encoding detection.""" + from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk + from gitingest.utils.notebook import process_notebook - """ - if self.type == FileSystemNodeType.DIRECTORY: - msg = "Cannot read content of a directory node" - raise ValueError(msg) + if not self.path: + return "Error: No path specified" - if self.type == FileSystemNodeType.SYMLINK: - return "" # TODO: are we including the empty content of symlinks? - - if self.path.suffix == ".ipynb": # Notebook + # Handle notebooks specially + if self.path.suffix == ".ipynb": try: return process_notebook(self.path) except Exception as exc: return f"Error processing notebook: {exc}" + # Read chunk and detect encoding chunk = _read_chunk(self.path) - if chunk is None: return "Error reading file" - if chunk == b"": return "[Empty file]" - if not _decodes(chunk, "utf-8"): return "[Binary file]" - # Find the first encoding that decodes the sample - good_enc: str | None = next( + # Find working encoding + good_enc = next( (enc for enc in _get_preferred_encodings() if _decodes(chunk, encoding=enc)), None, ) - if good_enc is None: return "Error: Unable to decode file with available encodings" @@ -159,3 +134,109 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return fp.read() except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" + + @property + def content(self) -> str: + """Backward compatibility property.""" + return self.get_content() + + +@dataclass +class FileSystemFile(FileSystemNode): + """Represents a file in the filesystem.""" + + @property + def display_name(self) -> str: + """Files show just their name.""" + return self.name or "" + + @property + def node_type(self) -> str: + """File type identifier.""" + return "FILE" + + @property + def is_single_file(self) -> bool: + """Files are single files.""" + return True + + def gather_contents(self) -> str: + """Files return their content string.""" + return self.content_string + + +@dataclass +class FileSystemDirectory(FileSystemNode): + """Represents a directory in the filesystem.""" + + file_count_total: int = 0 + + @property + def display_name(self) -> str: + """Directories get trailing slash.""" + return (self.name or "") + "/" + + @property + def node_type(self) -> str: + """Directory type identifier.""" + return "DIRECTORY" + + @property + def is_single_file(self) -> bool: + """Directories are not single files.""" + return False + + def gather_contents(self) -> str: + """Recursively gather all child contents.""" + return "\n".join(child.gather_contents() for child in self.children) + + def get_content(self) -> str: + """Directories cannot have content.""" + raise ValueError("Cannot read content of a directory node") + + +@dataclass +class FileSystemSymlink(FileSystemNode): + """Represents a symbolic link in the filesystem.""" + + target: str = "" + + @property + def display_name(self) -> str: + """Symlinks show target.""" + return f"{self.name or ''} -> {self.target}" + + @property + def node_type(self) -> str: + """Symlink type identifier.""" + return "SYMLINK" + + @property + def is_single_file(self) -> bool: + """Symlinks are not single files.""" + return False + + def gather_contents(self) -> str: + """Symlinks return their content string.""" + return self.content_string + + def get_content(self) -> str: + """Symlinks content is their target.""" + return self.target + + +@dataclass +class GitRepository(FileSystemDirectory): + """A directory that contains a .git folder, representing a Git repository.""" + + git_info: dict = field(default_factory=dict) + + @property + def display_name(self) -> str: + """Git repos show as special directories.""" + return f"{self.name or ''}/ (git repository)" + + @property + def node_type(self) -> str: + """Git repository type identifier.""" + return "GIT_REPOSITORY" diff --git a/src/gitingest/schemas/source.py b/src/gitingest/schemas/source.py new file mode 100644 index 00000000..22823bc1 --- /dev/null +++ b/src/gitingest/schemas/source.py @@ -0,0 +1,13 @@ +"""Abstract base class for all source objects.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class Source: + """Base class for all sources (files, directories, etc).""" + + metadata: dict = field(default_factory=dict) + extra: dict = field(default_factory=dict)