
perf: memory optimizations #514

Draft · wants to merge 4 commits into main
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -122,6 +122,7 @@ repos:
loguru>=0.7.0,
pathspec>=0.12.1,
prometheus-client,
psutil>=5.9.0,
pydantic,
pytest-asyncio,
pytest-mock,
@@ -150,6 +151,7 @@ repos:
loguru>=0.7.0,
pathspec>=0.12.1,
prometheus-client,
psutil>=5.9.0,
pydantic,
pytest-asyncio,
pytest-mock,
1 change: 1 addition & 0 deletions pyproject.toml
@@ -48,6 +48,7 @@ server = [
"boto3>=1.28.0", # AWS SDK for S3 support
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
"prometheus-client",
"psutil>=5.9.0", # Memory monitoring for optimization
"sentry-sdk[fastapi]",
"slowapi",
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,6 +5,7 @@ httpx
loguru>=0.7.0
pathspec>=0.12.1
prometheus-client
psutil>=5.9.0 # Memory monitoring for optimization
pydantic
python-dotenv
sentry-sdk[fastapi]
6 changes: 6 additions & 0 deletions src/gitingest/config.py
@@ -9,6 +9,12 @@
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB)
DEFAULT_TIMEOUT = 60 # seconds

# Memory optimization settings
BATCH_SIZE = 100 # Process files in batches to reduce memory usage
MEMORY_CHECK_INTERVAL = 25 # Check memory usage every N files (more frequent)
AGGRESSIVE_GC_INTERVAL = 10 # Force garbage collection every N files for large repos
MEMORY_PRESSURE_THRESHOLD_MB = 2000 # Trigger aggressive cleanup at 2GB usage

OUTPUT_FILE_NAME = "digest.txt"

TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest"
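
The memory helpers imported in ingestion.py below (check_memory_pressure, force_garbage_collection, log_memory_stats) live in a new src/gitingest/utils/memory_utils.py that is not included in this diff. A minimal sketch of how check_memory_pressure could consume MEMORY_PRESSURE_THRESHOLD_MB via psutil; the exact signature and module contents are assumptions:

# Sketch of src/gitingest/utils/memory_utils.py (not part of this diff; details assumed)
import psutil

from gitingest.config import MEMORY_PRESSURE_THRESHOLD_MB


def check_memory_pressure() -> bool:
    """Return True when the current process RSS exceeds MEMORY_PRESSURE_THRESHOLD_MB."""
    rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
    return rss_mb > MEMORY_PRESSURE_THRESHOLD_MB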
30 changes: 29 additions & 1 deletion src/gitingest/ingestion.py
@@ -5,11 +5,18 @@
from pathlib import Path
from typing import TYPE_CHECKING

from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
from gitingest.config import (
AGGRESSIVE_GC_INTERVAL,
MAX_DIRECTORY_DEPTH,
MAX_FILES,
MAX_TOTAL_SIZE_BYTES,
MEMORY_CHECK_INTERVAL,
)
from gitingest.output_formatter import format_node
from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
from gitingest.utils.logging_config import get_logger
from gitingest.utils.memory_utils import check_memory_pressure, force_garbage_collection, log_memory_stats

if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery
@@ -51,6 +58,9 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
},
)

# Log initial memory usage
log_memory_stats("at ingestion start")

subpath = Path(query.subpath.strip("/")).as_posix()
path = query.local_path / subpath

@@ -117,6 +127,9 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
},
)

# Log final memory usage
log_memory_stats("at ingestion completion")

return format_node(root_node, query=query)


@@ -258,6 +271,21 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
stats.total_files += 1
stats.total_size += file_size

# More aggressive memory management for large repositories
if stats.total_files % AGGRESSIVE_GC_INTERVAL == 0:
force_garbage_collection()

# Check memory usage periodically and force more aggressive GC if needed
if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure():
logger.warning(
"Memory pressure detected, forcing aggressive garbage collection",
extra={"files_processed": stats.total_files},
)
# Multiple GC cycles for better cleanup
force_garbage_collection()
force_garbage_collection()
log_memory_stats(f"after aggressive cleanup at {stats.total_files} files")

child = FileSystemNode(
name=path.name,
type=FileSystemNodeType.FILE,
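
force_garbage_collection and log_memory_stats, used in the hunks above, come from the same memory_utils module and are likewise not shown in this diff. A plausible sketch, assuming a plain gc.collect() wrapper and an RSS log line; the log fields mirror the extra={...} style used elsewhere in this PR:

# Sketch of the remaining memory_utils helpers (not part of this diff; details assumed)
import gc

import psutil

from gitingest.utils.logging_config import get_logger

logger = get_logger(__name__)


def force_garbage_collection() -> None:
    """Run a full collection so content caches dropped above are actually released."""
    gc.collect()


def log_memory_stats(context: str) -> None:
    """Log the current process RSS together with a short context description."""
    rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
    logger.info("Memory usage " + context, extra={"rss_mb": round(rss_mb, 1)})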
112 changes: 66 additions & 46 deletions src/gitingest/output_formatter.py
@@ -2,27 +2,21 @@

from __future__ import annotations

import ssl
from io import StringIO
from typing import TYPE_CHECKING

import requests.exceptions
import tiktoken

from gitingest.schemas import FileSystemNode, FileSystemNodeType
from gitingest.utils.compat_func import readlink
from gitingest.utils.logging_config import get_logger
from gitingest.utils.memory_utils import force_garbage_collection, log_memory_stats
from gitingest.utils.token_utils import clear_encoding_cache, count_tokens_optimized, format_token_count

if TYPE_CHECKING:
from gitingest.schemas import IngestionQuery

# Initialize logger for this module
logger = get_logger(__name__)

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
(1_000_000, "M"),
(1_000, "k"),
]


def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]:
"""Generate a summary, directory structure, and file contents for a given file system node.
@@ -51,13 +45,33 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
summary += f"File: {node.name}\n"
summary += f"Lines: {len(node.content.splitlines()):,}\n"

# Log memory before tree generation
log_memory_stats("before tree structure generation")

tree = "Directory structure:\n" + _create_tree_structure(query, node=node)

# Log memory before content gathering (this is the memory-intensive part)
log_memory_stats("before content gathering")

content = _gather_file_contents(node)

token_estimate = _format_token_count(tree + content)
if token_estimate:
summary += f"\nEstimated tokens: {token_estimate}"
# Force garbage collection after content gathering
force_garbage_collection()
log_memory_stats("after content gathering and cleanup")

# Count tokens with optimization
token_count = count_tokens_optimized(tree + content)
if token_count > 0:
summary += f"\nEstimated tokens: {format_token_count(token_count)}"

# Final cleanup
if hasattr(node, "clear_content_cache_recursive"):
node.clear_content_cache_recursive()

# Clear the tiktoken encoding cache to free memory
clear_encoding_cache()
force_garbage_collection()
log_memory_stats("after final cache and encoding cleanup")

return summary, tree, content

@@ -122,8 +136,46 @@ def _gather_file_contents(node: FileSystemNode) -> str:
if node.type != FileSystemNodeType.DIRECTORY:
return node.content_string

# Recursively gather contents of all files under the current directory
return "\n".join(_gather_file_contents(child) for child in node.children)
# Use StringIO for memory-efficient string concatenation
content_buffer = StringIO()
try:
_gather_file_contents_recursive(node, content_buffer)
return content_buffer.getvalue()
finally:
content_buffer.close()


def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:
"""Recursively gather file contents with memory optimization.

This version includes memory optimizations:
- Progressive content cache clearing
- Periodic garbage collection
- Memory-aware processing

Parameters
----------
node : FileSystemNode
The current directory or file node being processed.
buffer : StringIO
Buffer to write content to.

"""
if node.type != FileSystemNodeType.DIRECTORY:
# Write content and immediately clear cache to free memory
buffer.write(node.content_string)
node.clear_content_cache()
return

for files_processed, child in enumerate(node.children, 1):
_gather_file_contents_recursive(child, buffer)

# Progressive cleanup every 10 files to prevent memory accumulation
if files_processed % 10 == 0:
force_garbage_collection()

# Clear content cache for this directory after processing all children
node.clear_content_cache()


def _create_tree_structure(
@@ -176,35 +228,3 @@ def _create_tree_structure(
for i, child in enumerate(node.children):
tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1)
return tree_str


def _format_token_count(text: str) -> str | None:
"""Return a human-readable token-count string (e.g. 1.2k, 1.2 M).

Parameters
----------
text : str
The text string for which the token count is to be estimated.

Returns
-------
str | None
The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs.

"""
try:
encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini
total_tokens = len(encoding.encode(text, disallowed_special=()))
except (ValueError, UnicodeEncodeError) as exc:
logger.warning("Failed to estimate token size", extra={"error": str(exc)})
return None
except (requests.exceptions.RequestException, ssl.SSLError) as exc:
# If network errors, skip token count estimation instead of erroring out
logger.warning("Failed to download tiktoken model", extra={"error": str(exc)})
return None

for threshold, suffix in _TOKEN_THRESHOLDS:
if total_tokens >= threshold:
return f"{total_tokens / threshold:.1f}{suffix}"

return str(total_tokens)
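
count_tokens_optimized, format_token_count, and clear_encoding_cache replace the removed _format_token_count above, but the new token_utils module is not part of this diff. A sketch that keeps the o200k_base encoding and thresholds from the removed code; caching the encoding behind lru_cache is an assumption:

# Sketch of src/gitingest/utils/token_utils.py (not part of this diff; caching approach assumed)
from __future__ import annotations

from functools import lru_cache

import tiktoken

_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
    (1_000_000, "M"),
    (1_000, "k"),
]


@lru_cache(maxsize=1)
def _get_encoding() -> tiktoken.Encoding:
    """Load the o200k_base encoding once; clear_encoding_cache() drops it again."""
    return tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini


def count_tokens_optimized(text: str) -> int:
    """Count tokens, returning 0 instead of raising when the text cannot be encoded."""
    try:
        return len(_get_encoding().encode(text, disallowed_special=()))
    except (ValueError, UnicodeEncodeError):
        return 0


def format_token_count(total_tokens: int) -> str:
    """Format a token count as a human-readable string (e.g. 1.2k, 1.2M)."""
    for threshold, suffix in _TOKEN_THRESHOLDS:
        if total_tokens >= threshold:
            return f"{total_tokens / threshold:.1f}{suffix}"
    return str(total_tokens)


def clear_encoding_cache() -> None:
    """Release the cached tiktoken encoding to free its memory."""
    _get_encoding.cache_clear()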
86 changes: 80 additions & 6 deletions src/gitingest/schemas/filesystem.py
@@ -5,6 +5,7 @@
import os
from dataclasses import dataclass, field
from enum import Enum, auto
from io import StringIO
from typing import TYPE_CHECKING

from gitingest.utils.compat_func import readlink
@@ -49,6 +50,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
dir_count: int = 0
depth: int = 0
children: list[FileSystemNode] = field(default_factory=list)
_content_cache: str | None = field(default=None, init=False)

def sort_children(self) -> None:
"""Sort the children nodes of a directory according to a specific order.
@@ -105,10 +107,9 @@ def content_string(self) -> str:

@property
def content(self) -> str: # pylint: disable=too-many-return-statements
"""Return file content (if text / notebook) or an explanatory placeholder.
"""Return file content with caching for memory optimization.

Heuristically decides whether the file is text or binary by decoding a small chunk of the file
with multiple encodings and checking for common binary markers.
Uses lazy loading and caching to reduce memory usage for large repositories.

Returns
-------
@@ -128,14 +129,50 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
if self.type == FileSystemNodeType.SYMLINK:
return "" # TODO: are we including the empty content of symlinks?

if self.path.suffix == ".ipynb": # Notebook
# Return cached content if available
if self._content_cache is not None:
return self._content_cache

# Load and cache content
self._content_cache = self._load_content()
return self._content_cache

def _load_content(self) -> str:
"""Load file content from disk.

Returns
-------
str
The file content

"""
# Handle notebooks separately
if self.path.suffix == ".ipynb":
try:
return process_notebook(self.path)
except Exception as exc:
return f"Error processing notebook: {exc}"

# Read file chunk for analysis
chunk = _read_chunk(self.path)

# Determine the appropriate content based on chunk analysis
return self._analyze_chunk_and_read(chunk)

def _analyze_chunk_and_read(self, chunk: bytes | None) -> str:
"""Analyze file chunk and return appropriate content.

Parameters
----------
chunk : bytes | None
The file chunk to analyze

Returns
-------
str
The file content or error message

"""
if chunk is None:
return "Error reading file"

@@ -155,7 +192,44 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
return "Error: Unable to decode file with available encodings"

try:
with self.path.open(encoding=good_enc) as fp:
return fp.read()
return self._read_file_content_streaming(good_enc)
except (OSError, UnicodeDecodeError) as exc:
return f"Error reading file with {good_enc!r}: {exc}"

def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) -> str:
"""Read file content using streaming to reduce memory usage.

Parameters
----------
encoding : str
The encoding to use for reading the file.
chunk_size : int
Size of chunks to read at a time (default: 8192 bytes).

Returns
-------
str
The file content.

"""
content_buffer = StringIO()
try:
with self.path.open(encoding=encoding) as fp:
while True:
chunk = fp.read(chunk_size)
if not chunk:
break
content_buffer.write(chunk)
return content_buffer.getvalue()
finally:
content_buffer.close()

def clear_content_cache(self) -> None:
"""Clear cached content to free memory."""
self._content_cache = None

def clear_content_cache_recursive(self) -> None:
"""Recursively clear content cache for this node and all children."""
self.clear_content_cache()
for child in self.children:
child.clear_content_cache_recursive()
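
_read_chunk, called by _load_content above, is pre-existing code that this PR does not touch, so it does not appear in the diff. A minimal sketch of what it presumably does; the chunk size and error handling are assumptions:

# Sketch of the existing _read_chunk helper (not part of this diff; details assumed)
from __future__ import annotations

from pathlib import Path


def _read_chunk(path: Path, size: int = 1024) -> bytes | None:
    """Read the leading bytes of a file for the text-vs-binary heuristic."""
    try:
        with path.open("rb") as fp:
            return fp.read(size)
    except OSError:
        return None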