From d17d3329f49dd39488aa08bacc5408d10168d026 Mon Sep 17 00:00:00 2001 From: mickael Date: Tue, 12 Aug 2025 18:41:36 +0200 Subject: [PATCH 1/4] feat: add memory optimizations to prevent OOM issues --- .pre-commit-config.yaml | 2 + pyproject.toml | 1 + requirements.txt | 1 + src/gitingest/config.py | 4 ++ src/gitingest/ingestion.py | 18 ++++++- src/gitingest/output_formatter.py | 29 +++++++++- src/gitingest/schemas/filesystem.py | 32 ++++++++++- src/gitingest/utils/memory_utils.py | 83 +++++++++++++++++++++++++++++ src/server/query_processor.py | 18 +++++-- 9 files changed, 180 insertions(+), 8 deletions(-) create mode 100644 src/gitingest/utils/memory_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fcfb61b..a7f749dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -122,6 +122,7 @@ repos: loguru>=0.7.0, pathspec>=0.12.1, prometheus-client, + psutil>=5.9.0, pydantic, pytest-asyncio, pytest-mock, @@ -150,6 +151,7 @@ repos: loguru>=0.7.0, pathspec>=0.12.1, prometheus-client, + psutil>=5.9.0, pydantic, pytest-asyncio, pytest-mock, diff --git a/pyproject.toml b/pyproject.toml index 36219fe6..abd6c511 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ server = [ "boto3>=1.28.0", # AWS SDK for S3 support "fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38) "prometheus-client", + "psutil>=5.9.0", # Memory monitoring for optimization "sentry-sdk[fastapi]", "slowapi", "uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150) diff --git a/requirements.txt b/requirements.txt index b803cf7b..2d28af30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ httpx loguru>=0.7.0 pathspec>=0.12.1 prometheus-client +psutil>=5.9.0 # Memory monitoring for optimization pydantic python-dotenv sentry-sdk[fastapi] diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 3d154684..eabd60e0 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -9,6 +9,10 @@ MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB) DEFAULT_TIMEOUT = 60 # seconds +# Memory optimization settings +BATCH_SIZE = 100 # Process files in batches to reduce memory usage +MEMORY_CHECK_INTERVAL = 50 # Check memory usage every N files + OUTPUT_FILE_NAME = "digest.txt" TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 01a2c8f3..be3c5e67 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,11 +5,12 @@ from pathlib import Path from typing import TYPE_CHECKING -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MEMORY_CHECK_INTERVAL from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger +from gitingest.utils.memory_utils import check_memory_pressure, force_garbage_collection, log_memory_stats if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -51,6 +52,9 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: }, ) + # Log initial memory usage + log_memory_stats("at ingestion start") + subpath = Path(query.subpath.strip("/")).as_posix() path = query.local_path / subpath @@ -117,6 +121,9 @@ def 
ingest_query(query: IngestionQuery) -> tuple[str, str, str]: }, ) + # Log final memory usage + log_memory_stats("at ingestion completion") + return format_node(root_node, query=query) @@ -258,6 +265,15 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat stats.total_files += 1 stats.total_size += file_size + # Check memory usage periodically and force GC if needed + if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure(): + logger.warning( + "Memory pressure detected, forcing garbage collection", + extra={"files_processed": stats.total_files}, + ) + force_garbage_collection() + log_memory_stats(f"after processing {stats.total_files} files") + child = FileSystemNode( name=path.name, type=FileSystemNodeType.FILE, diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 5c2b59ae..64b8449c 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -3,6 +3,7 @@ from __future__ import annotations import ssl +from io import StringIO from typing import TYPE_CHECKING import requests.exceptions @@ -122,8 +123,32 @@ def _gather_file_contents(node: FileSystemNode) -> str: if node.type != FileSystemNodeType.DIRECTORY: return node.content_string - # Recursively gather contents of all files under the current directory - return "\n".join(_gather_file_contents(child) for child in node.children) + # Use StringIO for memory-efficient string concatenation + content_buffer = StringIO() + try: + _gather_file_contents_recursive(node, content_buffer) + return content_buffer.getvalue() + finally: + content_buffer.close() + + +def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None: + """Recursively gather file contents into a StringIO buffer to reduce memory usage. + + Parameters + ---------- + node : FileSystemNode + The current directory or file node being processed. + buffer : StringIO + Buffer to write content to. + + """ + if node.type != FileSystemNodeType.DIRECTORY: + buffer.write(node.content_string) + return + + for child in node.children: + _gather_file_contents_recursive(child, buffer) def _create_tree_structure( diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index cc66e7b1..4df8212a 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -5,6 +5,7 @@ import os from dataclasses import dataclass, field from enum import Enum, auto +from io import StringIO from typing import TYPE_CHECKING from gitingest.utils.compat_func import readlink @@ -155,7 +156,34 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return "Error: Unable to decode file with available encodings" try: - with self.path.open(encoding=good_enc) as fp: - return fp.read() + return self._read_file_content_streaming(good_enc) except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" + + def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) -> str: + """Read file content using streaming to reduce memory usage. + + Parameters + ---------- + encoding : str + The encoding to use for reading the file. + chunk_size : int + Size of chunks to read at a time (default: 8192 bytes). + + Returns + ------- + str + The file content. 
+ + """ + content_buffer = StringIO() + try: + with self.path.open(encoding=encoding) as fp: + while True: + chunk = fp.read(chunk_size) + if not chunk: + break + content_buffer.write(chunk) + return content_buffer.getvalue() + finally: + content_buffer.close() diff --git a/src/gitingest/utils/memory_utils.py b/src/gitingest/utils/memory_utils.py new file mode 100644 index 00000000..d32cb61a --- /dev/null +++ b/src/gitingest/utils/memory_utils.py @@ -0,0 +1,83 @@ +"""Memory utility functions for monitoring and optimization.""" + +from __future__ import annotations + +import gc +from typing import Any + +import psutil + +from gitingest.utils.logging_config import get_logger + +logger = get_logger(__name__) + + +def get_memory_usage() -> dict[str, Any]: + """Get current memory usage statistics. + + Returns + ------- + dict[str, Any] + Dictionary containing memory usage statistics in MB. + + """ + try: + process = psutil.Process() + memory_info = process.memory_info() + + return { + "rss_mb": memory_info.rss / (1024 * 1024), # Resident Set Size + "vms_mb": memory_info.vms / (1024 * 1024), # Virtual Memory Size + "percent": process.memory_percent(), + } + except Exception as exc: + logger.warning("Failed to get memory usage", extra={"error": str(exc)}) + return {"rss_mb": 0, "vms_mb": 0, "percent": 0} + + +def force_garbage_collection() -> None: + """Force garbage collection to free up memory.""" + try: + collected = gc.collect() + logger.debug("Forced garbage collection", extra={"objects_collected": collected}) + except Exception as exc: + logger.warning("Failed to force garbage collection", extra={"error": str(exc)}) + + +def check_memory_pressure(threshold_mb: float = 3000) -> bool: + """Check if memory usage is above threshold. + + Parameters + ---------- + threshold_mb : float + Memory threshold in MB (default: 3000 MB = 3 GB). + + Returns + ------- + bool + True if memory usage is above threshold. + + """ + memory_stats = get_memory_usage() + return memory_stats["rss_mb"] > threshold_mb + + +def log_memory_stats(context: str = "") -> None: + """Log current memory statistics. + + Parameters + ---------- + context : str + Context information for the log message. 
+ + """ + memory_stats = get_memory_usage() + logger.info( + "Memory usage %s", + context, + extra={ + "memory_rss_mb": round(memory_stats["rss_mb"], 2), + "memory_vms_mb": round(memory_stats["vms_mb"], 2), + "memory_percent": round(memory_stats["percent"], 2), + }, + ) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index f2f2ae90..ad68b519 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -3,6 +3,7 @@ from __future__ import annotations import shutil +from io import StringIO from pathlib import Path from typing import TYPE_CHECKING, cast @@ -302,7 +303,19 @@ async def process_query( try: summary, tree, content = ingest_query(query) - digest_content = tree + "\n" + content + + # Clean up repository immediately after ingestion to free memory + _cleanup_repository(clone_config) + + # Use StringIO for memory-efficient string concatenation + digest_buffer = StringIO() + try: + digest_buffer.write(tree) + digest_buffer.write("\n") + digest_buffer.write(content) + digest_content = digest_buffer.getvalue() + finally: + digest_buffer.close() _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) @@ -326,8 +339,7 @@ async def process_query( digest_url = _generate_digest_url(query) - # Clean up the repository after successful processing - _cleanup_repository(clone_config) + # Repository was already cleaned up after ingestion to free memory earlier return IngestSuccessResponse( repo_url=input_text, From 8e4e37c311482665905deb3d5665adaf54883593 Mon Sep 17 00:00:00 2001 From: mickael Date: Tue, 12 Aug 2025 19:01:12 +0200 Subject: [PATCH 2/4] feat: add prometheus memory metrics for ingestion monitoring --- src/gitingest/utils/memory_utils.py | 6 +- src/server/memory_metrics.py | 144 ++++++++++++++++++++++++++++ src/server/metrics_server.py | 3 + src/server/query_processor.py | 57 ++++++----- 4 files changed, 183 insertions(+), 27 deletions(-) create mode 100644 src/server/memory_metrics.py diff --git a/src/gitingest/utils/memory_utils.py b/src/gitingest/utils/memory_utils.py index d32cb61a..c52c990b 100644 --- a/src/gitingest/utils/memory_utils.py +++ b/src/gitingest/utils/memory_utils.py @@ -72,10 +72,10 @@ def log_memory_stats(context: str = "") -> None: """ memory_stats = get_memory_usage() - logger.info( - "Memory usage %s", - context, + logger.debug( + "Memory usage statistics", extra={ + "context": context, "memory_rss_mb": round(memory_stats["rss_mb"], 2), "memory_vms_mb": round(memory_stats["vms_mb"], 2), "memory_percent": round(memory_stats["percent"], 2), diff --git a/src/server/memory_metrics.py b/src/server/memory_metrics.py new file mode 100644 index 00000000..746f3ac7 --- /dev/null +++ b/src/server/memory_metrics.py @@ -0,0 +1,144 @@ +"""Memory usage metrics for Prometheus monitoring.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from prometheus_client import Gauge, Histogram + +from gitingest.utils.memory_utils import get_memory_usage + +if TYPE_CHECKING: + import types + from typing import Self + +# Memory usage gauges +memory_usage_rss_mb = Gauge( + "gitingest_memory_usage_rss_mb", + "Resident Set Size memory usage in MB", + ["repo_url"], +) + +memory_usage_vms_mb = Gauge( + "gitingest_memory_usage_vms_mb", + "Virtual Memory Size usage in MB", + ["repo_url"], +) + +memory_usage_percent = Gauge( + "gitingest_memory_usage_percent", + "Memory usage percentage", + ["repo_url"], +) 
+ +# Memory usage histogram to track distribution of memory consumption per repository +memory_consumption_histogram = Histogram( + "gitingest_memory_consumption_mb", + "Memory consumption distribution per repository in MB", + ["repo_url"], + buckets=(50, 100, 250, 500, 1000, 2000, 3000, 5000, 10000, float("inf")), +) + +# Peak memory usage gauge +peak_memory_usage_mb = Gauge( + "gitingest_peak_memory_usage_mb", + "Peak memory usage during ingestion in MB", + ["repo_url"], +) + + +def record_memory_usage(repo_url: str) -> dict[str, float]: + """Record current memory usage metrics for a repository. + + Parameters + ---------- + repo_url : str + The repository URL to label the metrics with + + Returns + ------- + dict[str, float] + Current memory usage statistics + + """ + # Truncate URL for label to avoid excessive cardinality + repo_label = repo_url[:255] + + # Get current memory stats + memory_stats = get_memory_usage() + + # Record current memory usage + memory_usage_rss_mb.labels(repo_url=repo_label).set(memory_stats["rss_mb"]) + memory_usage_vms_mb.labels(repo_url=repo_label).set(memory_stats["vms_mb"]) + memory_usage_percent.labels(repo_url=repo_label).set(memory_stats["percent"]) + + # Record in histogram for distribution analysis + memory_consumption_histogram.labels(repo_url=repo_label).observe(memory_stats["rss_mb"]) + + return memory_stats + + +def record_peak_memory_usage(repo_url: str, peak_mb: float) -> None: + """Record peak memory usage for a repository ingestion. + + Parameters + ---------- + repo_url : str + The repository URL to label the metrics with + peak_mb : float + Peak memory usage in MB + + """ + repo_label = repo_url[:255] + peak_memory_usage_mb.labels(repo_url=repo_label).set(peak_mb) + + +class MemoryTracker: + """Context manager to track memory usage during repository ingestion. 
+ + Parameters + ---------- + repo_url : str + Repository URL for labeling metrics + + """ + + def __init__(self, repo_url: str) -> None: + self.repo_url = repo_url + self.initial_memory = 0.0 + self.peak_memory = 0.0 + + def __enter__(self) -> Self: + """Start memory tracking.""" + initial_stats = get_memory_usage() + self.initial_memory = initial_stats["rss_mb"] + self.peak_memory = self.initial_memory + + # Record initial memory usage + record_memory_usage(self.repo_url) + + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_val: BaseException | None, + exc_tb: types.TracebackType | None, + ) -> None: + """End memory tracking and record peak usage.""" + # Record final memory usage + final_stats = record_memory_usage(self.repo_url) + + # Update peak if current is higher + self.peak_memory = max(self.peak_memory, final_stats["rss_mb"]) + + # Record peak memory usage + record_peak_memory_usage(self.repo_url, self.peak_memory) + + def update_peak(self) -> None: + """Update peak memory if current usage is higher.""" + current_stats = get_memory_usage() + self.peak_memory = max(self.peak_memory, current_stats["rss_mb"]) + + # Also record current usage + record_memory_usage(self.repo_url) diff --git a/src/server/metrics_server.py b/src/server/metrics_server.py index b24424c6..ec51b7c8 100644 --- a/src/server/metrics_server.py +++ b/src/server/metrics_server.py @@ -7,6 +7,9 @@ from gitingest.utils.logging_config import get_logger +# Import to ensure memory metrics are registered +from server import memory_metrics # noqa: F401 # pylint: disable=unused-import + # Create a logger for this module logger = get_logger(__name__) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index ad68b519..c56ac107 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -13,6 +13,7 @@ from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger from gitingest.utils.pattern_utils import process_patterns +from server.memory_metrics import MemoryTracker from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata from server.s3_utils import ( _build_s3_url, @@ -292,36 +293,44 @@ async def process_query( return s3_response clone_config = query.extract_clone_config() - await clone_repo(clone_config, token=token) - short_repo_url = f"{query.user_name}/{query.repo_name}" - # The commit hash should always be available at this point - if not query.commit: - msg = "Unexpected error: no commit hash found" - raise RuntimeError(msg) + # Track memory usage during the entire ingestion process + with MemoryTracker(input_text) as memory_tracker: + await clone_repo(clone_config, token=token) - try: - summary, tree, content = ingest_query(query) + # Update peak memory after cloning + memory_tracker.update_peak() - # Clean up repository immediately after ingestion to free memory - _cleanup_repository(clone_config) + # The commit hash should always be available at this point + if not query.commit: + msg = "Unexpected error: no commit hash found" + raise RuntimeError(msg) - # Use StringIO for memory-efficient string concatenation - digest_buffer = StringIO() try: - digest_buffer.write(tree) - digest_buffer.write("\n") - digest_buffer.write(content) - digest_content = digest_buffer.getvalue() - finally: - digest_buffer.close() - _store_digest_content(query, clone_config, digest_content, summary, tree, content) - except Exception as exc: - 
_print_error(query.url, exc, max_file_size, pattern_type, pattern) - # Clean up repository even if processing failed - _cleanup_repository(clone_config) - return IngestErrorResponse(error=f"{exc!s}") + summary, tree, content = ingest_query(query) + + # Update peak memory after ingestion (this is likely the highest usage) + memory_tracker.update_peak() + + # Clean up repository immediately after ingestion to free memory + _cleanup_repository(clone_config) + + # Use StringIO for memory-efficient string concatenation + digest_buffer = StringIO() + try: + digest_buffer.write(tree) + digest_buffer.write("\n") + digest_buffer.write(content) + digest_content = digest_buffer.getvalue() + finally: + digest_buffer.close() + _store_digest_content(query, clone_config, digest_content, summary, tree, content) + except Exception as exc: + _print_error(query.url, exc, max_file_size, pattern_type, pattern) + # Clean up repository even if processing failed + _cleanup_repository(clone_config) + return IngestErrorResponse(error=f"{exc!s}") if len(content) > MAX_DISPLAY_SIZE: content = ( From d1d7abb3fc3c1b23d255ab38d6adf6077df29520 Mon Sep 17 00:00:00 2001 From: mickael Date: Tue, 12 Aug 2025 19:13:28 +0200 Subject: [PATCH 3/4] fix: ensure memory metrics are recorded for all ingestion paths --- src/server/query_processor.py | 83 ++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index c56ac107..5b0d0a95 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -280,23 +280,24 @@ async def process_query( include_patterns=pattern if pattern_type == PatternType.INCLUDE else None, ) - # Check if digest already exists on S3 before cloning - s3_response = await _check_s3_cache( - query=query, - input_text=input_text, - max_file_size=max_file_size, - pattern_type=pattern_type.value, - pattern=pattern, - token=token, - ) - if s3_response: - return s3_response - - clone_config = query.extract_clone_config() - short_repo_url = f"{query.user_name}/{query.repo_name}" - - # Track memory usage during the entire ingestion process + # Track memory usage for the entire request (including S3 cache checks) with MemoryTracker(input_text) as memory_tracker: + # Check if digest already exists on S3 before cloning + s3_response = await _check_s3_cache( + query=query, + input_text=input_text, + max_file_size=max_file_size, + pattern_type=pattern_type.value, + pattern=pattern, + token=token, + ) + if s3_response: + # Even for S3 cache hits, record the memory usage + memory_tracker.update_peak() + return s3_response + + clone_config = query.extract_clone_config() + short_repo_url = f"{query.user_name}/{query.repo_name}" await clone_repo(clone_config, token=token) # Update peak memory after cloning @@ -332,35 +333,35 @@ async def process_query( _cleanup_repository(clone_config) return IngestErrorResponse(error=f"{exc!s}") - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - ) + if len(content) > MAX_DISPLAY_SIZE: + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) - _print_success( - url=query.url, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=summary, - ) + _print_success( + url=query.url, + 
max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + summary=summary, + ) - digest_url = _generate_digest_url(query) + digest_url = _generate_digest_url(query) - # Repository was already cleaned up after ingestion to free memory earlier + # Repository was already cleaned up after ingestion to free memory earlier - return IngestSuccessResponse( - repo_url=input_text, - short_repo_url=short_repo_url, - summary=summary, - digest_url=digest_url, - tree=tree, - content=content, - default_max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - ) + return IngestSuccessResponse( + repo_url=input_text, + short_repo_url=short_repo_url, + summary=summary, + digest_url=digest_url, + tree=tree, + content=content, + default_max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + ) def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: From 6cecec0fa75c5a2993115b3988a7ac3fe4ea6a19 Mon Sep 17 00:00:00 2001 From: mickael Date: Thu, 14 Aug 2025 10:40:05 +0200 Subject: [PATCH 4/4] perf: implement enhanced memory management utilities - Added aggressive garbage collection and memory checks. - Introduced token counting optimizations with caching and chunking. - Integrated content caching and progressive cleanup for large repos. - Adjusted memory thresholds for better handling of OOM risks. --- src/gitingest/config.py | 4 +- src/gitingest/ingestion.py | 20 ++- src/gitingest/output_formatter.py | 87 +++++++------ src/gitingest/schemas/filesystem.py | 54 +++++++- src/gitingest/utils/memory_utils.py | 2 +- src/gitingest/utils/token_utils.py | 183 ++++++++++++++++++++++++++++ src/server/query_processor.py | 12 +- 7 files changed, 305 insertions(+), 57 deletions(-) create mode 100644 src/gitingest/utils/token_utils.py diff --git a/src/gitingest/config.py b/src/gitingest/config.py index eabd60e0..2049c939 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -11,7 +11,9 @@ # Memory optimization settings BATCH_SIZE = 100 # Process files in batches to reduce memory usage -MEMORY_CHECK_INTERVAL = 50 # Check memory usage every N files +MEMORY_CHECK_INTERVAL = 25 # Check memory usage every N files (more frequent) +AGGRESSIVE_GC_INTERVAL = 10 # Force garbage collection every N files for large repos +MEMORY_PRESSURE_THRESHOLD_MB = 2000 # Trigger aggressive cleanup at 2GB usage OUTPUT_FILE_NAME = "digest.txt" diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index be3c5e67..32c98be5 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -5,7 +5,13 @@ from pathlib import Path from typing import TYPE_CHECKING -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MEMORY_CHECK_INTERVAL +from gitingest.config import ( + AGGRESSIVE_GC_INTERVAL, + MAX_DIRECTORY_DEPTH, + MAX_FILES, + MAX_TOTAL_SIZE_BYTES, + MEMORY_CHECK_INTERVAL, +) from gitingest.output_formatter import format_node from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include @@ -265,14 +271,20 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat stats.total_files += 1 stats.total_size += file_size - # Check memory usage periodically and force GC if needed + # More aggressive memory management for large repositories + if stats.total_files % AGGRESSIVE_GC_INTERVAL == 0: + force_garbage_collection() + + # Check memory usage periodically and force more 
aggressive GC if needed if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure(): logger.warning( - "Memory pressure detected, forcing garbage collection", + "Memory pressure detected, forcing aggressive garbage collection", extra={"files_processed": stats.total_files}, ) + # Multiple GC cycles for better cleanup + force_garbage_collection() force_garbage_collection() - log_memory_stats(f"after processing {stats.total_files} files") + log_memory_stats(f"after aggressive cleanup at {stats.total_files} files") child = FileSystemNode( name=path.name, diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 64b8449c..ea956659 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -2,16 +2,14 @@ from __future__ import annotations -import ssl from io import StringIO from typing import TYPE_CHECKING -import requests.exceptions -import tiktoken - from gitingest.schemas import FileSystemNode, FileSystemNodeType from gitingest.utils.compat_func import readlink from gitingest.utils.logging_config import get_logger +from gitingest.utils.memory_utils import force_garbage_collection, log_memory_stats +from gitingest.utils.token_utils import clear_encoding_cache, count_tokens_optimized, format_token_count if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -19,11 +17,6 @@ # Initialize logger for this module logger = get_logger(__name__) -_TOKEN_THRESHOLDS: list[tuple[int, str]] = [ - (1_000_000, "M"), - (1_000, "k"), -] - def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]: """Generate a summary, directory structure, and file contents for a given file system node. @@ -52,13 +45,33 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" + # Log memory before tree generation + log_memory_stats("before tree structure generation") + tree = "Directory structure:\n" + _create_tree_structure(query, node=node) + # Log memory before content gathering (this is the memory-intensive part) + log_memory_stats("before content gathering") + content = _gather_file_contents(node) - token_estimate = _format_token_count(tree + content) - if token_estimate: - summary += f"\nEstimated tokens: {token_estimate}" + # Force garbage collection after content gathering + force_garbage_collection() + log_memory_stats("after content gathering and cleanup") + + # Count tokens with optimization + token_count = count_tokens_optimized(tree + content) + if token_count > 0: + summary += f"\nEstimated tokens: {format_token_count(token_count)}" + + # Final cleanup + if hasattr(node, "clear_content_cache_recursive"): + node.clear_content_cache_recursive() + + # Clear the tiktoken encoding cache to free memory + clear_encoding_cache() + force_garbage_collection() + log_memory_stats("after final cache and encoding cleanup") return summary, tree, content @@ -133,7 +146,12 @@ def _gather_file_contents(node: FileSystemNode) -> str: def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None: - """Recursively gather file contents into a StringIO buffer to reduce memory usage. + """Recursively gather file contents with memory optimization. 
+ + This version includes memory optimizations: + - Progressive content cache clearing + - Periodic garbage collection + - Memory-aware processing Parameters ---------- @@ -144,12 +162,21 @@ def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> N """ if node.type != FileSystemNodeType.DIRECTORY: + # Write content and immediately clear cache to free memory buffer.write(node.content_string) + node.clear_content_cache() return - for child in node.children: + for files_processed, child in enumerate(node.children, 1): _gather_file_contents_recursive(child, buffer) + # Progressive cleanup every 10 files to prevent memory accumulation + if files_processed % 10 == 0: + force_garbage_collection() + + # Clear content cache for this directory after processing all children + node.clear_content_cache() + def _create_tree_structure( query: IngestionQuery, @@ -201,35 +228,3 @@ def _create_tree_structure( for i, child in enumerate(node.children): tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) return tree_str - - -def _format_token_count(text: str) -> str | None: - """Return a human-readable token-count string (e.g. 1.2k, 1.2 M). - - Parameters - ---------- - text : str - The text string for which the token count is to be estimated. - - Returns - ------- - str | None - The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs. - - """ - try: - encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini - total_tokens = len(encoding.encode(text, disallowed_special=())) - except (ValueError, UnicodeEncodeError) as exc: - logger.warning("Failed to estimate token size", extra={"error": str(exc)}) - return None - except (requests.exceptions.RequestException, ssl.SSLError) as exc: - # If network errors, skip token count estimation instead of erroring out - logger.warning("Failed to download tiktoken model", extra={"error": str(exc)}) - return None - - for threshold, suffix in _TOKEN_THRESHOLDS: - if total_tokens >= threshold: - return f"{total_tokens / threshold:.1f}{suffix}" - - return str(total_tokens) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 4df8212a..944e374f 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -50,6 +50,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes dir_count: int = 0 depth: int = 0 children: list[FileSystemNode] = field(default_factory=list) + _content_cache: str | None = field(default=None, init=False) def sort_children(self) -> None: """Sort the children nodes of a directory according to a specific order. @@ -106,10 +107,9 @@ def content_string(self) -> str: @property def content(self) -> str: # pylint: disable=too-many-return-statements - """Return file content (if text / notebook) or an explanatory placeholder. + """Return file content with caching for memory optimization. - Heuristically decides whether the file is text or binary by decoding a small chunk of the file - with multiple encodings and checking for common binary markers. + Uses lazy loading and caching to reduce memory usage for large repositories. Returns ------- @@ -129,14 +129,50 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if self.type == FileSystemNodeType.SYMLINK: return "" # TODO: are we including the empty content of symlinks? 
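The _content_cache field introduced above, filled lazily by the content property, pairs with the progressive cleanup in _gather_file_contents_recursive: a file's text is loaded on first access, written to the shared buffer once, and its cache is dropped immediately, so at most one file's content is held outside the output buffer at a time. A simplified, self-contained sketch of that load/write/release cycle, assuming a toy Node class rather than the real FileSystemNode:

from __future__ import annotations

import gc
from dataclasses import dataclass, field
from io import StringIO
from pathlib import Path


@dataclass
class Node:
    """Toy stand-in for FileSystemNode; only the caching behaviour is mirrored."""

    path: Path
    children: list[Node] = field(default_factory=list)
    _cache: str | None = field(default=None, init=False)

    @property
    def text(self) -> str:
        # Lazy load on first access, then serve from the cache.
        if self._cache is None:
            self._cache = self.path.read_text(errors="replace")
        return self._cache

    def clear(self) -> None:
        # Release the per-node copy once it has been written out.
        self._cache = None


def gather(node: Node, buffer: StringIO) -> None:
    if not node.children:
        buffer.write(node.text)  # leaf: write once ...
        node.clear()             # ... and drop the cached text right away
        return
    for i, child in enumerate(node.children, 1):
        gather(child, buffer)
        if i % 10 == 0:          # periodic collection, mirroring the patch's interval
            gc.collect()
    node.clear()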
- if self.path.suffix == ".ipynb": # Notebook + # Return cached content if available + if self._content_cache is not None: + return self._content_cache + + # Load and cache content + self._content_cache = self._load_content() + return self._content_cache + + def _load_content(self) -> str: + """Load file content from disk. + + Returns + ------- + str + The file content + + """ + # Handle notebooks separately + if self.path.suffix == ".ipynb": try: return process_notebook(self.path) except Exception as exc: return f"Error processing notebook: {exc}" + # Read file chunk for analysis chunk = _read_chunk(self.path) + # Determine the appropriate content based on chunk analysis + return self._analyze_chunk_and_read(chunk) + + def _analyze_chunk_and_read(self, chunk: bytes | None) -> str: + """Analyze file chunk and return appropriate content. + + Parameters + ---------- + chunk : bytes | None + The file chunk to analyze + + Returns + ------- + str + The file content or error message + + """ if chunk is None: return "Error reading file" @@ -187,3 +223,13 @@ def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) -> return content_buffer.getvalue() finally: content_buffer.close() + + def clear_content_cache(self) -> None: + """Clear cached content to free memory.""" + self._content_cache = None + + def clear_content_cache_recursive(self) -> None: + """Recursively clear content cache for this node and all children.""" + self.clear_content_cache() + for child in self.children: + child.clear_content_cache_recursive() diff --git a/src/gitingest/utils/memory_utils.py b/src/gitingest/utils/memory_utils.py index c52c990b..45f768bf 100644 --- a/src/gitingest/utils/memory_utils.py +++ b/src/gitingest/utils/memory_utils.py @@ -44,7 +44,7 @@ def force_garbage_collection() -> None: logger.warning("Failed to force garbage collection", extra={"error": str(exc)}) -def check_memory_pressure(threshold_mb: float = 3000) -> bool: +def check_memory_pressure(threshold_mb: float = 2000) -> bool: """Check if memory usage is above threshold. Parameters diff --git a/src/gitingest/utils/token_utils.py b/src/gitingest/utils/token_utils.py new file mode 100644 index 00000000..b3307010 --- /dev/null +++ b/src/gitingest/utils/token_utils.py @@ -0,0 +1,183 @@ +"""Optimized token counting utilities with memory management.""" + +from __future__ import annotations + +import gc +import os +from typing import Protocol, Self + +from gitingest.utils.logging_config import get_logger + +# Try to import tiktoken, but don't fail if it's not available +try: + import tiktoken + + TIKTOKEN_AVAILABLE = True +except ImportError: + tiktoken = None # type: ignore[assignment] + TIKTOKEN_AVAILABLE = False + +logger = get_logger(__name__) + +# Environment variable to disable token counting for memory-sensitive environments +DISABLE_TOKEN_COUNTING = os.getenv("GITINGEST_DISABLE_TOKEN_COUNTING", "false").lower() == "true" + +# Constants for token formatting +TOKEN_MILLION = 1_000_000 +TOKEN_THOUSAND = 1_000 + + +class Encoding(Protocol): + """Protocol for tiktoken encoding objects.""" + + def encode(self, text: str, *, disallowed_special: tuple[str, ...] 
= ()) -> list[int]: + """Encode text to tokens.""" + + +class TokenEncoder: + """Singleton class to manage token encoding with lazy loading.""" + + _instance: TokenEncoder | None = None + _encoding: Encoding | None = None + + def __new__(cls) -> Self: + """Create or return the singleton instance.""" + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def get_encoding(self) -> Encoding | None: + """Get or create a cached encoding instance. + + Returns + ------- + Encoding | None + The cached encoding instance, or None if token counting is disabled + + """ + if DISABLE_TOKEN_COUNTING: + return None + + if self._encoding is None: + self._encoding = self._load_encoding() + return self._encoding + + def _load_encoding(self) -> Encoding | None: + """Load the tiktoken encoding. + + Returns + ------- + Encoding | None + The encoding instance or None if tiktoken is not available + + """ + if not TIKTOKEN_AVAILABLE: + logger.warning("tiktoken not available, token counting disabled") + return None + + try: + return tiktoken.get_encoding("o200k_base") # type: ignore[union-attr] + except Exception as exc: + logger.warning("Failed to load tiktoken encoding", extra={"error": str(exc)}) + return None + + def clear_cache(self) -> None: + """Clear the encoding cache to free memory.""" + if self._encoding is not None: + self._encoding = None + gc.collect() + + +# Create the singleton instance +_token_encoder = TokenEncoder() + + +def get_cached_encoding() -> Encoding | None: + """Get or create a cached encoding instance. + + Returns + ------- + Encoding | None + The cached encoding instance, or None if token counting is disabled + + """ + return _token_encoder.get_encoding() + + +def clear_encoding_cache() -> None: + """Clear the global encoding cache to free memory.""" + _token_encoder.clear_cache() + + +def count_tokens_optimized(text: str, chunk_size: int = 100000) -> int: + """Count tokens with memory optimization. + + Parameters + ---------- + text : str + Text to count tokens for + chunk_size : int + Size of chunks to process at a time (default: 100k chars) + + Returns + ------- + int + Total token count, or estimated count if token counting is disabled + + """ + # If token counting is disabled, return a rough estimate + if DISABLE_TOKEN_COUNTING: + # Rough estimate: ~1.3 tokens per character for code + return int(len(text) * 1.3) + + try: + encoding = get_cached_encoding() + if encoding is None: + # Fallback to estimation + return int(len(text) * 1.3) + + # Process in chunks to avoid memory spike for large texts + total_tokens = 0 + + if len(text) > chunk_size: + # Process large texts in chunks + for i in range(0, len(text), chunk_size): + chunk = text[i : i + chunk_size] + tokens = encoding.encode(chunk, disallowed_special=()) + total_tokens += len(tokens) + # Clear the tokens list immediately to free memory + del tokens + else: + tokens = encoding.encode(text, disallowed_special=()) + total_tokens = len(tokens) + del tokens + + except (ValueError, UnicodeEncodeError) as exc: + logger.warning("Failed to count tokens", extra={"error": str(exc)}) + return int(len(text) * 1.3) # Fallback to estimation + except Exception as exc: + logger.warning("Unexpected error counting tokens", extra={"error": str(exc)}) + return int(len(text) * 1.3) # Fallback to estimation + + return total_tokens + + +def format_token_count(count: int) -> str: + """Format token count as human-readable string. 
+ + Parameters + ---------- + count : int + Token count + + Returns + ------- + str + Formatted string (e.g., "1.2k", "1.2M") + + """ + if count >= TOKEN_MILLION: + return f"{count / TOKEN_MILLION:.1f}M" + if count >= TOKEN_THOUSAND: + return f"{count / TOKEN_THOUSAND:.1f}k" + return str(count) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 5b0d0a95..76b42995 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -12,6 +12,7 @@ from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger +from gitingest.utils.memory_utils import force_garbage_collection from gitingest.utils.pattern_utils import process_patterns from server.memory_metrics import MemoryTracker from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata @@ -351,7 +352,8 @@ async def process_query( # Repository was already cleaned up after ingestion to free memory earlier - return IngestSuccessResponse( + # Create response + response = IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, summary=summary, @@ -363,6 +365,14 @@ async def process_query( pattern=pattern, ) + # Aggressive cleanup of large strings to free memory + del tree + del content + del summary + force_garbage_collection() + + return response + def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """Print a formatted summary of the query details for debugging.
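Taken together with the earlier patches, the request path now runs inside the tracker added in patch 2; a hypothetical, trimmed-down usage sketch of that wrapper (only the MemoryTracker API is from the patch, while run_with_tracking, do_clone, and do_ingest are stand-ins for the real clone and ingest calls):

from typing import Callable

from server.memory_metrics import MemoryTracker


def run_with_tracking(
    repo_url: str,
    do_clone: Callable[[], None],
    do_ingest: Callable[[], str],
) -> str:
    with MemoryTracker(repo_url) as tracker:  # __enter__ samples the starting RSS and sets the gauges
        do_clone()
        tracker.update_peak()                 # sample again after the clone
        digest = do_ingest()
        tracker.update_peak()                 # ingestion is usually the high-water mark
        return digest
    # __exit__ still runs on return: it records a final sample and publishes the peak gauge for repo_url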