Skip to content

Commit 9f97e0a

Browse files
committed
feat(import[truncation]): Warn when results are capped by --limit
why: Users saw "Found 100 repositories" with no indication that hundreds more were silently truncated, leading to incomplete imports.

what:
- GitLab: Extract x-total and x-next-page headers to detect truncation
- GitLab: Add _warn_truncation() method with two warning variants
- GitHub search: Use total_count from JSON body to detect truncation
- GitHub user/org: Use mid-page limit hit as "more available" signal
- Add parametrized truncation warning tests for both providers
1 parent 43b7ec8 commit 9f97e0a

File tree

4 files changed

+355
-7
lines changed

4 files changed

+355
-7
lines changed

src/vcspull/_internal/remotes/github.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
191191
endpoint = "/search/repositories"
192192
page = 1
193193
count = 0
194+
total_available: int | None = None
194195

195196
while count < options.limit:
196197
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -212,12 +213,14 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
212213
self._log_rate_limit(headers)
213214

214215
total_count = data.get("total_count", 0)
215-
if page == 1 and total_count > 1000:
216-
log.warning(
217-
"GitHub search returned %d total results but API limits "
218-
"to 1000; consider narrowing your query",
219-
total_count,
220-
)
216+
if page == 1:
217+
total_available = total_count
218+
if total_count > 1000:
219+
log.warning(
220+
"GitHub search returned %d total results but API limits "
221+
"to 1000; consider narrowing your query",
222+
total_count,
223+
)
221224

222225
items = data.get("items", [])
223226
if not items:
@@ -242,6 +245,18 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
242245

243246
page += 1
244247

248+
# Warn if results were truncated by --limit
249+
if (
250+
count >= options.limit
251+
and total_available is not None
252+
and total_available > count
253+
):
254+
log.warning(
255+
"Showing %d of %d repositories (use --limit 0 to fetch all)",
256+
count,
257+
total_available,
258+
)
259+
245260
def _paginate_repos(
246261
self,
247262
endpoint: str,
@@ -263,6 +278,7 @@ def _paginate_repos(
263278
"""
264279
page = 1
265280
count = 0
281+
more_available = False
266282

267283
while count < options.limit:
268284
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -285,8 +301,12 @@ def _paginate_repos(
285301
if not data:
286302
break
287303

288-
for item in data:
304+
for idx, item in enumerate(data):
289305
if count >= options.limit:
306+
# Remaining items on this page or a full page = more exist
307+
more_available = (
308+
idx < len(data) - 1 or len(data) == DEFAULT_PER_PAGE
309+
)
290310
break
291311

292312
repo = self._parse_repo(item)
@@ -300,6 +320,15 @@ def _paginate_repos(
300320

301321
page += 1
302322

323+
# Warn if results were truncated by --limit
324+
# GitHub user/org endpoints don't return total count
325+
if count >= options.limit and more_available:
326+
log.warning(
327+
"Showing %d repositories; more may be available "
328+
"(use --limit 0 to fetch all)",
329+
count,
330+
)
331+
303332
def _parse_repo(self, data: dict[str, t.Any]) -> RemoteRepo:
304333
"""Parse GitHub API response into RemoteRepo.
305334

src/vcspull/_internal/remotes/gitlab.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import contextlib
56
import logging
67
import typing as t
78
import urllib.parse
@@ -188,6 +189,8 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
188189
endpoint = "/search"
189190
page = 1
190191
count = 0
192+
total_available: int | None = None
193+
last_x_next_page: str | None = None
191194

192195
while count < options.limit:
193196
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -210,6 +213,15 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
210213

211214
self._log_rate_limit(headers)
212215

216+
# Capture pagination metadata from first page
217+
if page == 1:
218+
x_total = headers.get("x-total")
219+
if x_total is not None:
220+
with contextlib.suppress(ValueError, TypeError):
221+
total_available = int(x_total)
222+
223+
last_x_next_page = headers.get("x-next-page") or None
224+
213225
if not data:
214226
break
215227

@@ -228,6 +240,9 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
228240

229241
page += 1
230242

243+
# Warn if results were truncated by --limit
244+
self._warn_truncation(count, options.limit, total_available, last_x_next_page)
245+
231246
def _paginate_repos(
232247
self,
233248
endpoint: str,
@@ -253,6 +268,8 @@ def _paginate_repos(
253268
"""
254269
page = 1
255270
count = 0
271+
total_available: int | None = None
272+
last_x_next_page: str | None = None
256273

257274
while count < options.limit:
258275
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -279,6 +296,15 @@ def _paginate_repos(
279296

280297
self._log_rate_limit(headers)
281298

299+
# Capture pagination metadata from first page
300+
if page == 1:
301+
x_total = headers.get("x-total")
302+
if x_total is not None:
303+
with contextlib.suppress(ValueError, TypeError):
304+
total_available = int(x_total)
305+
306+
last_x_next_page = headers.get("x-next-page") or None
307+
282308
if not data:
283309
break
284310

@@ -297,6 +323,45 @@ def _paginate_repos(
297323

298324
page += 1
299325

326+
# Warn if results were truncated by --limit
327+
self._warn_truncation(count, options.limit, total_available, last_x_next_page)
328+
329+
def _warn_truncation(
330+
self,
331+
count: int,
332+
limit: int,
333+
total_available: int | None,
334+
x_next_page: str | None,
335+
) -> None:
336+
"""Warn if results were truncated by the --limit option.
337+
338+
Parameters
339+
----------
340+
count : int
341+
Number of repositories actually returned
342+
limit : int
343+
The configured limit
344+
total_available : int | None
345+
Value of x-total header (None if absent)
346+
x_next_page : str | None
347+
Value of x-next-page header (None if absent/empty)
348+
"""
349+
if count < limit:
350+
return
351+
352+
if total_available is not None and total_available > count:
353+
log.warning(
354+
"Showing %d of %d repositories (use --limit 0 to fetch all)",
355+
count,
356+
total_available,
357+
)
358+
elif x_next_page is not None:
359+
log.warning(
360+
"Showing %d repositories; more are available "
361+
"(use --limit 0 to fetch all)",
362+
count,
363+
)
364+
300365
def _log_rate_limit(self, headers: dict[str, str]) -> None:
301366
"""Log rate limit information from response headers.
302367

tests/_internal/remotes/test_github.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,3 +653,129 @@ def urlopen_side_effect(
653653
# Should have fetched at most 10 pages (1000 results)
654654
assert call_count <= 10, f"Expected at most 10 API calls, got {call_count}"
655655
assert len(repos) <= 1000
656+
657+
658+
# ---------------------------------------------------------------------------
659+
# Truncation warnings
660+
# ---------------------------------------------------------------------------
661+
662+
663+
def _make_github_repo(idx: int) -> dict[str, t.Any]:
664+
"""Create a minimal GitHub repo API object for testing."""
665+
return {
666+
"name": f"repo-{idx}",
667+
"clone_url": f"https://github.com/user/repo-{idx}.git",
668+
"ssh_url": f"git@github.com:user/repo-{idx}.git",
669+
"html_url": f"https://github.com/user/repo-{idx}",
670+
"description": f"Repo {idx}",
671+
"language": "Python",
672+
"topics": [],
673+
"stargazers_count": 10,
674+
"fork": False,
675+
"archived": False,
676+
"default_branch": "main",
677+
"owner": {"login": "user"},
678+
}
679+
680+
681+
class GitHubTruncationFixture(t.NamedTuple):
682+
"""Fixture for GitHub truncation warning test cases."""
683+
684+
test_id: str
685+
mode: ImportMode
686+
limit: int
687+
num_repos_on_server: int
688+
total_count: int | None # for search mode, total_count in JSON body
689+
expect_warning: bool
690+
expected_warning_fragment: str | None
691+
692+
693+
GITHUB_TRUNCATION_FIXTURES: list[GitHubTruncationFixture] = [
694+
GitHubTruncationFixture(
695+
test_id="search-truncated-with-total-count",
696+
mode=ImportMode.SEARCH,
697+
limit=2,
698+
num_repos_on_server=5,
699+
total_count=5,
700+
expect_warning=True,
701+
expected_warning_fragment="showing 2 of 5",
702+
),
703+
GitHubTruncationFixture(
704+
test_id="search-not-truncated",
705+
mode=ImportMode.SEARCH,
706+
limit=100,
707+
num_repos_on_server=3,
708+
total_count=3,
709+
expect_warning=False,
710+
expected_warning_fragment=None,
711+
),
712+
GitHubTruncationFixture(
713+
test_id="user-truncated-full-page",
714+
mode=ImportMode.USER,
715+
limit=3,
716+
num_repos_on_server=5,
717+
total_count=None,
718+
expect_warning=True,
719+
expected_warning_fragment="more may be available",
720+
),
721+
GitHubTruncationFixture(
722+
test_id="user-not-truncated",
723+
mode=ImportMode.USER,
724+
limit=100,
725+
num_repos_on_server=3,
726+
total_count=None,
727+
expect_warning=False,
728+
expected_warning_fragment=None,
729+
),
730+
]
731+
732+
733+
@pytest.mark.parametrize(
734+
list(GitHubTruncationFixture._fields),
735+
GITHUB_TRUNCATION_FIXTURES,
736+
ids=[f.test_id for f in GITHUB_TRUNCATION_FIXTURES],
737+
)
738+
def test_github_truncation_warning(
739+
test_id: str,
740+
mode: ImportMode,
741+
limit: int,
742+
num_repos_on_server: int,
743+
total_count: int | None,
744+
expect_warning: bool,
745+
expected_warning_fragment: str | None,
746+
monkeypatch: pytest.MonkeyPatch,
747+
caplog: pytest.LogCaptureFixture,
748+
) -> None:
749+
"""Test truncation warnings when results exceed --limit."""
750+
import logging
751+
752+
from tests._internal.remotes.conftest import MockHTTPResponse
753+
754+
caplog.set_level(logging.WARNING)
755+
756+
repos = [_make_github_repo(i) for i in range(num_repos_on_server)]
757+
rate_headers = {"x-ratelimit-remaining": "100", "x-ratelimit-limit": "60"}
758+
759+
if mode == ImportMode.SEARCH:
760+
body = json.dumps({"total_count": total_count, "items": repos}).encode()
761+
else:
762+
body = json.dumps(repos).encode()
763+
764+
def urlopen_side_effect(
765+
request: t.Any,
766+
timeout: int | None = None,
767+
) -> MockHTTPResponse:
768+
return MockHTTPResponse(body, rate_headers, 200)
769+
770+
# Mock urlopen: return all repos in one page
771+
monkeypatch.setattr("urllib.request.urlopen", urlopen_side_effect)
772+
773+
importer = GitHubImporter()
774+
options = ImportOptions(mode=mode, target="user", limit=limit)
775+
list(importer.fetch_repos(options))
776+
777+
if expect_warning:
778+
assert expected_warning_fragment is not None
779+
assert expected_warning_fragment in caplog.text.lower()
780+
else:
781+
assert "--limit" not in caplog.text.lower()

0 commit comments

Comments
 (0)