Skip to content

Commit 9f97e0a

Browse files
committed
feat(import[truncation]): Warn when results are capped by --limit
why: Users saw "Found 100 repositories" with no indication that hundreds more were silently truncated, leading to incomplete imports.

what:
- GitLab: Extract x-total and x-next-page headers to detect truncation
- GitLab: Add _warn_truncation() method with two warning variants
- GitHub search: Use total_count from JSON body to detect truncation
- GitHub user/org: Use mid-page limit hit as "more available" signal
- Add parametrized truncation warning tests for both providers
1 parent 43b7ec8 commit 9f97e0a

File tree

4 files changed

+355
-7
lines changed

4 files changed

+355
-7
lines changed

src/vcspull/_internal/remotes/github.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
191191
endpoint = "/search/repositories"
192192
page = 1
193193
count = 0
194+
total_available: int | None = None
194195

195196
while count < options.limit:
196197
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -212,12 +213,14 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
212213
self._log_rate_limit(headers)
213214

214215
total_count = data.get("total_count", 0)
215-
if page == 1 and total_count > 1000:
216-
log.warning(
217-
"GitHub search returned %d total results but API limits "
218-
"to 1000; consider narrowing your query",
219-
total_count,
220-
)
216+
if page == 1:
217+
total_available = total_count
218+
if total_count > 1000:
219+
log.warning(
220+
"GitHub search returned %d total results but API limits "
221+
"to 1000; consider narrowing your query",
222+
total_count,
223+
)
221224

222225
items = data.get("items", [])
223226
if not items:
@@ -242,6 +245,18 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
242245

243246
page += 1
244247

248+
# Warn if results were truncated by --limit
249+
if (
250+
count >= options.limit
251+
and total_available is not None
252+
and total_available > count
253+
):
254+
log.warning(
255+
"Showing %d of %d repositories (use --limit 0 to fetch all)",
256+
count,
257+
total_available,
258+
)
259+
245260
def _paginate_repos(
246261
self,
247262
endpoint: str,
@@ -263,6 +278,7 @@ def _paginate_repos(
263278
"""
264279
page = 1
265280
count = 0
281+
more_available = False
266282

267283
while count < options.limit:
268284
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -285,8 +301,12 @@ def _paginate_repos(
285301
if not data:
286302
break
287303

288-
for item in data:
304+
for idx, item in enumerate(data):
289305
if count >= options.limit:
306+
# Remaining items on this page or a full page = more exist
307+
more_available = (
308+
idx < len(data) - 1 or len(data) == DEFAULT_PER_PAGE
309+
)
290310
break
291311

292312
repo = self._parse_repo(item)
@@ -300,6 +320,15 @@ def _paginate_repos(
300320

301321
page += 1
302322

323+
# Warn if results were truncated by --limit
324+
# GitHub user/org endpoints don't return total count
325+
if count >= options.limit and more_available:
326+
log.warning(
327+
"Showing %d repositories; more may be available "
328+
"(use --limit 0 to fetch all)",
329+
count,
330+
)
331+
303332
def _parse_repo(self, data: dict[str, t.Any]) -> RemoteRepo:
304333
"""Parse GitHub API response into RemoteRepo.
305334

src/vcspull/_internal/remotes/gitlab.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import contextlib
56
import logging
67
import typing as t
78
import urllib.parse
@@ -188,6 +189,8 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
188189
endpoint = "/search"
189190
page = 1
190191
count = 0
192+
total_available: int | None = None
193+
last_x_next_page: str | None = None
191194

192195
while count < options.limit:
193196
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -210,6 +213,15 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
210213

211214
self._log_rate_limit(headers)
212215

216+
# Capture pagination metadata from first page
217+
if page == 1:
218+
x_total = headers.get("x-total")
219+
if x_total is not None:
220+
with contextlib.suppress(ValueError, TypeError):
221+
total_available = int(x_total)
222+
223+
last_x_next_page = headers.get("x-next-page") or None
224+
213225
if not data:
214226
break
215227

@@ -228,6 +240,9 @@ def _fetch_search(self, options: ImportOptions) -> t.Iterator[RemoteRepo]:
228240

229241
page += 1
230242

243+
# Warn if results were truncated by --limit
244+
self._warn_truncation(count, options.limit, total_available, last_x_next_page)
245+
231246
def _paginate_repos(
232247
self,
233248
endpoint: str,
@@ -253,6 +268,8 @@ def _paginate_repos(
253268
"""
254269
page = 1
255270
count = 0
271+
total_available: int | None = None
272+
last_x_next_page: str | None = None
256273

257274
while count < options.limit:
258275
# Always use DEFAULT_PER_PAGE to maintain consistent pagination offset.
@@ -279,6 +296,15 @@ def _paginate_repos(
279296

280297
self._log_rate_limit(headers)
281298

299+
# Capture pagination metadata from first page
300+
if page == 1:
301+
x_total = headers.get("x-total")
302+
if x_total is not None:
303+
with contextlib.suppress(ValueError, TypeError):
304+
total_available = int(x_total)
305+
306+
last_x_next_page = headers.get("x-next-page") or None
307+
282308
if not data:
283309
break
284310

@@ -297,6 +323,45 @@ def _paginate_repos(
297323

298324
page += 1
299325

326+
# Warn if results were truncated by --limit
327+
self._warn_truncation(count, options.limit, total_available, last_x_next_page)
328+
329+
def _warn_truncation(
330+
self,
331+
count: int,
332+
limit: int,
333+
total_available: int | None,
334+
x_next_page: str | None,
335+
) -> None:
336+
"""Warn if results were truncated by the --limit option.
337+
338+
Parameters
339+
----------
340+
count : int
341+
Number of repositories actually returned
342+
limit : int
343+
The configured limit
344+
total_available : int | None
345+
Value of x-total header (None if absent)
346+
x_next_page : str | None
347+
Value of x-next-page header (None if absent/empty)
348+
"""
349+
if count < limit:
350+
return
351+
352+
if total_available is not None and total_available > count:
353+
log.warning(
354+
"Showing %d of %d repositories (use --limit 0 to fetch all)",
355+
count,
356+
total_available,
357+
)
358+
elif x_next_page is not None:
359+
log.warning(
360+
"Showing %d repositories; more are available "
361+
"(use --limit 0 to fetch all)",
362+
count,
363+
)
364+
300365
def _log_rate_limit(self, headers: dict[str, str]) -> None:
301366
"""Log rate limit information from response headers.
302367

tests/_internal/remotes/test_github.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,3 +653,129 @@ def urlopen_side_effect(
653653
# Should have fetched at most 10 pages (1000 results)
654654
assert call_count <= 10, f"Expected at most 10 API calls, got {call_count}"
655655
assert len(repos) <= 1000
656+
657+
658+
# ---------------------------------------------------------------------------
659+
# Truncation warnings
660+
# ---------------------------------------------------------------------------
661+
662+
663+
def _make_github_repo(idx: int) -> dict[str, t.Any]:
664+
"""Create a minimal GitHub repo API object for testing."""
665+
return {
666+
"name": f"repo-{idx}",
667+
"clone_url": f"https://github.com/user/repo-{idx}.git",
668+
"ssh_url": f"git@github.com:user/repo-{idx}.git",
669+
"html_url": f"https://github.com/user/repo-{idx}",
670+
"description": f"Repo {idx}",
671+
"language": "Python",
672+
"topics": [],
673+
"stargazers_count": 10,
674+
"fork": False,
675+
"archived": False,
676+
"default_branch": "main",
677+
"owner": {"login": "user"},
678+
}
679+
680+
681+
class GitHubTruncationFixture(t.NamedTuple):
682+
"""Fixture for GitHub truncation warning test cases."""
683+
684+
test_id: str
685+
mode: ImportMode
686+
limit: int
687+
num_repos_on_server: int
688+
total_count: int | None # for search mode, total_count in JSON body
689+
expect_warning: bool
690+
expected_warning_fragment: str | None
691+
692+
693+
GITHUB_TRUNCATION_FIXTURES: list[GitHubTruncationFixture] = [
694+
GitHubTruncationFixture(
695+
test_id="search-truncated-with-total-count",
696+
mode=ImportMode.SEARCH,
697+
limit=2,
698+
num_repos_on_server=5,
699+
total_count=5,
700+
expect_warning=True,
701+
expected_warning_fragment="showing 2 of 5",
702+
),
703+
GitHubTruncationFixture(
704+
test_id="search-not-truncated",
705+
mode=ImportMode.SEARCH,
706+
limit=100,
707+
num_repos_on_server=3,
708+
total_count=3,
709+
expect_warning=False,
710+
expected_warning_fragment=None,
711+
),
712+
GitHubTruncationFixture(
713+
test_id="user-truncated-full-page",
714+
mode=ImportMode.USER,
715+
limit=3,
716+
num_repos_on_server=5,
717+
total_count=None,
718+
expect_warning=True,
719+
expected_warning_fragment="more may be available",
720+
),
721+
GitHubTruncationFixture(
722+
test_id="user-not-truncated",
723+
mode=ImportMode.USER,
724+
limit=100,
725+
num_repos_on_server=3,
726+
total_count=None,
727+
expect_warning=False,
728+
expected_warning_fragment=None,
729+
),
730+
]
731+
732+
733+
@pytest.mark.parametrize(
734+
list(GitHubTruncationFixture._fields),
735+
GITHUB_TRUNCATION_FIXTURES,
736+
ids=[f.test_id for f in GITHUB_TRUNCATION_FIXTURES],
737+
)
738+
def test_github_truncation_warning(
739+
test_id: str,
740+
mode: ImportMode,
741+
limit: int,
742+
num_repos_on_server: int,
743+
total_count: int | None,
744+
expect_warning: bool,
745+
expected_warning_fragment: str | None,
746+
monkeypatch: pytest.MonkeyPatch,
747+
caplog: pytest.LogCaptureFixture,
748+
) -> None:
749+
"""Test truncation warnings when results exceed --limit."""
750+
import logging
751+
752+
from tests._internal.remotes.conftest import MockHTTPResponse
753+
754+
caplog.set_level(logging.WARNING)
755+
756+
repos = [_make_github_repo(i) for i in range(num_repos_on_server)]
757+
rate_headers = {"x-ratelimit-remaining": "100", "x-ratelimit-limit": "60"}
758+
759+
if mode == ImportMode.SEARCH:
760+
body = json.dumps({"total_count": total_count, "items": repos}).encode()
761+
else:
762+
body = json.dumps(repos).encode()
763+
764+
def urlopen_side_effect(
765+
request: t.Any,
766+
timeout: int | None = None,
767+
) -> MockHTTPResponse:
768+
return MockHTTPResponse(body, rate_headers, 200)
769+
770+
# Mock urlopen: return all repos in one page
771+
monkeypatch.setattr("urllib.request.urlopen", urlopen_side_effect)
772+
773+
importer = GitHubImporter()
774+
options = ImportOptions(mode=mode, target="user", limit=limit)
775+
list(importer.fetch_repos(options))
776+
777+
if expect_warning:
778+
assert expected_warning_fragment is not None
779+
assert expected_warning_fragment in caplog.text.lower()
780+
else:
781+
assert "--limit" not in caplog.text.lower()

0 commit comments

Comments
 (0)