Add last crawl's stats object to CrawlConfigOut #2714

Merged 2 commits on Jul 24, 2025
4 changes: 4 additions & 0 deletions backend/btrixcloud/crawlconfigs.py
@@ -759,6 +759,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStats"] = last_crawl.get("stats")
             update_query["lastCrawlStopping"] = False
             update_query["isCrawlRunning"] = False

@@ -773,6 +774,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlTime"] = None
             update_query["lastCrawlState"] = None
             update_query["lastCrawlSize"] = 0
+            update_query["lastCrawlStats"] = None
             update_query["lastRun"] = None
             update_query["isCrawlRunning"] = False

@@ -802,6 +804,7 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         crawlconfig.lastCrawlShouldPause = crawl.shouldPause
         crawlconfig.lastCrawlPausedAt = crawl.pausedAt
         crawlconfig.lastCrawlPausedExpiry = None
+        crawlconfig.lastCrawlStats = crawl.stats if crawl.stats else None
         if crawl.pausedAt:
             crawlconfig.lastCrawlPausedExpiry = (
                 crawl.pausedAt + self.paused_expiry_delta

@@ -1313,6 +1316,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStats"] = last_crawl.get("stats")
         update_query["lastCrawlStopping"] = False
         update_query["isCrawlRunning"] = False
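In short: `stats_recompute_last` and `stats_recompute_all` now copy the last crawl's stats object into the workflow's update query (and reset it to `None` when no crawls remain), while `_add_running_curr_crawl_stats` surfaces live stats for a running crawl. A minimal sketch of the pattern, assuming a Motor/PyMongo-style collection and a `last_crawl` document shaped like the ones above; the function name and signature here are illustrative, not the PR's exact code:

```python
from typing import Optional
from uuid import UUID

# Sketch only: shows how the last crawl's stats dict rides along with the
# other "lastCrawl*" fields in a single $set update on the workflow doc.
async def update_last_crawl_fields(configs, cid: UUID, last_crawl: Optional[dict]):
    update_query: dict = {}
    if last_crawl:
        update_query["lastCrawlState"] = last_crawl.get("state")
        update_query["lastCrawlSize"] = sum(
            f.get("size", 0) for f in last_crawl.get("files", [])
        )
        # New in this PR: surface the crawl's stats (found/done/size)
        update_query["lastCrawlStats"] = last_crawl.get("stats")
    else:
        # No crawls left for this workflow: reset stats with the other fields
        update_query["lastCrawlStats"] = None
    await configs.find_one_and_update({"_id": cid}, {"$set": update_query})
```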
19 changes: 10 additions & 9 deletions backend/btrixcloud/models.py
@@ -273,6 +273,15 @@ class UserOrgInfoOut(BaseModel):
 ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]


+# ============================================================================
+class CrawlStats(BaseModel):
+    """Crawl Stats for pages and size"""
+
+    found: int = 0
+    done: int = 0
+    size: int = 0
+
+
 # ============================================================================

 ### CRAWL CONFIGS ###

@@ -505,6 +514,7 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     lastCrawlShouldPause: Optional[bool] = False
     lastCrawlPausedAt: Optional[datetime] = None
     lastCrawlPausedExpiry: Optional[datetime] = None
+    lastCrawlStats: Optional[CrawlStats] = None
     profileName: Optional[str] = None
     firstSeed: Optional[str] = None
     seedCount: int = 0

@@ -760,15 +770,6 @@ class CrawlFileOut(BaseModel):
     expireAt: Optional[str] = None


-# ============================================================================
-class CrawlStats(BaseModel):
-    """Crawl Stats for pages and size"""
-
-    found: int = 0
-    done: int = 0
-    size: int = 0
-
-
 # ============================================================================
 class CoreCrawlable(BaseModel):
     # pylint: disable=too-few-public-methods
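The net effect in models.py: `CrawlStats` moves above the crawl config models so `CrawlConfigOut` can reference it, and workflows gain an optional `lastCrawlStats` field. A standalone sketch of how the new field serializes — field names match the diff, but `WorkflowOut` is a stand-in for `CrawlConfigOut`, and `.model_dump()` assumes pydantic v2 (use `.dict()` on v1):

```python
from typing import Optional
from pydantic import BaseModel


class CrawlStats(BaseModel):
    """Crawl Stats for pages and size"""

    found: int = 0  # pages discovered
    done: int = 0   # pages crawled
    size: int = 0   # bytes crawled


class WorkflowOut(BaseModel):
    # Stand-in for CrawlConfigOut; only the new field is shown
    lastCrawlStats: Optional[CrawlStats] = None


# None when the workflow has never run (or its crawls were all deleted);
# otherwise a nested object in the JSON response.
print(WorkflowOut().model_dump())
# {'lastCrawlStats': None}
print(WorkflowOut(lastCrawlStats={"found": 5, "done": 5, "size": 1024}).model_dump())
# {'lastCrawlStats': {'found': 5, 'done': 5, 'size': 1024}}
```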
16 changes: 16 additions & 0 deletions backend/test/test_crawlconfigs.py
@@ -519,6 +519,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert workflow["lastRun"]
     assert workflow["lastCrawlSize"] > 0

+    stats = workflow["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+
     if last_crawl_id == admin_crawl_id:
         global _admin_crawl_cid
         _admin_crawl_cid = workflow["id"]

@@ -544,6 +549,11 @@ def test_workflow_total_size_and_last_crawl_stats(
     assert data["lastRun"]
     assert data["lastCrawlSize"] > 0

+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0
+

 def test_incremental_workflow_total_size_and_last_crawl_stats(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id

@@ -563,6 +573,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     last_crawl_started = data["lastCrawlStartTime"]
     last_crawl_finished = data["lastCrawlTime"]
     last_run = data["lastRun"]
+    last_stats = data["lastCrawlStats"]

     # Run new crawl in this workflow
     r = requests.post(

@@ -601,6 +612,10 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] > last_crawl_started
     assert data["lastCrawlTime"] > last_crawl_finished
     assert data["lastRun"] > last_run
+    stats = data["lastCrawlStats"]
+    assert stats["found"] > 0
+    assert stats["done"] > 0
+    assert stats["size"] > 0

     # Delete new crawl
     r = requests.post(

@@ -627,6 +642,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
     assert data["lastCrawlStartTime"] == last_crawl_started
     assert data["lastCrawlTime"] == last_crawl_finished
     assert data["lastRun"] == last_run
+    assert data["lastCrawlStats"] == last_stats


 def test_get_config_seeds(crawler_auth_headers, default_org_id, url_list_config_id):
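Taken together, the tests pin down the field's lifecycle: stats appear once a crawl finishes, update when a newer crawl completes, and roll back to the previous crawl's stats when the newest crawl is deleted. A hedged example of reading the new field from the workflow detail endpoint — the base URL is an assumption, and the path is inferred from the crawlconfigs API rather than quoted from this PR:

```python
import requests

API_PREFIX = "https://app.browsertrix.com/api"  # assumed deployment URL


def get_last_crawl_stats(org_id: str, workflow_id: str, headers: dict):
    """Fetch a workflow and return its lastCrawlStats (None if never run)."""
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{workflow_id}",
        headers=headers,
    )
    r.raise_for_status()
    # e.g. {"found": 37, "done": 37, "size": 2097152} after a finished crawl
    return r.json().get("lastCrawlStats")
```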