
Commit 6797b41

tw4l and emma-sg authored
Add pageCount to crawls and uploads and use in frontend for page counts (#2315)

Fixes #2257

This is a follow-up to the public collections work, which added pages to the database for uploads. All crawls and uploads now have a `pageCount` field, which is populated when the item is successfully added. A new migration is also added to populate the field for existing archived items that don't have it set yet.

`OrgMetrics` has also been modified to include `crawlPageCount` and `uploadPageCount`, with the total of both in `pageCount`; all three are shown on the frontend org dashboard. The frontend has been updated to use `pageCount` rather than `stats.done` wherever appropriate, so archived item lists and detail pages now show a consistent page count for both crawls and uploads.

### New functionality

- Deploy this branch
- Create new crawls and uploads and verify that the page count appears correctly throughout the frontend for all new crawls and uploads

### Migration

- Deploy from latest main
- Create some crawls and uploads
- Switch to this branch and re-deploy
- Verify the migration ran without errors in the backend logs
- Verify that the page count has been populated by checking archived item lists, crawl and upload detail pages, and the dashboard, ensuring there are no longer any missing page counts

---------

Co-authored-by: emma <[email protected]>
1 parent 5684e89 commit 6797b41
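For the migration verification steps above, a quick way to spot items that were missed is to fetch each archived item and check its `pageCount`, mirroring the assertions added to the backend tests below. A minimal sketch, assuming a deployed instance; the base URL, org ID, and token here are placeholders, not values from this commit:

```python
import requests

API_PREFIX = "https://btrix.example.com/api"   # placeholder deployment URL
ORG_ID = "<org-id>"                            # placeholder org ID
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth token


def check_page_counts(item_ids: list[str]) -> None:
    """Fetch each archived item and flag any that still lack pageCount."""
    for item_id in item_ids:
        r = requests.get(
            f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{item_id}",
            headers=HEADERS,
        )
        r.raise_for_status()
        page_count = r.json().get("pageCount")
        flag = "" if page_count else "  <-- missing"
        print(f"{item_id}: pageCount={page_count}{flag}")
```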

File tree

18 files changed: +435 −116

backend/btrixcloud/db.py (+1 −1)

```diff
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0039"
+CURR_DB_VERSION = "0040"
 
 
 # ============================================================================
```
backend/btrixcloud/migrations/ (new migration 0040 file, +43)

```diff
@@ -0,0 +1,43 @@
+"""
+Migration 0040 -- archived item pageCount
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0040"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Calculate and store pageCount for archived items that don't have it yet
+        """
+        crawls_mdb = self.mdb["crawls"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to set pageCount for archived items, missing page_ops",
+                flush=True,
+            )
+            return
+
+        async for crawl_raw in crawls_mdb.find({"pageCount": None}):
+            crawl_id = crawl_raw["_id"]
+            try:
+                await self.page_ops.set_archived_item_page_count(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error saving pageCount for archived item {crawl_id}: {err}",
+                    flush=True,
+                )
```
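One detail that makes the backfill query above work: in MongoDB, matching a field against null also matches documents where the field is absent entirely, so `find({"pageCount": None})` picks up both never-set and explicitly-null items. A standalone pymongo sketch demonstrating this; the connection URI and database name are placeholders:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
crawls = client["btrix-test"]["crawls"]            # placeholder db name

crawls.insert_many([
    {"_id": "a"},                     # pageCount missing entirely
    {"_id": "b", "pageCount": None},  # pageCount explicitly null
    {"_id": "c", "pageCount": 5},     # pageCount already set
])

# PyMongo translates None to a BSON null query, which matches both
# null values and missing fields -- so only "c" is excluded.
matched = sorted(doc["_id"] for doc in crawls.find({"pageCount": None}))
print(matched)  # ['a', 'b']
```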

backend/btrixcloud/models.py (+5)

```diff
@@ -797,6 +797,8 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel):
 
     reviewStatus: ReviewStatus = None
 
+    pageCount: Optional[int] = 0
+
     filePageCount: Optional[int] = 0
     errorPageCount: Optional[int] = 0
 
@@ -872,6 +874,7 @@ class CrawlOut(BaseMongoModel):
     lastQAState: Optional[str] = None
     lastQAStarted: Optional[datetime] = None
 
+    pageCount: Optional[int] = 0
     filePageCount: Optional[int] = 0
     errorPageCount: Optional[int] = 0
 
@@ -1914,6 +1917,8 @@ class OrgMetrics(BaseModel):
     crawlCount: int
     uploadCount: int
     pageCount: int
+    crawlPageCount: int
+    uploadPageCount: int
     profileCount: int
     workflowsRunningCount: int
     maxConcurrentCrawls: int
```

backend/btrixcloud/operator/crawls.py (+1)

```diff
@@ -1534,6 +1534,7 @@ async def do_crawl_finished_tasks(
         )
 
         if state in SUCCESSFUL_STATES and crawl.oid:
+            await self.page_ops.set_archived_item_page_count(crawl.id)
             await self.org_ops.inc_org_bytes_stored(
                 crawl.oid, status.filesAddedSize, "crawl"
             )
```

backend/btrixcloud/orgs.py (+9 −2)

```diff
@@ -939,7 +939,10 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
         archived_item_count = 0
         crawl_count = 0
         upload_count = 0
+
         page_count = 0
+        crawl_page_count = 0
+        upload_page_count = 0
 
         async for item_data in self.crawls_db.find({"oid": org.id}):
             item = BaseCrawl.from_dict(item_data)
@@ -948,10 +951,12 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
             archived_item_count += 1
             if item.type == "crawl":
                 crawl_count += 1
+                crawl_page_count += item.pageCount or 0
             if item.type == "upload":
                 upload_count += 1
-            if item.stats:
-                page_count += item.stats.done
+                upload_page_count += item.pageCount or 0
+            if item.pageCount:
+                page_count += item.pageCount
 
         profile_count = await self.profiles_db.count_documents({"oid": org.id})
         workflows_running_count = await self.crawls_db.count_documents(
@@ -975,6 +980,8 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
             "crawlCount": crawl_count,
             "uploadCount": upload_count,
             "pageCount": page_count,
+            "crawlPageCount": crawl_page_count,
+            "uploadPageCount": upload_page_count,
             "profileCount": profile_count,
             "workflowsRunningCount": workflows_running_count,
             "maxConcurrentCrawls": max_concurrent_crawls,
```

backend/btrixcloud/pages.py (+10)

```diff
@@ -92,6 +92,8 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             if pages_buffer:
                 await self._add_pages_to_db(crawl_id, pages_buffer)
 
+            await self.set_archived_item_page_count(crawl_id)
+
             print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
         except Exception as err:
@@ -661,6 +663,14 @@ def get_crawl_type_from_pages_route(self, request: Request):
 
         return crawl_type
 
+    async def set_archived_item_page_count(self, crawl_id: str):
+        """Store archived item page count in crawl document"""
+        _, page_count = await self.list_pages(crawl_id)
+
+        await self.crawls.find_one_and_update(
+            {"_id": crawl_id}, {"$set": {"pageCount": page_count}}
+        )
+
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
```
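The new helper reuses `list_pages()` for its total, keeping a single code path for page counting. For illustration, the same effect expressed directly against the database with motor; this sketch assumes pages live in a `pages` collection keyed by a `crawl_id` field, which is not shown in this diff:

```python
from motor.motor_asyncio import AsyncIOMotorDatabase


async def set_page_count(mdb: AsyncIOMotorDatabase, crawl_id: str) -> int:
    """Count a crawl's pages and store the total on its crawl document."""
    # Assumed collection and field names -- not confirmed by this diff.
    page_count = await mdb["pages"].count_documents({"crawl_id": crawl_id})
    await mdb["crawls"].find_one_and_update(
        {"_id": crawl_id}, {"$set": {"pageCount": page_count}}
    )
    return page_count
```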

backend/test/test_run_crawl.py (+8)

```diff
@@ -877,6 +877,14 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
     )
     assert r.status_code == 403
 
+    # Check that pageCount was stored on crawl
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["pageCount"] > 0
+
 
 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
     note_text = "testing"
```

backend/test/test_uploads.py (+8)

```diff
@@ -274,6 +274,14 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page.get("modified") is None
     assert page.get("approved") is None
 
+    # Check that pageCount was stored on upload
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["pageCount"] > 0
+
 
 def test_replace_upload(
     admin_auth_headers, default_org_id, uploads_collection_id, upload_id
```

frontend/src/features/archived-items/archived-item-list.ts (+18 −1)

```diff
@@ -252,7 +252,24 @@ export class ArchivedItemListItem extends BtrixElement {
           </btrix-table-cell>
           <btrix-table-cell class="tabular-nums">
             ${isUpload
-              ? notApplicable
+              ? html`<sl-tooltip
+                  hoist
+                  @click=${this.onTooltipClick}
+                  content=${msg(
+                    str`${this.localize.number(
+                      this.item.pageCount ? +this.item.pageCount : 0,
+                    )}`,
+                  )}
+                >
+                  <div class="min-w-4">
+                    ${this.localize.number(
+                      this.item.pageCount ? +this.item.pageCount : 0,
+                      {
+                        notation: "compact",
+                      },
+                    )}
+                  </div>
+                </sl-tooltip>`
               : html`<sl-tooltip
                   hoist
                   @click=${this.onTooltipClick}
```

frontend/src/features/archived-items/crawl-list.ts (+2 −1)

```diff
@@ -191,12 +191,13 @@ export class CrawlListItem extends BtrixElement {
         </btrix-table-cell>
         <btrix-table-cell>
           ${this.safeRender((crawl) => {
-            const pagesComplete = +(crawl.stats?.done || 0);
             const pagesFound = +(crawl.stats?.found || 0);
             if (crawl.finished) {
+              const pagesComplete = crawl.pageCount ? +crawl.pageCount : 0;
               return `${this.localize.number(pagesComplete, { notation: "compact" })} ${pluralOf("pages", pagesComplete)}`;
             }
+            const pagesComplete = +(crawl.stats?.done || 0);
             return `${this.localize.number(pagesComplete, { notation: "compact" })} / ${this.localize.number(pagesFound, { notation: "compact" })} ${pluralOf("pages", pagesFound)}`;
           })}
         </btrix-table-cell>
```

frontend/src/pages/org/archived-item-detail/archived-item-detail.ts (+13 −2)

```diff
@@ -859,7 +859,7 @@
             ? html`${this.item.fileSize
                 ? html`${this.localize.bytes(this.item.fileSize || 0, {
                     unitDisplay: "narrow",
-                  })}${this.item.stats
+                  })}${this.item.stats?.done
                     ? html`<span>,</span
                       ><span
                         class="tracking-tighter${this.isActive
@@ -873,7 +873,18 @@
                       <span
                         >${pluralOf("pages", +this.item.stats.found)}</span
                       >`
-                    : ""}`
+                    : html`<span>,</span
+                      ><span>
+                        ${this.localize.number(
+                          this.item.pageCount ? +this.item.pageCount : 0,
+                        )}
+                      </span>
+                      <span
+                        >${pluralOf(
+                          "pages",
+                          this.item.pageCount ? +this.item.pageCount : 0,
+                        )}</span
+                      >`}`
                 : html`<span class="text-0-400">${msg("Unknown")}</span>`}`
             : html`<sl-skeleton class="h-[16px] w-24"></sl-skeleton>`}
         </btrix-desc-list-item>
```

frontend/src/pages/org/dashboard.ts (+24 −1)

```diff
@@ -28,6 +28,8 @@ type Metrics = {
   crawlCount: number;
   uploadCount: number;
   pageCount: number;
+  crawlPageCount: number;
+  uploadPageCount: number;
   profileCount: number;
   workflowsRunningCount: number;
   maxConcurrentCrawls: number;
@@ -236,10 +238,31 @@ export class Dashboard extends BtrixElement {
             pluralLabel: msg("Crawl Workflows Waiting"),
             iconProps: { name: "hourglass-split", color: "violet" },
           })}
+          <sl-divider
+            style="--spacing:var(--sl-spacing-small)"
+          ></sl-divider>
           ${this.renderStat({
-            value: metrics.pageCount,
+            value: metrics.crawlPageCount,
             singleLabel: msg("Page Crawled"),
             pluralLabel: msg("Pages Crawled"),
+            iconProps: {
+              name: "file-richtext-fill",
+              color: this.colors.crawls,
+            },
+          })}
+          ${this.renderStat({
+            value: metrics.uploadPageCount,
+            singleLabel: msg("Page Uploaded"),
+            pluralLabel: msg("Pages Uploaded"),
+            iconProps: {
+              name: "file-richtext-fill",
+              color: this.colors.uploads,
+            },
+          })}
+          ${this.renderStat({
+            value: metrics.pageCount,
+            singleLabel: msg("Page Total"),
+            pluralLabel: msg("Pages Total"),
             iconProps: { name: "file-richtext-fill" },
           })}
         </dl>
```

frontend/src/types/crawler.ts (+1)

```diff
@@ -164,6 +164,7 @@ type ArchivedItemBase = {
   activeQAStats: { done: number; found: number } | null;
   lastQAState: CrawlState | null;
   lastQAStarted: string | null;
+  pageCount?: number;
   filePageCount?: number;
   errorPageCount?: number;
 };
```

frontend/webpack.config.js (−1)

```diff
@@ -5,7 +5,6 @@ const childProcess = require("child_process");
 const fs = require("fs");
 const path = require("path");
 
-
 const CopyPlugin = require("copy-webpack-plugin");
 const ForkTsCheckerWebpackPlugin = require("fork-ts-checker-webpack-plugin");
 const HtmlWebpackPlugin = require("html-webpack-plugin");
```
