Skip to content

Commit 763c654

Browse files
tw4l, SuaYoo, and ikreymer
authored Jan 23, 2025
feat: Update collection sorting, metadata, stats (#2327)
- Refactors dashboard and org profile preview to use private API endpoint, to fix public collections not showing when the org visibility is hidden - Adds additional sorting options for collections - Adds unique page url counts for archived items, collections, and organizations to backend and exposes this in collections - Shows collection period (i.e. `dateEarliest` to `dateLatest`) in collections list - Shows same collection metadata in private and public views, updates private view info bar - Fixes "Update Org Profile" action item showing for crawler roles --------- Co-authored-by: sua yoo <sua@webrecorder.org> Co-authored-by: sua yoo <sua@suayoo.com> Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent f8976e6 commit 763c654

32 files changed

+480
-235
lines changed
 

‎backend/btrixcloud/basecrawls.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949

5050

5151
# ============================================================================
52-
# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
52+
# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-branches
5353
class BaseCrawlOps:
5454
"""operations that apply to all crawls"""
5555

@@ -300,6 +300,7 @@ async def delete_crawls(
300300
) -> tuple[int, dict[UUID, dict[str, int]], bool]:
301301
"""Delete a list of crawls by id for given org"""
302302
cids_to_update: dict[UUID, dict[str, int]] = {}
303+
collection_ids_to_update = set()
303304

304305
size = 0
305306

@@ -325,6 +326,10 @@ async def delete_crawls(
325326

326327
await self.page_ops.delete_crawl_pages(crawl_id, org.id)
327328

329+
if crawl.collectionIds:
330+
for coll_id in crawl.collectionIds:
331+
collection_ids_to_update.add(coll_id)
332+
328333
if type_ == "crawl":
329334
await self.delete_all_crawl_qa_files(crawl_id, org)
330335

@@ -361,6 +366,10 @@ async def delete_crawls(
361366

362367
await self.orgs.set_last_crawl_finished(org.id)
363368

369+
if collection_ids_to_update:
370+
for coll_id in collection_ids_to_update:
371+
await self.colls.update_collection_counts_and_tags(coll_id)
372+
364373
quota_reached = self.orgs.storage_quota_reached(org)
365374

366375
return res.deleted_count, cids_to_update, quota_reached

‎backend/btrixcloud/colls.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ async def list_collections(
396396
page = page - 1
397397
skip = page * page_size
398398

399-
match_query: dict[str, object] = {"oid": org.id}
399+
match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id}
400400

401401
if name:
402402
match_query["name"] = name
@@ -409,15 +409,33 @@ async def list_collections(
409409
elif access:
410410
match_query["access"] = access
411411

412-
aggregate = [{"$match": match_query}]
412+
aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [
413+
{"$match": match_query}
414+
]
413415

414416
if sort_by:
415-
if sort_by not in ("modified", "name", "description", "totalSize"):
417+
if sort_by not in (
418+
"created",
419+
"modified",
420+
"dateLatest",
421+
"name",
422+
"crawlCount",
423+
"pageCount",
424+
"totalSize",
425+
"description",
426+
"caption",
427+
):
416428
raise HTTPException(status_code=400, detail="invalid_sort_by")
417429
if sort_direction not in (1, -1):
418430
raise HTTPException(status_code=400, detail="invalid_sort_direction")
419431

420-
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
432+
sort_query = {sort_by: sort_direction}
433+
434+
# add secondary sort keys:
435+
if sort_by == "dateLatest":
436+
sort_query["dateEarliest"] = sort_direction
437+
438+
aggregate.extend([{"$sort": sort_query}])
421439

422440
aggregate.extend(
423441
[
@@ -564,11 +582,14 @@ async def recalculate_org_collection_counts_tags(self, org: Organization):
564582

565583
async def update_collection_counts_and_tags(self, collection_id: UUID):
566584
"""Set current crawl info in config when crawl begins"""
585+
# pylint: disable=too-many-locals
567586
crawl_count = 0
568587
page_count = 0
569588
total_size = 0
570589
tags = []
571590

591+
crawl_ids = []
592+
572593
coll = await self.get_collection(collection_id)
573594
org = await self.orgs.get_org_by_id(coll.oid)
574595

@@ -582,25 +603,30 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
582603
total_size += file.size
583604

584605
try:
585-
_, crawl_pages = await self.page_ops.list_pages(
606+
_, crawl_page_count = await self.page_ops.list_pages(
586607
crawl.id, org, page_size=1_000_000
587608
)
588-
page_count += crawl_pages
609+
page_count += crawl_page_count
589610
# pylint: disable=broad-exception-caught
590611
except Exception:
591612
pass
592613

593614
if crawl.tags:
594615
tags.extend(crawl.tags)
595616

617+
crawl_ids.append(crawl.id)
618+
596619
sorted_tags = [tag for tag, count in Counter(tags).most_common()]
597620

621+
unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
622+
598623
await self.collections.find_one_and_update(
599624
{"_id": collection_id},
600625
{
601626
"$set": {
602627
"crawlCount": crawl_count,
603628
"pageCount": page_count,
629+
"uniquePageCount": unique_page_count,
604630
"totalSize": total_size,
605631
"tags": sorted_tags,
606632
}
@@ -618,6 +644,7 @@ async def recalculate_org_collection_dates(self, org: Organization):
618644

619645
async def update_collection_dates(self, coll_id: UUID):
620646
"""Update collection earliest and latest dates from page timestamps"""
647+
# pylint: disable=too-many-locals
621648
coll = await self.get_collection(coll_id)
622649
crawl_ids = await self.get_collection_crawl_ids(coll_id)
623650

‎backend/btrixcloud/db.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from .migrations import BaseMigration
1818

1919

20-
CURR_DB_VERSION = "0040"
20+
CURR_DB_VERSION = "0041"
2121

2222

2323
# ============================================================================
@@ -96,7 +96,7 @@ async def update_and_prepare_db(
9696
await ping_db(mdb)
9797
print("Database setup started", flush=True)
9898
if await run_db_migrations(
99-
mdb, user_manager, page_ops, org_ops, background_job_ops
99+
mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops
100100
):
101101
await drop_indexes(mdb)
102102

@@ -117,8 +117,10 @@ async def update_and_prepare_db(
117117

118118

119119
# ============================================================================
120-
# pylint: disable=too-many-locals
121-
async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job_ops):
120+
# pylint: disable=too-many-locals, too-many-arguments
121+
async def run_db_migrations(
122+
mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops
123+
):
122124
"""Run database migrations."""
123125

124126
# if first run, just set version and exit
@@ -155,6 +157,7 @@ async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job
155157
page_ops=page_ops,
156158
org_ops=org_ops,
157159
background_job_ops=background_job_ops,
160+
coll_ops=coll_ops,
158161
)
159162
if await migration.run():
160163
migrations_run = True

‎backend/btrixcloud/main.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ def main() -> None:
255255
crawls.set_page_ops(page_ops)
256256
upload_ops.set_page_ops(page_ops)
257257

258-
org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops)
258+
org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops, page_ops)
259259

260260
user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
261261

‎backend/btrixcloud/migrations/migration_0040_archived_item_page_count.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,13 @@ async def migrate_up(self):
3131
)
3232
return
3333

34-
async for crawl_raw in crawls_mdb.find({"pageCount": None}):
34+
async for crawl_raw in crawls_mdb.find({}):
3535
crawl_id = crawl_raw["_id"]
3636
try:
37-
await self.page_ops.set_archived_item_page_count(crawl_id)
37+
await self.page_ops.set_archived_item_page_counts(crawl_id)
3838
# pylint: disable=broad-exception-caught
3939
except Exception as err:
4040
print(
41-
f"Error saving pageCount for archived item {crawl_id}: {err}",
41+
f"Error saving page counts for archived item {crawl_id}: {err}",
4242
flush=True,
4343
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""
2+
Migration 0041 - Rationalize page counts
3+
"""
4+
5+
from btrixcloud.migrations import BaseMigration
6+
7+
8+
MIGRATION_VERSION = "0041"
9+
10+
11+
class Migration(BaseMigration):
12+
"""Migration class."""
13+
14+
# pylint: disable=unused-argument
15+
def __init__(self, mdb, **kwargs):
16+
super().__init__(mdb, migration_version=MIGRATION_VERSION)
17+
18+
self.coll_ops = kwargs.get("coll_ops")
19+
20+
async def migrate_up(self):
21+
"""Perform migration up.
22+
23+
Recalculate collections to get new page and unique page counts
24+
"""
25+
colls_mdb = self.mdb["collections"]
26+
27+
if self.coll_ops is None:
28+
print(
29+
"Unable to set collection page counts, missing coll_ops",
30+
flush=True,
31+
)
32+
return
33+
34+
async for coll in colls_mdb.find({}):
35+
coll_id = coll["_id"]
36+
try:
37+
await self.coll_ops.update_collection_counts_and_tags(coll_id)
38+
# pylint: disable=broad-exception-caught
39+
except Exception as err:
40+
print(
41+
f"Unable to update page counts for collection {coll_id}: {err}",
42+
flush=True,
43+
)

‎backend/btrixcloud/models.py

+8
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,7 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel):
798798
reviewStatus: ReviewStatus = None
799799

800800
pageCount: Optional[int] = 0
801+
uniquePageCount: Optional[int] = 0
801802

802803
filePageCount: Optional[int] = 0
803804
errorPageCount: Optional[int] = 0
@@ -875,6 +876,7 @@ class CrawlOut(BaseMongoModel):
875876
lastQAStarted: Optional[datetime] = None
876877

877878
pageCount: Optional[int] = 0
879+
uniquePageCount: Optional[int] = 0
878880
filePageCount: Optional[int] = 0
879881
errorPageCount: Optional[int] = 0
880882

@@ -1250,6 +1252,7 @@ class Collection(BaseMongoModel):
12501252

12511253
crawlCount: Optional[int] = 0
12521254
pageCount: Optional[int] = 0
1255+
uniquePageCount: Optional[int] = 0
12531256
totalSize: Optional[int] = 0
12541257

12551258
dateEarliest: Optional[datetime] = None
@@ -1303,6 +1306,7 @@ class CollOut(BaseMongoModel):
13031306

13041307
crawlCount: Optional[int] = 0
13051308
pageCount: Optional[int] = 0
1309+
uniquePageCount: Optional[int] = 0
13061310
totalSize: Optional[int] = 0
13071311

13081312
dateEarliest: Optional[datetime] = None
@@ -1339,6 +1343,7 @@ class PublicCollOut(BaseMongoModel):
13391343

13401344
crawlCount: Optional[int] = 0
13411345
pageCount: Optional[int] = 0
1346+
uniquePageCount: Optional[int] = 0
13421347
totalSize: Optional[int] = 0
13431348

13441349
dateEarliest: Optional[datetime] = None
@@ -1919,6 +1924,9 @@ class OrgMetrics(BaseModel):
19191924
pageCount: int
19201925
crawlPageCount: int
19211926
uploadPageCount: int
1927+
uniquePageCount: int
1928+
crawlUniquePageCount: int
1929+
uploadUniquePageCount: int
19221930
profileCount: int
19231931
workflowsRunningCount: int
19241932
maxConcurrentCrawls: int

‎backend/btrixcloud/operator/crawls.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1534,7 +1534,7 @@ async def do_crawl_finished_tasks(
15341534
)
15351535

15361536
if state in SUCCESSFUL_STATES and crawl.oid:
1537-
await self.page_ops.set_archived_item_page_count(crawl.id)
1537+
await self.page_ops.set_archived_item_page_counts(crawl.id)
15381538
await self.org_ops.inc_org_bytes_stored(
15391539
crawl.oid, status.filesAddedSize, "crawl"
15401540
)

‎backend/btrixcloud/ops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def init_ops() -> Tuple[
9797

9898
background_job_ops.set_ops(crawl_ops, profile_ops)
9999

100-
org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops)
100+
org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops, page_ops)
101101

102102
user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
103103

0 commit comments

Comments
 (0)
Please sign in to comment.