From 22a3bb216c977a657afd1e87b30b2fe97421b554 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 29 Jan 2025 10:08:49 -0800
Subject: [PATCH 01/18] add 'filename' to Page model, fill in 'filename' if available for new crawls

set 'filename' when readding pages

work for #2348
---
 backend/btrixcloud/models.py   | 1 +
 backend/btrixcloud/pages.py    | 1 +
 backend/btrixcloud/storages.py | 4 +++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0e57f200a..bec8e2d4bd 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2493,6 +2493,7 @@ class Page(BaseMongoModel):
     loadState: Optional[int] = None
     status: Optional[int] = None
     mime: Optional[str] = None
+    filename: Optional[str] = None
 
     # manual review
     userid: Optional[UUID] = None

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 4149a3e9d3..4d1d79253d 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -127,6 +127,7 @@ def _get_page_from_dict(
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
+            filename=page_dict.get("filename"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index e167449eb5..64135015be 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(
             line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = wacz_filename
+                yield page_json
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []

From b85f6a9340160debad889a15a78dff74e9a02be3 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 29 Jan 2025 14:36:20 -0500
Subject: [PATCH 02/18] Add migration to backfill page filename
---
 backend/btrixcloud/db.py                 |  2 +-
 .../migration_0042_page_filenames.py     | 50 +++++++++++++++++++
 backend/btrixcloud/pages.py              | 33 ++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0042_page_filenames.py

diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index a16964626f..9645f56307 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"
 
 
 # ============================================================================

diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
new file mode 100644
index 0000000000..64dc18ec04
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
+"""
+Migration 0042 - Add filename to pages
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0042"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add filename to all pages that don't currently have it stored,
+        iterating through each archived item and its WACZ files as necessary
+        """
+        pages_mdb = self.mdb["pages"]
+
+        crawl_ids_to_update = set()
+
+        if self.page_ops is None:
+            print(
+                "Unable to add filename to pages, missing page_ops",
+                flush=True,
+            )
+            return
+
+        async for page_raw in pages_mdb.find({"filename": None}):
+            crawl_id = page_raw.get("crawl_id")
+            if crawl_id:
+                crawl_ids_to_update.add(crawl_id)
+
+        for crawl_id in crawl_ids_to_update:
+            try:
+                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding filename to pages in item {crawl_id}: {err}",
+                    flush=True,
+                )

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 4d1d79253d..39ccf9474c 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -83,6 +83,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
 
                 if len(pages_buffer) > batch_size:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
+                    pages_buffer = []
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -100,6 +101,38 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
+    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
+        """Add WACZ filename to existing pages in crawl if not already set"""
+        try:
+            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+            for wacz_file in crawl.resources:
+                wacz_filename = wacz_file.name
+                wacz_page_ids = []
+
+                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    if page_dict.get("filename"):
+                        continue
+
+                    if page_dict.get("id"):
+                        wacz_page_ids.append(page_dict["id"])
+
+                # Update pages in batch per-filename
+                await self.pages.update_many(
+                    {"_id": {"$in": wacz_page_ids}},
+                    {"$set": {"filename": wacz_filename}},
+                )
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception as err:
+            traceback.print_exc()
+            print(
+                f"Error adding filename to pages from item {crawl_id} to db: {err}",
+                flush=True,
+            )
+
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
     ) -> Page:

From 30888f8edbf160a49b3c40571975fcdd4ed845d5 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 29 Jan 2025 14:51:55 -0500
Subject: [PATCH 03/18] Make filename consistent with name added from crawler (no oid dir)
---
 backend/btrixcloud/pages.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 39ccf9474c..e235740b77 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -106,7 +106,12 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
         try:
             crawl = await self.crawl_ops.get_crawl_out(crawl_id)
             for wacz_file in crawl.resources:
-                wacz_filename = wacz_file.name
+
+                filename = wacz_file.name
+                name_parts = wacz_file.name.split("/")
+                if name_parts and len(name_parts) > 1:
+                    filename = name_parts[-1]
+
                 wacz_page_ids = []
 
                 stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
@@ -123,7 +128,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
                 # Update pages in batch per-filename
                 await self.pages.update_many(
                     {"_id": {"$in": wacz_page_ids}},
-                    {"$set": {"filename": wacz_filename}},
+                    {"$set": {"filename": filename}},
                 )

From 339e51e9aa1bfac6b22958d8db5e5d113ce49f3c Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 29 Jan 2025 16:06:44 -0500
Subject: [PATCH 04/18] Linting and fixups
---
 .../migration_0042_page_filenames.py |  8 +------
 backend/btrixcloud/pages.py          | 21 ++++++++++++-------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
index 64dc18ec04..70d1df8601 100644
--- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py
+++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -25,8 +25,6 @@ async def migrate_up(self):
         """
         pages_mdb = self.mdb["pages"]
 
-        crawl_ids_to_update = set()
-
         if self.page_ops is None:
             print(
                 "Unable to add filename to pages, missing page_ops",
@@ -34,11 +32,7 @@ async def migrate_up(self):
             )
             return
 
-        async for page_raw in pages_mdb.find({"filename": None}):
-            crawl_id = page_raw.get("crawl_id")
-            if crawl_id:
-                crawl_ids_to_update.add(crawl_id)
-
+        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
         for crawl_id in crawl_ids_to_update:
             try:
                 await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index e235740b77..b3d71dd2dc 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -105,29 +105,34 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
         """Add WACZ filename to existing pages in crawl if not already set"""
         try:
             crawl = await self.crawl_ops.get_crawl_out(crawl_id)
-            for wacz_file in crawl.resources:
+            if not crawl.resources:
+                return
 
+            for wacz_file in crawl.resources:
+                # Strip oid directory from filename
                 filename = wacz_file.name
                 name_parts = wacz_file.name.split("/")
                 if name_parts and len(name_parts) > 1:
                     filename = name_parts[-1]
 
-                wacz_page_ids = []
+                page_ids_to_update = []
 
                 stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                 for page_dict in stream:
                     if not page_dict.get("url"):
                         continue
 
-                    if page_dict.get("filename"):
-                        continue
-
-                    if page_dict.get("id"):
-                        wacz_page_ids.append(page_dict["id"])
+                    page_id = page_dict.get("id")
+                    if page_id:
+                        try:
+                            page_ids_to_update.append(UUID(page_id))
+                        # pylint: disable=broad-exception-caught
+                        except Exception:
+                            continue
 
                 # Update pages in batch per-filename
                 await self.pages.update_many(
-                    {"_id": {"$in": wacz_page_ids}},
+                    {"_id": {"$in": page_ids_to_update}},
                     {"$set": {"filename": filename}},
                 )

From ba39f067445cc1d75a7db9fafd722e066aa0361b Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 30 Jan 2025 10:12:15 -0500
Subject: [PATCH 05/18] Simplify stripping uuid directory from filename
---
 backend/btrixcloud/pages.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index b3d71dd2dc..71cb2dfb70 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
 """crawl pages"""
 
 import asyncio
+import os
 import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -110,11 +111,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
 
             for wacz_file in crawl.resources:
                 # Strip oid directory from filename
-                filename = wacz_file.name
-                name_parts = wacz_file.name.split("/")
-                if name_parts and len(name_parts) > 1:
-                    filename = name_parts[-1]
-
+                filename = os.path.basename(wacz_file.name)
                 page_ids_to_update = []
 
                 stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])

From 0cd50eed7e80a4e95ea76468dc2953649b564658 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 30 Jan 2025 12:16:59 -0500
Subject: [PATCH 06/18] Strip oid directory from WACZ file name
---
 backend/btrixcloud/storages.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 64135015be..d03497484e 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -620,7 +620,7 @@ def stream_page_lines(
             for line in line_iter:
                 page_json = _parse_json(line.decode("utf-8", errors="ignore"))
-                page_json["filename"] = wacz_filename
+                page_json["filename"] = os.path.basename(wacz_filename)
                 yield page_json
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []

From d608bc3a1ab61f13d6d61b07065b659722e12362 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 30 Jan 2025 12:55:55 -0500
Subject: [PATCH 07/18] Add filename to page tests
---
 backend/test/test_run_crawl.py | 4 ++++
 backend/test/test_uploads.py   | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index f40f5ba8ba..61f21fbb22 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -673,6 +673,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -694,6 +695,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page.get("title") or page.get("title") is None
     assert page["loadState"]
     assert page["mime"]
+    assert page["filename"]
     assert page["isError"] in (True, False)
     assert page["isFile"] in (True, False)
@@ -794,6 +796,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -876,6 +879,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 719a247e1c..56e5c1d978 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["crawl_id"] == upload_id
         assert page["url"]
         assert page["ts"]
+        assert page["filename"]
         assert page.get("title") or page.get("title") is None
 
     page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["crawl_id"]
     assert page["url"]
     assert page["ts"]
+    assert page["filename"]
     assert page.get("title") or page.get("title") is None
 
     assert page["notes"] == []

From 401cffe43ede018177cab3ab946efcca0ed6ceb1 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 3 Feb 2025 23:17:50 -0800
Subject: [PATCH 08/18] also set 'depth' on page
---
 backend/btrixcloud/models.py   | 1 +
 backend/btrixcloud/pages.py    | 6 ++++--
 backend/test/test_run_crawl.py | 4 ++++
 backend/test/test_uploads.py   | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index bec8e2d4bd..e16d4a8461 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2494,6 +2494,7 @@ class Page(BaseMongoModel):
     status: Optional[int] = None
     mime: Optional[str] = None
     filename: Optional[str] = None
+    depth: Optional[int] = None
 
     # manual review
     userid: Optional[UUID] = None

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 71cb2dfb70..a9af5488f1 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -103,7 +103,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
     async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
-        """Add WACZ filename to existing pages in crawl if not already set"""
+        """Add WACZ filename (and depth) to existing pages in crawl if not already set"""
         try:
             crawl = await self.crawl_ops.get_crawl_out(crawl_id)
             if not crawl.resources:
@@ -120,6 +120,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
                         continue
 
                     page_id = page_dict.get("id")
+                    depth = page_dict.get("depth")
                     if page_id:
                         try:
                             page_ids_to_update.append(UUID(page_id))
@@ -130,7 +131,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
                 # Update pages in batch per-filename
                 await self.pages.update_many(
                     {"_id": {"$in": page_ids_to_update}},
-                    {"$set": {"filename": filename}},
+                    {"$set": {"filename": filename, "depth": depth}},
                 )
@@ -168,6 +169,7 @@ def _get_page_from_dict(
             status=status,
             mime=page_dict.get("mime", "text/html"),
             filename=page_dict.get("filename"),
+            depth=page_dict.get("depth"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()

diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 61f21fbb22..ce94548ab7 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -674,6 +674,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["status"]
         assert page["mime"]
         assert page["filename"]
+        assert page["depth"] is not None
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -696,6 +697,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page["loadState"]
     assert page["mime"]
     assert page["filename"]
+    assert page["depth"] is not None
    assert page["isError"] in (True, False)
     assert page["isFile"] in (True, False)
@@ -797,6 +799,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["mime"]
         assert page["filename"]
+        assert page["depth"] is not None
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -880,6 +883,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["depth"] is not None
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 56e5c1d978..61d7ff6603 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -253,6 +253,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["url"]
         assert page["ts"]
         assert page["filename"]
+        assert page["depth"] is not None
         assert page.get("title") or page.get("title") is None
 
     page_id = pages[0]["id"]
@@ -269,6 +270,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["url"]
     assert page["ts"]
     assert page["filename"]
+    assert page["depth"] is not None
     assert page.get("title") or page.get("title") is None
 
     assert page["notes"] == []

From 7c3d667bb57a01009e7f9dbe49ce9f3da3c8c58d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 4 Feb 2025 11:33:51 -0800
Subject: [PATCH 09/18] undo depth test in uploads, no depth guaranteed
---
 backend/test/test_uploads.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 61d7ff6603..56e5c1d978 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -253,7 +253,6 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["url"]
         assert page["ts"]
         assert page["filename"]
-        assert page["depth"] is not None
         assert page.get("title") or page.get("title") is None
 
     page_id = pages[0]["id"]
@@ -270,7 +269,6 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["url"]
     assert page["ts"]
     assert page["filename"]
-    assert page["depth"] is not None
     assert page.get("title") or page.get("title") is None
 
     assert page["notes"] == []

From 307ca055d6bcc37608229f06f8449f3ccd46f0f5 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Feb 2025 15:23:42 -0500
Subject: [PATCH 10/18] Add seed and faviconUrl to page model and backfill migration
---
 backend/btrixcloud/models.py   |  2 ++
 backend/btrixcloud/pages.py    | 26 +++++++++++++++++---------
 backend/test/test_run_crawl.py |  8 ++++++++
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e16d4a8461..366b14724b 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2495,6 +2495,8 @@ class Page(BaseMongoModel):
     mime: Optional[str] = None
     filename: Optional[str] = None
     depth: Optional[int] = None
+    faviconUrl: Optional[AnyHttpUrl] = None
+    seed: Optional[bool] = None
 
     # manual review
     userid: Optional[UUID] = None

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index a9af5488f1..dd31c42045 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -103,7 +103,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
         print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
     async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
-        """Add WACZ filename (and depth) to existing pages in crawl if not already set"""
+        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
         try:
             crawl = await self.crawl_ops.get_crawl_out(crawl_id)
             if not crawl.resources:
@@ -112,7 +112,6 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
             for wacz_file in crawl.resources:
                 # Strip oid directory from filename
                 filename = os.path.basename(wacz_file.name)
-                page_ids_to_update = []
 
                 stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                 for page_dict in stream:
@@ -120,19 +119,28 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
                         continue
 
                     page_id = page_dict.get("id")
-                    depth = page_dict.get("depth")
+
+                    if not page_id:
+                        continue
+
                     if page_id:
                         try:
-                            page_ids_to_update.append(UUID(page_id))
+                            page_id = UUID(page_id)
                         # pylint: disable=broad-exception-caught
                         except Exception:
                             continue
 
-                # Update pages in batch per-filename
-                await self.pages.update_many(
-                    {"_id": {"$in": page_ids_to_update}},
-                    {"$set": {"filename": filename, "depth": depth}},
-                )
+                    await self.pages.find_one_and_update(
+                        {"_id": page_id},
+                        {
+                            "$set": {
+                                "filename": filename,
+                                "depth": page_dict.get("depth"),
+                                "seed": page_dict.get("seed"),
+                                "faviconUrl": page_dict.get("faviconUrl"),
+                            }
+                        },
+                    )

diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index ce94548ab7..879768258f 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -675,6 +675,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
+        assert page["faviconUrl"]
+        assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -698,6 +700,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page["mime"]
     assert page["filename"]
     assert page["depth"] is not None
+    assert page["faviconUrl"]
+    assert page["seed"] in (True, False)
     assert page["isError"] in (True, False)
     assert page["isFile"] in (True, False)
@@ -800,6 +804,8 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
+        assert page["faviconUrl"]
+        assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -884,6 +890,8 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
+        assert page["faviconUrl"]
+        assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

From 794e12f13c8d2ef16ab422a50daf02a93aae3280 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Feb 2025 15:40:39 -0500
Subject: [PATCH 11/18] Bump backend startupProbe to 1 hour before failure
---
 chart/templates/backend.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml
index 8f96fdd24f..c97abd3219 100644
--- a/chart/templates/backend.yaml
+++ b/chart/templates/backend.yaml
@@ -123,8 +123,8 @@ spec:
           httpGet:
             path: /healthzStartup
             port: 8000
-          periodSeconds: 5
-          failureThreshold: 60
+          periodSeconds: 10
+          failureThreshold: 360
           successThreshold: 1
 
         readinessProbe:

From 25190091579e697a5709a2bdb33f7fff4e7f0fb6 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Feb 2025 16:06:48 -0500
Subject: [PATCH 12/18] Rename faviconUrl to favIconUrl to match crawler
---
 backend/btrixcloud/models.py   | 2 +-
 backend/test/test_run_crawl.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 366b14724b..d2278871f9 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2495,7 +2495,7 @@ class Page(BaseMongoModel):
     mime: Optional[str] = None
     filename: Optional[str] = None
     depth: Optional[int] = None
-    faviconUrl: Optional[AnyHttpUrl] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
     seed: Optional[bool] = None
 
     # manual review
     userid: Optional[UUID] = None

diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 879768258f..636c6f8515 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -675,7 +675,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
-        assert page["faviconUrl"]
+        assert page["favIconUrl"]
         assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -700,7 +700,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
     assert page["mime"]
     assert page["filename"]
     assert page["depth"] is not None
-    assert page["faviconUrl"]
+    assert page["favIconUrl"]
     assert page["seed"] in (True, False)
     assert page["isError"] in (True, False)
     assert page["isFile"] in (True, False)
@@ -804,7 +804,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
-        assert page["faviconUrl"]
+        assert page["favIconUrl"]
         assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
@@ -890,7 +890,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["mime"]
         assert page["filename"]
         assert page["depth"] is not None
-        assert page["faviconUrl"]
+        assert page["favIconUrl"]
         assert page["seed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)

From f013d69047f4b86beeb4285326c44147b51677a3 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Feb 2025 16:55:57 -0500
Subject: [PATCH 13/18] Add seed and favIconUrl to _get_page_from_dict
---
 backend/btrixcloud/pages.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index dd31c42045..64299e413c 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -178,6 +178,8 @@ def _get_page_from_dict(
             mime=page_dict.get("mime", "text/html"),
             filename=page_dict.get("filename"),
             depth=page_dict.get("depth"),
+            seed=page_dict.get("seed"),
+            favIconUrl=page_dict.get("favIconUrl"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()

From a9ca416c7ba281db0e7265837545eb42c6613487 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 5 Feb 2025 10:57:39 -0500
Subject: [PATCH 14/18] Rename seed->isSeed, default to False
---
 backend/btrixcloud/models.py | 2 +-
 backend/btrixcloud/pages.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index d2278871f9..38734d7915 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2496,7 +2496,7 @@ class Page(BaseMongoModel):
     filename: Optional[str] = None
     depth: Optional[int] = None
     favIconUrl: Optional[AnyHttpUrl] = None
-    seed: Optional[bool] = None
+    isSeed: Optional[bool] = False
 
     # manual review
     userid: Optional[UUID] = None

diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 64299e413c..ef0b37b727 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -136,7 +136,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
                             "$set": {
                                 "filename": filename,
                                 "depth": page_dict.get("depth"),
-                                "seed": page_dict.get("seed"),
+                                "isSeed": page_dict.get("seed", False),
                                 "faviconUrl": page_dict.get("faviconUrl"),
page_dict.get("faviconUrl"), } }, @@ -178,7 +178,7 @@ def _get_page_from_dict( mime=page_dict.get("mime", "text/html"), filename=page_dict.get("filename"), depth=page_dict.get("depth"), - seed=page_dict.get("seed"), + isSeed=page_dict.get("seed", False), favIconUrl=page_dict.get("favIconUrl"), ts=(str_to_date(ts) if ts else dt_now()), ) From 2e19749ab8f2b44e457f1d2036057c83e336d805 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 5 Feb 2025 10:57:59 -0500 Subject: [PATCH 15/18] Fix remaining faviconUrl -> favIconUrl typo --- backend/btrixcloud/pages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index ef0b37b727..4b53b5b9b5 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -137,7 +137,7 @@ async def add_crawl_wacz_filename_to_pages(self, crawl_id: str): "filename": filename, "depth": page_dict.get("depth"), "isSeed": page_dict.get("seed", False), - "faviconUrl": page_dict.get("faviconUrl"), + "favIconUrl": page_dict.get("favIconUrl"), } }, ) From 7ed016ec3b78735aaa59a6272a50304700cb7957 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 5 Feb 2025 10:59:33 -0500 Subject: [PATCH 16/18] Set max startupProbe time to 1 day --- chart/templates/backend.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index c97abd3219..3ce6dec50b 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -124,7 +124,7 @@ spec: path: /healthzStartup port: 8000 periodSeconds: 10 - failureThreshold: 360 + failureThreshold: 8640 successThreshold: 1 readinessProbe: From fb153428395c8b0339572b8acc1643768770c5d2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 5 Feb 2025 11:02:11 -0500 Subject: [PATCH 17/18] Add current index and total of items to migration --- .../migrations/migration_0042_page_filenames.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 70d1df8601..5410d4b593 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -27,18 +27,24 @@ async def migrate_up(self): if self.page_ops is None: print( - "Unable to add filename to pages, missing page_ops", + "Unable to add filename and other fields to pages, missing page_ops", flush=True, ) return crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None}) + + crawl_count = len(crawl_ids_to_update) + current_index = 1 + for crawl_id in crawl_ids_to_update: + print(f"Migrating archived item {current_index}/{crawl_count}", flush=True) try: await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) # pylint: disable=broad-exception-caught except Exception as err: print( - f"Error adding filename to pages in item {crawl_id}: {err}", + f"Error adding filename and other fields to pages in item {crawl_id}: {err}", flush=True, ) + current_index += 1 From fe30b87de79f2be908a7aa4b8a5d8af2420eb924 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 5 Feb 2025 11:04:04 -0500 Subject: [PATCH 18/18] Update key to isSeed in tests --- backend/test/test_run_crawl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 636c6f8515..511c4c6c1e 100644 --- a/backend/test/test_run_crawl.py +++ 
b/backend/test/test_run_crawl.py @@ -676,7 +676,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["filename"] assert page["depth"] is not None assert page["favIconUrl"] - assert page["seed"] in (True, False) + assert page["isSeed"] in (True, False) assert page["isError"] in (True, False) assert page["isFile"] in (True, False) @@ -701,7 +701,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["filename"] assert page["depth"] is not None assert page["favIconUrl"] - assert page["seed"] in (True, False) + assert page["isSeed"] in (True, False) assert page["isError"] in (True, False) assert page["isFile"] in (True, False) @@ -805,7 +805,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["filename"] assert page["depth"] is not None assert page["favIconUrl"] - assert page["seed"] in (True, False) + assert page["isSeed"] in (True, False) assert page["isError"] in (True, False) assert page["isFile"] in (True, False) @@ -891,7 +891,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_ assert page["filename"] assert page["depth"] is not None assert page["favIconUrl"] - assert page["seed"] in (True, False) + assert page["isSeed"] in (True, False) assert page["isError"] in (True, False) assert page["isFile"] in (True, False)