diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index a16964626f..9645f56307 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
new file mode 100644
index 0000000000..5410d4b593
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
+"""
+Migration 0042 - Add filename to pages
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0042"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add filename to all pages that don't currently have it stored,
+        iterating through each archived item and its WACZ files as necessary
+        """
+        pages_mdb = self.mdb["pages"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to add filename and other fields to pages, missing page_ops",
+                flush=True,
+            )
+            return
+
+        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
+
+        crawl_count = len(crawl_ids_to_update)
+        current_index = 1
+
+        for crawl_id in crawl_ids_to_update:
+            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
+            try:
+                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
+                    flush=True,
+                )
+            current_index += 1
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0e57f200a..38734d7915 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
     loadState: Optional[int] = None
     status: Optional[int] = None
     mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
 
     # manual review
     userid: Optional[UUID] = None
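Note: migration 0042 follows the usual backfill pattern: select the crawls whose pages predate the new field, then rebuild those pages' metadata from the WACZ files via `page_ops`. A minimal sketch of the two MongoDB calls involved, assuming a Motor-style async database handle `mdb` (collection and field names mirror the diff; the helper functions themselves are illustrative, not part of the PR):

```python
import os

# Pages written before this migration have no "filename" key at all;
# MongoDB's {"filename": None} matches both missing and null values.
async def crawls_missing_filename(mdb):
    return await mdb["pages"].distinct("crawl_id", {"filename": None})

# Mirrors the per-page $set issued by add_crawl_wacz_filename_to_pages.
async def backfill_page(mdb, page_id, wacz_name, page_dict):
    await mdb["pages"].find_one_and_update(
        {"_id": page_id},
        {
            "$set": {
                "filename": os.path.basename(wacz_name),
                "depth": page_dict.get("depth"),
                "isSeed": page_dict.get("seed", False),
                "favIconUrl": page_dict.get("favIconUrl"),
            }
        },
    )
```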
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 4149a3e9d3..4b53b5b9b5 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
 """crawl pages"""
 
 import asyncio
+import os
 import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
 
                 if len(pages_buffer) > batch_size:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
+                    pages_buffer = []
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
+    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
+        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
+        try:
+            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+            if not crawl.resources:
+                return
+
+            for wacz_file in crawl.resources:
+                # Strip oid directory from filename
+                filename = os.path.basename(wacz_file.name)
+
+                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    page_id = page_dict.get("id")
+
+                    if not page_id:
+                        continue
+
+                    if page_id:
+                        try:
+                            page_id = UUID(page_id)
+                        # pylint: disable=broad-exception-caught
+                        except Exception:
+                            continue
+
+                    await self.pages.find_one_and_update(
+                        {"_id": page_id},
+                        {
+                            "$set": {
+                                "filename": filename,
+                                "depth": page_dict.get("depth"),
+                                "isSeed": page_dict.get("seed", False),
+                                "favIconUrl": page_dict.get("favIconUrl"),
+                            }
+                        },
+                    )
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception as err:
+            traceback.print_exc()
+            print(
+                f"Error adding filename to pages from item {crawl_id} to db: {err}",
+                flush=True,
+            )
+
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
     ) -> Page:
@@ -127,6 +176,10 @@ def _get_page_from_dict(
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
+            filename=page_dict.get("filename"),
+            depth=page_dict.get("depth"),
+            isSeed=page_dict.get("seed", False),
+            favIconUrl=page_dict.get("favIconUrl"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index e167449eb5..d03497484e 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(
 
             line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []
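Note: the storages.py change tags every page record with the name of the WACZ it was read from, and pages.py strips the org (`oid/`) prefix the same way before persisting it. A standalone sketch of that tagging step, assuming `lines` yields raw bytes from a pages.jsonl inside a WACZ stored under an `<oid>/` prefix (function and parameter names here are illustrative):

```python
import json
import os
from typing import Dict, Iterator

def tag_page_lines(lines: Iterator[bytes], wacz_name: str) -> Iterator[Dict]:
    """Yield page dicts tagged with the basename of their source WACZ."""
    for line in lines:
        page_json = json.loads(line.decode("utf-8", errors="ignore"))
        # Store only "my-crawl.wacz", not the org-scoped "<oid>/my-crawl.wacz" path
        page_json["filename"] = os.path.basename(wacz_name)
        yield page_json
```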
assert page["isSeed"] in (True, False) assert page["isError"] in (True, False) assert page["isFile"] in (True, False) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 719a247e1c..56e5c1d978 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): assert page["crawl_id"] == upload_id assert page["url"] assert page["ts"] + assert page["filename"] assert page.get("title") or page.get("title") is None page_id = pages[0]["id"] @@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): assert page["crawl_id"] assert page["url"] assert page["ts"] + assert page["filename"] assert page.get("title") or page.get("title") is None assert page["notes"] == [] diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 8f96fdd24f..3ce6dec50b 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -123,8 +123,8 @@ spec: httpGet: path: /healthzStartup port: 8000 - periodSeconds: 5 - failureThreshold: 60 + periodSeconds: 10 + failureThreshold: 8640 successThreshold: 1 readinessProbe: