Add WACZ filename, depth, favIconUrl, isSeed to pages #2352

Merged · 18 commits · Feb 5, 2025
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
from .migrations import BaseMigration


-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"


# ============================================================================
50 changes: 50 additions & 0 deletions backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
"""
Migration 0042 - Add filename to pages
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0042"


class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Add filename to all pages that don't currently have it stored,
        iterating through each archived item and its WACZ files as necessary
        """
        pages_mdb = self.mdb["pages"]

        if self.page_ops is None:
            print(
                "Unable to add filename and other fields to pages, missing page_ops",
                flush=True,
            )
            return

        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})

        crawl_count = len(crawl_ids_to_update)
        current_index = 1

        for crawl_id in crawl_ids_to_update:
            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
            try:
                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
                    flush=True,
                )
            current_index += 1
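
A note on the selection query in `migrate_up` above: in MongoDB, a filter of `{"filename": None}` matches documents where the field is explicitly `null` as well as documents where the field is missing entirely, so pages created before this change are picked up. A minimal standalone sketch of that behavior, using synchronous `pymongo` and made-up connection, database, and collection names rather than the project's async `mdb` handle:

```python
# Illustrative only: shows that a {"filename": None} filter matches both
# missing and null fields, which is what the migration relies on.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
pages = client["example_db"]["pages"]

pages.insert_many(
    [
        {"crawl_id": "a", "url": "https://example.com/1"},  # field missing
        {"crawl_id": "b", "url": "https://example.com/2", "filename": None},  # explicit null
        {"crawl_id": "c", "url": "https://example.com/3", "filename": "c.wacz"},
    ]
)

# Distinct crawl_id values still needing a filename: "a" and "b", but not "c"
print(sorted(pages.distinct("crawl_id", {"filename": None})))  # ['a', 'b']
```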
4 changes: 4 additions & 0 deletions backend/btrixcloud/models.py
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
    loadState: Optional[int] = None
    status: Optional[int] = None
    mime: Optional[str] = None
    filename: Optional[str] = None
    depth: Optional[int] = None
    favIconUrl: Optional[AnyHttpUrl] = None
    isSeed: Optional[bool] = False

    # manual review
    userid: Optional[UUID] = None
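
For illustration, a page record carrying the four new optional fields might look like the following; all values here are made up, not taken from this PR:

```python
# Hypothetical page record showing the new fields added to the Page model
page = {
    "id": "a81c20cc-0000-0000-0000-000000000000",
    "url": "https://example.com/",
    "title": "Example Domain",
    "ts": "2025-02-05T00:00:00Z",
    "filename": "crawl-20250205-example.wacz",  # WACZ file the page was read from
    "depth": 0,  # crawl depth as recorded by the crawler
    "favIconUrl": "https://example.com/favicon.ico",  # favicon recorded for the page
    "isSeed": True,  # True when the page was a crawl seed
}
```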
53 changes: 53 additions & 0 deletions backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
"""crawl pages"""

import asyncio
import os
import traceback
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):

                if len(pages_buffer) > batch_size:
                    await self._add_pages_to_db(crawl_id, pages_buffer)
                    pages_buffer = []

                pages_buffer.append(
                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
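
(Side note on the hunk above: the added `pages_buffer = []` reset also appears to fix a pre-existing issue, since without it the buffer was never cleared after a flush and each subsequent batch would re-insert pages that had already been written.)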
@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
            traceback.print_exc()
            print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)

    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
        try:
            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
            if not crawl.resources:
                return

            for wacz_file in crawl.resources:
                # Strip oid directory from filename
                filename = os.path.basename(wacz_file.name)

                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                for page_dict in stream:
                    if not page_dict.get("url"):
                        continue

                    page_id = page_dict.get("id")

                    if not page_id:
                        continue

                    if page_id:
                        try:
                            page_id = UUID(page_id)
                        # pylint: disable=broad-exception-caught
                        except Exception:
                            continue

                    await self.pages.find_one_and_update(
                        {"_id": page_id},
                        {
                            "$set": {
                                "filename": filename,
                                "depth": page_dict.get("depth"),
                                "isSeed": page_dict.get("seed", False),
                                "favIconUrl": page_dict.get("favIconUrl"),
                            }
                        },
                    )
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            traceback.print_exc()
            print(
                f"Error adding filename to pages from item {crawl_id} to db: {err}",
                flush=True,
            )

    def _get_page_from_dict(
        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
    ) -> Page:
@@ -127,6 +176,10 @@ def _get_page_from_dict(
            loadState=page_dict.get("loadState"),
            status=status,
            mime=page_dict.get("mime", "text/html"),
            filename=page_dict.get("filename"),
            depth=page_dict.get("depth"),
            isSeed=page_dict.get("seed", False),
            favIconUrl=page_dict.get("favIconUrl"),
            ts=(str_to_date(ts) if ts else dt_now()),
        )
        p.compute_page_type()
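
An aside on the `# Strip oid directory from filename` step in the new method above: stored WACZ resource names carry a leading directory prefix, and `os.path.basename` keeps only the final path component, which is what gets written to each page. The example path below is made up:

```python
import os

# Hypothetical stored WACZ name with a leading (oid) directory prefix
wacz_name = "2f0a1b3c-example-oid/crawl-20250205-example.wacz"

print(os.path.basename(wacz_name))  # -> crawl-20250205-example.wacz
```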
4 changes: 3 additions & 1 deletion backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(

            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
            for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json

        page_generators: List[Iterator[Dict[Any, Any]]] = []

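
The `stream_page_lines` change above tags every parsed line of a WACZ's pages JSONL with the basename of the file it came from before yielding, which is how `filename` reaches `_get_page_from_dict` in pages.py. A self-contained sketch of the same pattern, assuming a local JSONL file and a hypothetical function name rather than the project's API:

```python
import json
import os
from typing import Any, Dict, Iterator


def stream_pages_with_filename(jsonl_path: str) -> Iterator[Dict[str, Any]]:
    """Yield one dict per JSON line, tagged with the basename of its source file."""
    filename = os.path.basename(jsonl_path)
    with open(jsonl_path, "rb") as fh:
        for line in fh:
            if not line.strip():
                continue  # skip blank lines
            page = json.loads(line.decode("utf-8", errors="ignore"))
            page["filename"] = filename
            yield page


# Usage (path is hypothetical):
# for page in stream_pages_with_filename("/tmp/pages/pages.jsonl"):
#     print(page["url"], page["filename"])
```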
16 changes: 16 additions & 0 deletions backend/test/test_run_crawl.py
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page["loadState"]
    assert page["status"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page.get("title") or page.get("title") is None
    assert page["loadState"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page.get("title") or page.get("title") is None
    assert page["loadState"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

Expand Down Expand Up @@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
    assert page["loadState"]
    assert page["status"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

2 changes: 2 additions & 0 deletions backend/test/test_uploads.py
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    assert page["crawl_id"] == upload_id
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None

    page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    assert page["crawl_id"]
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None

    assert page["notes"] == []
4 changes: 2 additions & 2 deletions chart/templates/backend.yaml
@@ -123,8 +123,8 @@ spec:
            httpGet:
              path: /healthzStartup
              port: 8000
-            periodSeconds: 5
-            failureThreshold: 60
+            periodSeconds: 10
+            failureThreshold: 8640
            successThreshold: 1

          readinessProbe:
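
The startup probe change gives the backend far more time to come up before the pod is restarted: the old values allowed roughly 5 s × 60 = 300 seconds (5 minutes), while the new values allow up to 10 s × 8640 = 86,400 seconds (24 hours). Presumably this is to keep the API pod alive while long-running migrations such as 0042 above iterate over every existing archived item.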