Add WACZ filename, depth, favIconUrl, isSeed to pages #2352

Merged · 18 commits · Feb 5, 2025
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
from .migrations import BaseMigration


-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"


# ============================================================================
50 changes: 50 additions & 0 deletions backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
"""
Migration 0042 - Add filename to pages
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0042"


class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Add filename to all pages that don't currently have it stored,
        iterating through each archived item and its WACZ files as necessary
        """
        pages_mdb = self.mdb["pages"]

        if self.page_ops is None:
            print(
                "Unable to add filename and other fields to pages, missing page_ops",
                flush=True,
            )
            return

        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})

        crawl_count = len(crawl_ids_to_update)
        current_index = 1

        for crawl_id in crawl_ids_to_update:
            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
            try:
                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
                    flush=True,
                )
            current_index += 1
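
A note on the selection query in `migrate_up` above: in MongoDB, a filter of `{"filename": None}` matches documents where the field is explicitly `null` as well as documents where the field is missing entirely, so pages created before this change are picked up. A minimal standalone sketch of that behavior, using synchronous `pymongo` and made-up connection, database, and collection names rather than the project's async `mdb` handle:

```python
# Illustrative only: shows that a {"filename": None} filter matches both
# missing and null fields, which is what the migration relies on.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
pages = client["example_db"]["pages"]

pages.insert_many(
    [
        {"crawl_id": "a", "url": "https://example.com/1"},  # field missing
        {"crawl_id": "b", "url": "https://example.com/2", "filename": None},  # explicit null
        {"crawl_id": "c", "url": "https://example.com/3", "filename": "c.wacz"},
    ]
)

# Distinct crawl_id values still needing a filename: "a" and "b", but not "c"
print(sorted(pages.distinct("crawl_id", {"filename": None})))  # ['a', 'b']
```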
4 changes: 4 additions & 0 deletions backend/btrixcloud/models.py
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
    loadState: Optional[int] = None
    status: Optional[int] = None
    mime: Optional[str] = None
    filename: Optional[str] = None
    depth: Optional[int] = None
    favIconUrl: Optional[AnyHttpUrl] = None
    isSeed: Optional[bool] = False

    # manual review
    userid: Optional[UUID] = None
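
For illustration, a page record carrying the four new optional fields might look like the following; all values here are made up, not taken from this PR:

```python
# Hypothetical page record showing the new fields added to the Page model
page = {
    "id": "a81c20cc-0000-0000-0000-000000000000",
    "url": "https://example.com/",
    "title": "Example Domain",
    "ts": "2025-02-05T00:00:00Z",
    "filename": "crawl-20250205-example.wacz",  # WACZ file the page was read from
    "depth": 0,  # crawl depth as recorded by the crawler
    "favIconUrl": "https://example.com/favicon.ico",  # favicon recorded for the page
    "isSeed": True,  # True when the page was a crawl seed
}
```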
53 changes: 53 additions & 0 deletions backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
"""crawl pages"""

import asyncio
import os
import traceback
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):

                if len(pages_buffer) > batch_size:
                    await self._add_pages_to_db(crawl_id, pages_buffer)
                    pages_buffer = []

                pages_buffer.append(
                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
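
(Side note on the hunk above: the added `pages_buffer = []` reset also appears to fix a pre-existing issue, since without it the buffer was never cleared after a flush and each subsequent batch would re-insert pages that had already been written.)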
@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
            traceback.print_exc()
            print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)

    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
        try:
            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
            if not crawl.resources:
                return

            for wacz_file in crawl.resources:
                # Strip oid directory from filename
                filename = os.path.basename(wacz_file.name)

                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                for page_dict in stream:
                    if not page_dict.get("url"):
                        continue

                    page_id = page_dict.get("id")

                    if not page_id:
                        continue

                    if page_id:
                        try:
                            page_id = UUID(page_id)
                        # pylint: disable=broad-exception-caught
                        except Exception:
                            continue

                    await self.pages.find_one_and_update(
                        {"_id": page_id},
                        {
                            "$set": {
                                "filename": filename,
                                "depth": page_dict.get("depth"),
                                "isSeed": page_dict.get("seed", False),
                                "favIconUrl": page_dict.get("favIconUrl"),
                            }
                        },
                    )
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            traceback.print_exc()
            print(
                f"Error adding filename to pages from item {crawl_id} to db: {err}",
                flush=True,
            )

    def _get_page_from_dict(
        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
    ) -> Page:
@@ -127,6 +176,10 @@ def _get_page_from_dict(
            loadState=page_dict.get("loadState"),
            status=status,
            mime=page_dict.get("mime", "text/html"),
            filename=page_dict.get("filename"),
            depth=page_dict.get("depth"),
            isSeed=page_dict.get("seed", False),
            favIconUrl=page_dict.get("favIconUrl"),
            ts=(str_to_date(ts) if ts else dt_now()),
        )
        p.compute_page_type()
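
An aside on the `# Strip oid directory from filename` step in the new method above: stored WACZ resource names carry a leading directory prefix, and `os.path.basename` keeps only the final path component, which is what gets written to each page. The example path below is made up:

```python
import os

# Hypothetical stored WACZ name with a leading (oid) directory prefix
wacz_name = "2f0a1b3c-example-oid/crawl-20250205-example.wacz"

print(os.path.basename(wacz_name))  # -> crawl-20250205-example.wacz
```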
4 changes: 3 additions & 1 deletion backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(

            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
            for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json

        page_generators: List[Iterator[Dict[Any, Any]]] = []

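
The `stream_page_lines` change above tags every parsed line of a WACZ's pages JSONL with the basename of the file it came from before yielding, which is how `filename` reaches `_get_page_from_dict` in pages.py. A self-contained sketch of the same pattern, assuming a local JSONL file and a hypothetical function name rather than the project's API:

```python
import json
import os
from typing import Any, Dict, Iterator


def stream_pages_with_filename(jsonl_path: str) -> Iterator[Dict[str, Any]]:
    """Yield one dict per JSON line, tagged with the basename of its source file."""
    filename = os.path.basename(jsonl_path)
    with open(jsonl_path, "rb") as fh:
        for line in fh:
            if not line.strip():
                continue  # skip blank lines
            page = json.loads(line.decode("utf-8", errors="ignore"))
            page["filename"] = filename
            yield page


# Usage (path is hypothetical):
# for page in stream_pages_with_filename("/tmp/pages/pages.jsonl"):
#     print(page["url"], page["filename"])
```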
16 changes: 16 additions & 0 deletions backend/test/test_run_crawl.py
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page["loadState"]
    assert page["status"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page.get("title") or page.get("title") is None
    assert page["loadState"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    assert page.get("title") or page.get("title") is None
    assert page["loadState"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

Expand Down Expand Up @@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
    assert page["loadState"]
    assert page["status"]
    assert page["mime"]
    assert page["filename"]
    assert page["depth"] is not None
    assert page["favIconUrl"]
    assert page["isSeed"] in (True, False)
    assert page["isError"] in (True, False)
    assert page["isFile"] in (True, False)

2 changes: 2 additions & 0 deletions backend/test/test_uploads.py
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    assert page["crawl_id"] == upload_id
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None

    page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
    assert page["crawl_id"]
    assert page["url"]
    assert page["ts"]
    assert page["filename"]
    assert page.get("title") or page.get("title") is None

    assert page["notes"] == []
4 changes: 2 additions & 2 deletions chart/templates/backend.yaml
@@ -123,8 +123,8 @@ spec:
            httpGet:
              path: /healthzStartup
              port: 8000
-            periodSeconds: 5
-            failureThreshold: 60
+            periodSeconds: 10
+            failureThreshold: 8640
            successThreshold: 1

          readinessProbe:
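
The startup probe change gives the backend far more time to come up before the pod is restarted: the old values allowed roughly 5 s × 60 = 300 seconds (5 minutes), while the new values allow up to 10 s × 8640 = 86,400 seconds (24 hours). Presumably this is to keep the API pod alive while long-running migrations such as 0042 above iterate over every existing archived item.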