Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ dependencies = [
"pandas==2.3.2",
"isort==6.0.1",
"pre-commit>=4",
"psycopg2-binary==2.9.10"
"psycopg2-binary==2.9.10",
"Scrapy>=2.14.2",
"playwright>=1.49.0,<2",
"markitdown>=0.1.0"
]

[project.scripts]
Expand Down
2 changes: 2 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[settings]
default = src.data_manager.collectors.scrapers.settings
64 changes: 64 additions & 0 deletions src/data_manager/collectors/scrapers/adapters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Single-dispatch adapter: converts Scrapy Items into ScrapedResource.

Design principles:
- Items are dumb data bags. They know nothing about ScrapedResource.
- This is the ONLY place that knows about both schemas.
- New sources: add a @to_scraped_resource.register block here. Touch nothing else.
- Do NOT reconstruct ResourceMetadata — ScrapedResource.get_metadata() already
derives display_name, url, suffix, source_type from raw fields. Pass raw values only.

Constraint: ~50 LOC of logic.

Adding a new source (e.g. TwikiPageItem):
@to_scraped_resource.register(TwikiPageItem)
def _twiki(item) -> ScrapedResource:
...

If two sources share identical mapping logic, stack decorators:
@to_scraped_resource.register(WebPageItem)
@to_scraped_resource.register(TwikiPageItem)
def _html_page(item) -> ScrapedResource:
...
Note: do NOT use union type hints (WebPageItem | TwikiPageItem) —
singledispatch ignores annotations, it dispatches on runtime type only.
"""
from __future__ import annotations

from functools import singledispatch

from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
from src.data_manager.collectors.scrapers.items import WebPageItem


@singledispatch
def to_scraped_resource(item) -> ScrapedResource:
    """Fallback for item types with no registered adapter.

    Deliberately raises instead of returning None: an unregistered item
    type is a wiring bug, and a loud TypeError surfaces it immediately
    rather than letting the item vanish from the pipeline.
    """
    item_cls = type(item).__name__
    message = (
        f"No adapter registered for item type {item_cls!r}. "
        "Add @to_scraped_resource.register(YourItemClass) in this module."
    )
    raise TypeError(message)


@to_scraped_resource.register(WebPageItem)
def _html_page(item) -> ScrapedResource:
    """Adapter for every HTML-family page, independent of auth method.

    Web-scraped PDFs land here too: the parser already sets
    suffix="pdf" and bytes content on the item, so this mapping stays
    branch-free — suffix and source_type pass through untouched.
    """
    # Optional descriptive fields; .get() yields None when absent.
    extra = {key: item.get(key) for key in ("content_type", "encoding", "title")}
    return ScrapedResource(
        url=item["url"],
        content=item["content"],
        suffix=item.get("suffix", "html"),  # default matches the HTML family
        source_type=item["source_type"],
        metadata=extra,
    )


Loading
Loading