From 47dcb9428e3445d9189717668bd2e5244a783d3e Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 16:32:51 +0100
Subject: [PATCH 01/55] scrapy project scaffolding for backend revamp, trivial
twiki spider.
---
pyproject.toml | 3 +-
scrapy.cfg | 2 +
.../collectors/scrapers/settings.py | 27 +++++++
.../collectors/scrapers/spiders/__init__.py | 0
.../collectors/scrapers/spiders/twiki.py | 70 +++++++++++++++++++
5 files changed, 101 insertions(+), 1 deletion(-)
create mode 100644 scrapy.cfg
create mode 100644 src/data_manager/collectors/scrapers/settings.py
create mode 100644 src/data_manager/collectors/scrapers/spiders/__init__.py
create mode 100644 src/data_manager/collectors/scrapers/spiders/twiki.py
diff --git a/pyproject.toml b/pyproject.toml
index f5136f334..d6508450e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,8 @@ dependencies = [
"pandas==2.3.2",
"isort==6.0.1",
"pre-commit>=4",
- "psycopg2-binary==2.9.10"
+ "psycopg2-binary==2.9.10",
+ "Scrapy>=2.14.2"
]
[project.scripts]
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 000000000..124bc2c4b
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,2 @@
+[settings]
+default = src.data_manager.collectors.scrapers.settings
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
new file mode 100644
index 000000000..baaa737c5
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -0,0 +1,27 @@
+BOT_NAME = "archi_scrapers"
+
+SPIDER_MODULES = ["src.data_manager.collectors.scrapers.spiders"]
+
+NEWSPIDER_MODULE = "src.data_manager.collectors.scrapers.spiders"
+
+# Browser-like UA to avoid bot-blocking (e.g. Twiki ConnectionLost issue)
+USER_AGENT = (
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/120.0.0.0 Safari/537.36; archi_scrapers"
+)
+
+# Default RETRY_TIMES is 2. We bump to 3 for transient failures.
+# ConnectionLost is a transport-level failure, not an HTTP status code;
+# RetryMiddleware retries it automatically via its built-in exception list.
+RETRY_ENABLED = True
+RETRY_TIMES = 3 # total attempts = 1 original + 3 retries
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
+
+# Per-request timeout — prevents indefinite hangs
+DOWNLOAD_TIMEOUT = 30 # seconds
+
+# ---------------------------------------------------------------------------
+# Safety: fail loudly on spider import errors
+# ---------------------------------------------------------------------------
+SPIDER_LOADER_WARN_ONLY = False
diff --git a/src/data_manager/collectors/scrapers/spiders/__init__.py b/src/data_manager/collectors/scrapers/spiders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
new file mode 100644
index 000000000..44d680773
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -0,0 +1,70 @@
+import scrapy
+from urllib.parse import urlparse
+
+
+class TwikiSpider(scrapy.Spider):
+ """
+ Minimal Twiki spider against a real Twiki target.
+ Public page — no SSO needed — isolates lifecycle learning from auth complexity.
+ """
+
+ name = "twiki"
+
+ async def start(self):
+ """
+ Seed request for the CRAB3 Twiki config page.
+        Building the habit: always seed via start() with errback attached,
+ never rely on the start_urls shortcut in production spiders.
+ """
+ yield scrapy.Request(
+ url="https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+ callback=self.parse,
+ errback=self.errback,
+ meta={"source_type": "web"}, # will become "sso" for protected Twiki pages
+ )
+
+ def parse(self, response):
+ """
+        Twiki pages render their main content inside #twikiMainContents or .patternMain.
+ Yields a raw dict
+ """
+ self.logger.info("Status %s for %s", response.status, response.url)
+
+ # Twiki-specific selectors
+ title = response.css("#topic-title::text, .patternTitle::text").get(default="")
+ if not title:
+ title = response.css("title::text").get(default="").replace(" < TWiki", "").strip()
+
+ # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
+ body_text = " ".join(
+ response.css("#twikiMainContents *::text, .patternMain *::text").getall()
+ ).strip()
+
+ # Same-host links
+ base = "twiki.cern.ch"
+ same_host_links = [
+ response.urljoin(href)
+ for href in response.css("a::attr(href)").getall()
+ if urlparse(response.urljoin(href)).netloc == base
+ ]
+
+ self.logger.info("Found title: %r", title)
+ self.logger.info("Found %d same-host links", len(same_host_links))
+
+ yield {
+ "url": response.url,
+ "title": title,
+ "body_length": len(body_text),
+ "body_preview": body_text[:300],
+ "same_host_links_count": len(same_host_links),
+ "same_host_links_sample": same_host_links[:5],
+ "source_type": response.meta.get("source_type"),
+ "content_type": response.headers.get("Content-Type", b"").decode(),
+ }
+
+ def errback(self, failure):
+ self.logger.error(
+ "Request failed: %s — %s",
+ failure.request.url,
+ repr(failure.value),
+ )
From 1a6f6fcba28f75f9d9f5c6791cd21aecaf5aca1b Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 17:00:31 +0100
Subject: [PATCH 02/55] key scrapy settings with safe defaults.
---
.../collectors/scrapers/settings.py | 59 ++++++++++++++++++-
1 file changed, 56 insertions(+), 3 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index baaa737c5..7adee3131 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -8,20 +8,73 @@
USER_AGENT = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
- "Chrome/120.0.0.0 Safari/537.36; archi_scrapers"
+    "Chrome/120.0.0.0 Safari/537.36 "
+ "archi_scrapers/1.0 (+https://github.com/archi-physics/archi)"
)
# Default RETRY_TIMES is 2. We bump to 3 for transient failures.
 # ConnectionLost is a transport-level failure, not an HTTP status code;
 # RetryMiddleware retries it automatically via its built-in exception list.
RETRY_ENABLED = True
-RETRY_TIMES = 3 # total attempts = 1 original + 3 retries
-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
+RETRY_TIMES = 3 # max retries per request (transport + server errors only)
+RETRY_HTTP_CODES = [
+ 500, # Internal Server Error — transient server fault
+ 502, # Bad Gateway — upstream not reachable
+ 503, # Service Unavailable — server overloaded
+ 504, # Gateway Timeout
+ 408, # Request Timeout — network-level timeout
+ # 429 (Too Many Requests) omitted: AutoThrottle should prevent hitting it;
+]
+# Conservative floor delay for all sources.
+# AutoThrottle will increase this dynamically but never go below it.
+# Indico's robots.txt mandates Crawl-delay: 10 — Indico spiders must override
+# this to 10 via custom_settings = {"DOWNLOAD_DELAY": 10}.
+DOWNLOAD_DELAY = 2 # seconds
# Per-request timeout — prevents indefinite hangs
DOWNLOAD_TIMEOUT = 30 # seconds
+
+# Keep a single concurrent request per domain.
+# AutoThrottle adjusts throughput dynamically; starting at 1 is safe.
+CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
+
+# Robots.txt: obey by default.
+# override this per-spider: custom_settings = {"ROBOTSTXT_OBEY": False}
+# Never disable globally — it would affect all spiders.
+ROBOTSTXT_OBEY = True
+
+# AutoThrottle
+# Enabled as a second politeness layer on top of DOWNLOAD_DELAY.
+# AutoThrottle treats DOWNLOAD_DELAY as a minimum — it will never go lower.
+# Target concurrency of 1.0 keeps us single-threaded per domain by default.
+AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_START_DELAY = DOWNLOAD_DELAY # initial delay before AT calibrates
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_MAX_DELAY = 60 # cap: never wait more than 60s
+# Log every AutoThrottle adjustment — useful during development, can be
+# set False in production if log volume is too high.
+AUTOTHROTTLE_DEBUG = False
# ---------------------------------------------------------------------------
# Safety: fail loudly on spider import errors
# ---------------------------------------------------------------------------
SPIDER_LOADER_WARN_ONLY = False
+
+# Maximum error count before the spider is closed automatically.
+# 25 gives enough room to diagnose intermittent failures without letting
+# a completely broken crawl run for hours.
+CLOSESPIDER_ERRORCOUNT = 25
+
+LOG_LEVEL = "INFO"
+
+# ---------------------------------------------------------------------------
+# Middlewares, Pipelines and Extensions Priorities
+# ---------------------------------------------------------------------------
+DOWNLOADER_MIDDLEWARES = { }
+
+ITEM_PIPELINES = { }
+
+EXTENSIONS = {
+ "scrapy.extensions.closespider.CloseSpider": 500,
+}
From 15f5b0cf5fb1152a7355175db29623c2c74529bf Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 17:31:11 +0100
Subject: [PATCH 03/55] scrapy check twiki, scrapy's magic to e2e test against
any Item contracts.
---
src/data_manager/collectors/scrapers/spiders/twiki.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 44d680773..944d49b9a 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -10,6 +10,10 @@ class TwikiSpider(scrapy.Spider):
name = "twiki"
+ custom_settings = {
+ "ROBOTSTXT_OBEY": False,
+ }
+
async def start(self):
"""
Seed request for the CRAB3 Twiki config page.
@@ -26,7 +30,10 @@ async def start(self):
def parse(self, response):
"""
 Twiki pages render their main content inside #twikiMainContents or .patternMain.
- Yields a raw dict
+
+ @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
+ @returns items 1 1
+ @scrapes url title same_host_links_count
"""
self.logger.info("Status %s for %s", response.status, response.url)
From 9313696dc5fdf39e322d29a38f9903eb5d6740e0 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 18:55:52 +0100
Subject: [PATCH 04/55] explicitly set RFPDupeFilter, proper Scrapy-Archi Item
definitions.
---
src/data_manager/collectors/scrapers/items.py | 21 ++++++++++++
.../collectors/scrapers/settings.py | 3 ++
.../collectors/scrapers/spiders/twiki.py | 34 +++++++++++--------
3 files changed, 44 insertions(+), 14 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/items.py
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
new file mode 100644
index 000000000..6f88f2242
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -0,0 +1,21 @@
+from scrapy import Item, Field
+
+class ArchiBaseItem(Item):
+ """Fields shared by every source type."""
+ url = Field() # canonical URL of the page
+ content = Field() # str (HTML/Markdown/text) or bytes (PDF)
+ suffix = Field() # "html" | "pdf" | "md" | ...
+ source_type = Field() # "web" | "sso" | "git" | ...
+ title = Field() # page title, may be empty
+
+class WebPageItem(ArchiBaseItem):
+ """Item produced by the plain-Link spider."""
+ content_type = Field() # value of Content-Type response header
+ encoding = Field() # response encoding (e.g. "utf-8")
+
+class TestTWikiItem(WebPageItem):
+ """Item produced by the trivial Twiki spider."""
+ body_length = Field()
+ body_preview = Field()
+ same_host_links_count = Field()
+ same_host_links_sample = Field()
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index 7adee3131..f6b3d74e1 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -68,6 +68,9 @@
LOG_LEVEL = "INFO"
+# The class used to detect and filter duplicate requests
+DUPEFILTER_CLASS = "scrapy.dupefilters.RFPDupeFilter"
+
# ---------------------------------------------------------------------------
# Middlewares, Pipelines and Extensions Priorities
# ---------------------------------------------------------------------------
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 944d49b9a..333a00a66 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -1,8 +1,11 @@
-import scrapy
+from typing import Iterator, cast
+from scrapy import Spider, Request
+from scrapy.http import Response
from urllib.parse import urlparse
+from src.data_manager.collectors.scrapers.items import TestTWikiItem
-class TwikiSpider(scrapy.Spider):
+class TwikiSpider(Spider):
"""
Minimal Twiki spider against a real Twiki target.
Public page — no SSO needed — isolates lifecycle learning from auth complexity.
@@ -12,6 +15,9 @@ class TwikiSpider(scrapy.Spider):
custom_settings = {
"ROBOTSTXT_OBEY": False,
+ "DOWNLOAD_DELAY": 60,
+ "DOWNLOAD_TIMEOUT": 120,
+ "RETRY_TIMES": 0,
}
async def start(self):
@@ -20,14 +26,14 @@ async def start(self):
 Building the habit: always seed via start() with errback attached,
never rely on the start_urls shortcut in production spiders.
"""
- yield scrapy.Request(
+ yield Request(
url="https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
callback=self.parse,
errback=self.errback,
meta={"source_type": "web"}, # will become "sso" for protected Twiki pages
)
- def parse(self, response):
+ def parse(self, response: Response) -> Iterator[TestTWikiItem]:
"""
 Twiki pages render their main content inside #twikiMainContents or .patternMain.
@@ -58,16 +64,16 @@ def parse(self, response):
self.logger.info("Found title: %r", title)
self.logger.info("Found %d same-host links", len(same_host_links))
- yield {
- "url": response.url,
- "title": title,
- "body_length": len(body_text),
- "body_preview": body_text[:300],
- "same_host_links_count": len(same_host_links),
- "same_host_links_sample": same_host_links[:5],
- "source_type": response.meta.get("source_type"),
- "content_type": response.headers.get("Content-Type", b"").decode(),
- }
+ yield TestTWikiItem(
+ url=response.url,
+ title=title,
+ body_length=len(body_text),
+ body_preview=body_text[:300],
+ same_host_links_count=len(same_host_links),
+ same_host_links_sample=same_host_links[:5],
+ source_type=response.meta.get("source_type"),
+ content_type=cast(bytes, response.headers.get("Content-Type", b"")).decode()
+ )
def errback(self, failure):
self.logger.error(
From 1ac346895df2a95c1af371902e69504479057273 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 20:33:07 +0100
Subject: [PATCH 05/55] added trivial LinkScraper implementation in scrapy,
scrapers.utils, safe DEPTH_LIMIT.
---
src/data_manager/collectors/scrapers/items.py | 4 +
.../collectors/scrapers/settings.py | 5 +
.../collectors/scrapers/spiders/link.py | 108 ++++++++++++++++++
.../collectors/scrapers/spiders/twiki.py | 26 ++---
src/data_manager/collectors/scrapers/utils.py | 32 ++++++
5 files changed, 162 insertions(+), 13 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/spiders/link.py
create mode 100644 src/data_manager/collectors/scrapers/utils.py
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index 6f88f2242..8c14c6535 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -13,6 +13,10 @@ class WebPageItem(ArchiBaseItem):
content_type = Field() # value of Content-Type response header
encoding = Field() # response encoding (e.g. "utf-8")
+class PDFItem(ArchiBaseItem):
+ """Binary PDF scraped from a web URL."""
+ content_type = Field()
+
class TestTWikiItem(WebPageItem):
"""Item produced by the trivial Twiki spider."""
body_length = Field()
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index f6b3d74e1..39b964b2d 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -56,6 +56,11 @@
# set False in production if log volume is too high.
AUTOTHROTTLE_DEBUG = False
+# ------------------------------------------------------------------ #
+# Depth limiting — safety cap; spiders can narrow via custom_settings.
+# ------------------------------------------------------------------ #
+DEPTH_LIMIT = 2 # hard cap so a misconfigured crawl can't run forever
+
# ---------------------------------------------------------------------------
# Safety: fail loudly on spider import errors
# ---------------------------------------------------------------------------
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
new file mode 100644
index 000000000..86bd94ab3
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -0,0 +1,108 @@
+from typing import Iterator
+from urllib.parse import urlparse
+
+from scrapy import Request, Spider
+from scrapy.http import Response, TextResponse
+
+from src.data_manager.collectors.scrapers.items import PDFItem, WebPageItem
+from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
+
+
+class LinkSpider(Spider):
+ """
+ Generic link-following spider for unauthenticated pages.
+ Stays within the same hostname as start_url, up to max_depth.
+ """
+
+ name = "link"
+ custom_settings = {
+ "DEPTH_LIMIT": 2, # safety cap; narrowed per-crawl via meta["depth"] check
+ }
+
+ def __init__(self, start_url: str = "", max_depth: int = 1, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._start_url = start_url
+ self._base_host = urlparse(start_url).netloc
+ self._max_depth = int(max_depth)
+
+ async def start(self):
+ """
+ Seed request — validates start_url at crawl time, not import time.
+ Building the habit: always attach errback here, never rely on
+ start_urls shortcut in production spiders.
+ """
+ if not self._start_url:
+ raise ValueError("links spider requires -a start_url=")
+ yield Request(
+ url=self._start_url,
+ callback=self.parse,
+ errback=self.errback,
+ meta={"depth": 0},
+ )
+
+ def parse(self, response: Response) -> Iterator[WebPageItem | PDFItem | Request]:
+ """
+ Extract one item per response, then yield follow Requests up to max_depth.
+ @url https://quotes.toscrape.com/
+ @returns items 1
+ @scrapes url content suffix source_type title
+ """
+ self.logger.info("Status %s for %s", response.status, response.url)
+
+ yield from self._extract_item(response)
+
+ current_depth = response.meta.get("depth", 0)
+ if current_depth >= self._max_depth:
+ return
+
+ shlinks = same_host_links(self._base_host, response)
+ self.logger.info(
+ "Found %d same-host links at depth %d", len(shlinks), current_depth
+ )
+
+ for url in shlinks:
+ yield Request(
+ url=url,
+ callback=self.parse,
+ errback=self.errback,
+ meta={"depth": current_depth + 1},
+ )
+
+ def errback(self, failure):
+ self.logger.error(
+ "Request failed: %s — %s",
+ failure.request.url,
+ repr(failure.value),
+ )
+
+ # ------------------------------------------------------------------ #
+ # Private helpers — pure, unit-testable without a reactor
+ # ------------------------------------------------------------------ #
+
+ def _extract_item(self, response: Response) -> Iterator[WebPageItem | PDFItem]:
+ ct = get_content_type(response)
+
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield PDFItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ source_type="web",
+ title="",
+ content_type=ct,
+ )
+ return
+
+ title = response.css("title::text").get(default="").strip()
+ encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
+
+ yield WebPageItem(
+ url=response.url,
+ content=response.text,
+ suffix="html",
+ source_type="web",
+ title=title,
+ content_type=ct,
+ encoding=encoding,
+ )
+
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 333a00a66..2f5d7a9a7 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -3,6 +3,7 @@
from scrapy.http import Response
from urllib.parse import urlparse
from src.data_manager.collectors.scrapers.items import TestTWikiItem
+from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
class TwikiSpider(Spider):
@@ -26,11 +27,16 @@ async def start(self):
 Building the habit: always seed via start() with errback attached,
never rely on the start_urls shortcut in production spiders.
"""
+ start_url = "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
+ base_host = urlparse(start_url).netloc
yield Request(
- url="https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+ url=start_url,
callback=self.parse,
errback=self.errback,
- meta={"source_type": "web"}, # will become "sso" for protected Twiki pages
+ meta={
+ "source_type": "web",
+ "base_host": base_host,
+ },
)
def parse(self, response: Response) -> Iterator[TestTWikiItem]:
@@ -53,26 +59,20 @@ def parse(self, response: Response) -> Iterator[TestTWikiItem]:
response.css("#twikiMainContents *::text, .patternMain *::text").getall()
).strip()
- # Same-host links
- base = "twiki.cern.ch"
- same_host_links = [
- response.urljoin(href)
- for href in response.css("a::attr(href)").getall()
- if urlparse(response.urljoin(href)).netloc == base
- ]
+ shlinks = same_host_links(response.meta['base_host'], response)
self.logger.info("Found title: %r", title)
- self.logger.info("Found %d same-host links", len(same_host_links))
+ self.logger.info("Found %d same-host links", len(shlinks))
yield TestTWikiItem(
url=response.url,
title=title,
body_length=len(body_text),
body_preview=body_text[:300],
- same_host_links_count=len(same_host_links),
- same_host_links_sample=same_host_links[:5],
+ same_host_links_count=len(shlinks),
+ same_host_links_sample=shlinks[:5],
source_type=response.meta.get("source_type"),
- content_type=cast(bytes, response.headers.get("Content-Type", b"")).decode()
+ content_type=get_content_type(response)
)
def errback(self, failure):
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
new file mode 100644
index 000000000..1e7f2e5c6
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -0,0 +1,32 @@
+from typing import List
+from urllib.parse import urlparse
+
+from scrapy.http import Response
+
+_IMAGE_EXTS = frozenset({
+ ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".ico", ".webp"
+})
+
+def same_host_links(base_host, response: Response) -> List[str]:
+ """
+ Return deduplicated same-host, non-image absolute URLs on this page.
+ """
+
+ seen = set()
+ links = []
+ for href in response.css("a::attr(href)").getall():
+ url = response.urljoin(href)
+ parsed = urlparse(url)
+ if parsed.netloc != base_host:
+ continue
+ if any(parsed.path.lower().endswith(e) for e in _IMAGE_EXTS):
+ continue
+ if url not in seen:
+ seen.add(url)
+ links.append(url)
+ return links
+
+def get_content_type(response: Response) -> str:
+ """Decode the Content-Type header bytes to str."""
+ raw: bytes = response.headers.get("Content-Type", b"") or b""
+ return raw.decode("utf-8", errors="replace")
From 9035fe6c9cba20e132c260615ef5b9d306571006 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 21:22:51 +0100
Subject: [PATCH 06/55] Unit-testable parser practice with a trivial real Twiki
parser offline test-cases.
---
src/data_manager/collectors/scrapers/items.py | 2 -
.../collectors/scrapers/spiders/twiki.py | 62 +-
...ew_cmspublic_crab3_configuration_file.html | 993 ++++++++++++++++++
tests/unit/test_twiki_parser.py | 19 +
4 files changed, 1045 insertions(+), 31 deletions(-)
create mode 100644 tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html
create mode 100644 tests/unit/test_twiki_parser.py
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index 8c14c6535..d0d3b0a4f 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -21,5 +21,3 @@ class TestTWikiItem(WebPageItem):
"""Item produced by the trivial Twiki spider."""
body_length = Field()
body_preview = Field()
- same_host_links_count = Field()
- same_host_links_sample = Field()
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 2f5d7a9a7..5ec8bab03 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -1,10 +1,12 @@
-from typing import Iterator, cast
+import logging
+from typing import Iterator
from scrapy import Spider, Request
from scrapy.http import Response
from urllib.parse import urlparse
from src.data_manager.collectors.scrapers.items import TestTWikiItem
from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
+logger = logging.getLogger(__name__)
class TwikiSpider(Spider):
"""
@@ -28,18 +30,17 @@ async def start(self):
never rely on the start_urls shortcut in production spiders.
"""
start_url = "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
- base_host = urlparse(start_url).netloc
+ self._base_host = urlparse(start_url).netloc
yield Request(
url=start_url,
callback=self.parse,
errback=self.errback,
meta={
"source_type": "web",
- "base_host": base_host,
},
)
- def parse(self, response: Response) -> Iterator[TestTWikiItem]:
+ def parse(self, response: Response) -> Iterator[TestTWikiItem | Request]:
"""
 Twiki pages render their main content inside #twikiMainContents or .patternMain.
@@ -49,31 +50,10 @@ def parse(self, response: Response) -> Iterator[TestTWikiItem]:
"""
self.logger.info("Status %s for %s", response.status, response.url)
- # Twiki-specific selectors
- title = response.css("#topic-title::text, .patternTitle::text").get(default="")
- if not title:
- title = response.css("title::text").get(default="").replace(" < TWiki", "").strip()
-
- # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
- body_text = " ".join(
- response.css("#twikiMainContents *::text, .patternMain *::text").getall()
- ).strip()
-
- shlinks = same_host_links(response.meta['base_host'], response)
-
- self.logger.info("Found title: %r", title)
- self.logger.info("Found %d same-host links", len(shlinks))
-
- yield TestTWikiItem(
- url=response.url,
- title=title,
- body_length=len(body_text),
- body_preview=body_text[:300],
- same_host_links_count=len(shlinks),
- same_host_links_sample=shlinks[:5],
- source_type=response.meta.get("source_type"),
- content_type=get_content_type(response)
- )
+ yield from parse_twiki_page(response) # Yield item
+ # then, follow links
+ shlinks = same_host_links(self._base_host, response)
+ logger.info("Found %d same-host links", len(shlinks))
def errback(self, failure):
self.logger.error(
@@ -81,3 +61,27 @@ def errback(self, failure):
failure.request.url,
repr(failure.value),
)
+
+def parse_twiki_page(response: Response) -> Iterator[TestTWikiItem]:
+ # Twiki-specific selectors
+ title = (
+ response.css("#topic-title::text").get()
+ or response.css(".patternTitle::text").get()
+ or response.css("title::text").get("").split("<")[0].strip()
+ )
+ # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
+ body_text = " ".join(
+ response.css("#twikiMainContents *::text, .patternMain *::text").getall()
+ ).strip()
+
+
+ logger.info("Found title: %r", title)
+
+ yield TestTWikiItem(
+ url=response.url,
+ title=title,
+ body_length=len(body_text),
+ body_preview=body_text[:300],
+ source_type=response.meta.get("source_type"),
+ content_type=get_content_type(response)
+ )
diff --git a/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html b/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html
new file mode 100644
index 000000000..667b81951
--- /dev/null
+++ b/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html
@@ -0,0 +1,993 @@
+
+
+
+
+
+
+
+
+ CRAB3ConfigurationFile < CMSPublic < TWiki
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
CRAB configuration file
+
+
+
+
+
+
CRAB configuration file
+
+
+For convenience, we suggest to place the CRAB configuration file in the same directory as the CMSSW parameter-set file to be used by CRAB.
+
+The expected default name of the CRAB configuration file is
crabConfig.py, but of course one can give it any name (respecting always the filename extension
.py and not adding other dots in the filename), as long as one specifies the name when required (e.g. when issuing the CRAB submission command).
+
+In CRAB3 the configuration file is in Python language. It consists of creating a
Configuration object imported from the
WMCore library:
+
+
+import CRABClient
+from WMCore.Configuration import Configuration
+config = Configuration()
+
+
+Once the
Configuration object is created, it is possible to add new sections to it with corresponding parameters. This is done using the following syntax:
+
+
+config.section_("<section-name>")
+config.<section-name>.<parameter-name> = <parameter-value>
+
+
+
Abbreviated configuration definition
+Those lines can be simplified a bit by using instead the following which already defines all the
config.<section-name> objects, but the more
+explicit format above is more clear and it is the one most commonly used
+
+import CRABClient
+from CRABClient.UserUtilities import config
+
+
+
+
+
+
+
+
CRAB configuration sections
+
+
+The table below shows what are the sections currently available for CRAB configuration.
+
+
+
+ Section
+ Description
+
+
+ General
+ In this section, the user specifies generic parameters about the request (e.g. request name).
+
+
+ JobType
+ This section aims to contain all the parameters of the user job type and related configurables (e.g. CMSSW parameter-set configuration file, additional input files, etc.).
+
+
+ Data
+ This section contains all the parameters related to the data to be analyzed, including the splitting parameters.
+
+
+ Site
+ Grid site parameters are defined in this section, including the stage out information (e.g. stage out destination site, white/black lists, etc.).
+
+
+ User
+ This section is dedicated to all the information relative to the user (e.g. voms information).
+
+
+ Debug
+ For experts use only.
+
+
+
+
Predefined CRAB configuration file with empty skeleton
+
+To simplify life a bit, CRAB provides a function
config that returns a
Configuration object with pre-defined sections. The function is in the
CRABClient.UserUtilities module. Users can import and use the function in their CRAB configuration file:
+
+
+from CRABClient.UserUtilities import config
+config = config()
+
+
+which, from the point of view of the
Configuration instance, is equivalent to:
+
+
+from WMCore.Configuration import Configuration
+config = Configuration()
+
+config.section_("General")
+config.section_("JobType")
+config.section_("Data")
+config.section_("Site")
+config.section_("User")
+config.section_("Debug")
+
+
+
+
CRAB configuration parameters
+
+
+The table below provides a list of all the available CRAB configuration parameters (organized by sections), including a short description. Mandatory parameters are marked with two stars (**). Other important parameters are marked with one star (*).
+
+
+
+ Parameter
+ Type
+ Description
+
+
+ Section General
+
+
+
+
+ requestName (*)
+ string
+ A name the user gives to it's request/task. In particular, it is used by CRAB to create a project directory (named crab_<requestName>) where files corresponding to this particular task will be stored. Defaults to <time-stamp>, where the time stamp is of the form <YYYYMMDD>_<hhmmss> and corresponds to the submission time. The maximum allowed length is 100 characters, according to the formati in RX_TASKNAME . Task submission will fail with "Incorrect 'workflow' parameter" if other characters are used.
+
+
+ workArea (*)
+ string
+ The area (full or relative path) where to create the CRAB project directory. If the area doesn't exist, CRAB will try to create it using the mkdir command. Defaults to the current working directory.
+
+
+ transferOutputs (*)
+ boolean
+ Whether or not to transfer the output files to the storage site. If set to False, the output files are discarded and the user can not recover them. Defaults to True.
+
+
+ transferLogs (*)
+ boolean
+ Whether or not to copy the jobs log files to the storage site. If set to False, the log files are discarded and the user can not recover them. Notice however that a short version of the log files containing the first 1000 lines and the last 3000 lines are still available through the monitoring web pages. Defaults to False.
+
+
+ failureLimit
+ integer
+ The number of jobs that may fail permanently before the entire task is cancelled. Disabled by default. Note: a very dangerous parameter, for expert use, do not touch it unless you are sure of what you are doing
+
+
+ instance (**)
+ string
+ The CRAB server instance where to submit the task. For users please use 'prod'.
+
+
+ activity
+ string
+ The activity name used when reporting to Dashboard. For experts use only.
+
+
+
+
+
+
+
+ Section JobType
+
+
+
+
+ pluginName (**)
+ string
+ Specifies if this task is running an analysis ('Analysis') on an existing dataset or is running MC event generation ('PrivateMC').
+
+
+ psetName (*)
+ string
+ The name of the CMSSW parameter-set configuration file that should be run via cmsRun. Defaults to 'pset.py'.
+
+
+ generator
+ string
+ This parameter should be set to 'lhe' when running MC generation on LHE files. Automatically set if an LHESource is present in the parameter-set.
+
+
+ pyCfgParams
+ list of strings
+	  List of parameters to pass to the CMSSW parameter-set configuration file, as explained here . For example, if set to ['myOption','-param1=value1','--param2=value2'], then the jobs will execute cmsRun JobType.psetName myOption -param1=value1 --param2=value2. NOTE: No blanks allowed in 'param=value' see the note about pyCfgParams below.
+
+
+ inputFiles
+ list of strings
+ List of private input files (and/or directories) needed by the jobs. They will be added to the input sandbox. The input sandbox can not exceed 120 MB. The input sandbox is shipped with each job. The input files will be placed in the working directory where the users' application (e.g. cmsRun) is launched regardless of a possible path indicated in this parameter (i.e. only the file name at right of last / is relevant). Directories are tarred and their subtree structure is preserved. Please check the FAQ for more details on how these files are handled.
+
+
+ disableAutomaticOutputCollection
+ boolean
+ Whether to disable or not the automatic recognition of output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration. If set to True, it becomes the user's responsibility to specify in the JobType.outputFiles parameter all the output files that need to be collected. Defaults to False.
+
+
+ outputFiles
+ list of strings
+ List of output files that need to be collected. If disableAutomaticOutputCollection = False (the default), output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration are automatically recognized by CRAB and don't need to be included in this parameter. If publication in DBS is requested (the default) file names must obey DBS lexicon rules, in particular end with .root. If Data.publication is set to False any reasonable string will do.
+
+
+ eventsPerLumi
+ integer
+ Deprecated. Use Data.lumisPerFile instead.
+
+
+ allowUndistributedCMSSW
+ boolean
+ Whether to allow or not using a CMSSW release possibly not available at sites. Defaults to False.
+
+
+ maxMemoryMB
+ integer
+ Maximum amount of memory (in MB) a job is allowed to use. Defaults to 2000. The more memory you request, the more difficult will be to find a slot where to run your jobs. Maximum for single core jobs is 5000. for multiple cores can be up to 2500*numCores
+
+
+ maxJobRuntimeMin
+ integer
+ The maximum runtime (in minutes) per job. Jobs running longer than this amount of time will be removed. Defaults to 1315 (21 hours 55 minutes), see the note about maxJobRuntimeMin below. Not compatible with Automatic splitting.
+
+
+ numCores
+ integer
+ Number of requested cores per job. Defaults to 1. If you increase this value to run multi-threaded cmsRun, you may need to increase maxMemoryMB as well. In the CMSSW parameter-set configuration you may require also the number of streams to be larger than one per thread, which affects the memory consumption too.
+
+
+ priority
+ integer
+ Task priority among the user's own tasks. Higher priority tasks will be processed before lower priority. Two tasks of equal priority will have their jobs start in an undefined order. The first five jobs in a task are given a priority boost of 10. Defaults to 10.
+
+
+ scriptExe
+ string
+	  A user script that should be run on the worker node instead of the default cmsRun. It is up to the user to setup the script properly to run on the worker node environment. CRAB guarantees that the CMSSW environment is setup (e.g. scram is in the path) and that the modified CMSSW parameter-set configuration file will be placed in the working directory with name PSet.py. The user must ensure that a properly named framework job report file will be written; this can be done e.g. by calling cmsRun within the script as cmsRun -j FrameworkJobReport.xml -p PSet.py. The script itself will be added automatically to the input sandbox. Output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration file will be automatically collected (CRAB3 will look in the framework job report). The user needs to specify other output files to be collected in the JobType.outputFiles parameter. See CRAB3AdvancedTopic#Running_a_user_script_with_CRAB for more information.
+
+
+ scriptArgs
+ list of strings
+ Additional arguments (in the form param=value) to be passed to the script specified in the JobType.scriptExe parameter. The first argument passed to the script is always the job number
+
+
+ sendPythonFolder
+ boolean
+ Obsolete. The 'python' folder in the CMSSW release ($CMSSW_BASE/python) is always included in the sandbox
+
+
+ sendVenvFolder
+ boolean
+ Determine if the =venv= folder in the CMSSW release ($CMSSW_BASE/venv) is included in the sandbox or not. Contrary to other sandbox files, symbolic links found in venv are not dereferenced. Defaults to False.
+
+
+ sendExternalFolder
+ boolean
+ Determine if the 'external' folder in the CMSSW release ($CMSSW_BASE/external) is included in the sandbox or not. See https://hypernews.cern.ch/HyperNews/CMS/get/computing-tools/1972.html . Defaults to False.
+
+
+ externalPluginFile
+ string
+ Name of a plug-in provided by the user and which should be run instead of the standard CRAB plug-in Analysis or PrivateMC. Can not be specified together with pluginName; is either one or the other. Not supported yet.
+
+
+
+
+
+
+
+ Section Data
+
+
+
+
+ inputDataset (*)
+ string
+	  When running an analysis over a dataset registered in DBS, this parameter specifies the name of the dataset. The dataset can be an official CMS dataset or a dataset produced by a user or a Rucio DID as explained in this FAQ .
+
+
+ inputBlocks
+ list
+ A list of DBS block names in the format datasetname#uuid. If present only those blocks will be processed, instead of the full dataset. The dataset in the block names must be the same as indicated in inputDataset.
+
+
+ allowNonValidInputDataset
+ boolean
+ Allow CRAB to run over (the valid files of) the input dataset given in Data.inputDataset even if its status in DBS is not VALID. Defaults to False.
+
+
+ outputPrimaryDataset (*)
+ string
+ When running an analysis over private input files or running MC generation, this parameter specifies the primary dataset name that should be used in the LFN of the output/log files and in the publication dataset name (see Data handling in CRAB ).
+
+
+ inputDBS (*)
+ string
+ The URL of the DBS reader instance where the input dataset is published. The URL is of the form 'https://cmsweb.cern.ch/dbs/prod/<instance>/DBSReader', where instance can be global, phys01, phys02 or phys03. The default is global instance. The aliases global, phys01, phys02 and phys03 in place of the whole URLs are also supported (and indeed recommended to avoid typos). For datasets that are not of USER tier, CRAB only allows to read them from global DBS.
+
+
+ splitting (*)
+ string
+ Mode to use to split the task in jobs. When JobType.pluginName = 'Analysis', the splitting mode can either be 'Automatic' (the default, please read the dedicated FAQ ), 'FileBased', 'LumiBased', or 'EventAwareLumiBased' (for Data the recommended mode is 'Automatic' or 'LumiBased'). For 'EventAwareLumiBased', CRAB will split the task by luminosity sections, where each job will contain a varying number of luminosity sections such that the number of events analyzed by each job is roughly unitsPerJob. When JobType.pluginName = 'PrivateMC', the splitting mode can only be 'EventBased'.
+
+
+ unitsPerJob (*)
+ integer
+	  Mandatory when Data.splitting is not 'Automatic', suggests (but does not impose) how many units (i.e. files, luminosity sections or events - depending on the splitting mode - see the note about Data.splitting below) to include in each job. When Data.splitting = 'Automatic' it represents the jobs target runtime in minutes and its minimum allowed value is 180 (i.e. 3 hours).
+
+
+ totalUnits (*)
+ integer
+ Mandatory when JobType.pluginName = 'PrivateMC', in which case the parameter tells how many events to generate in total. When JobType.pluginName = 'Analysis', this parameter tells how many files (when Data.splitting = 'FileBased'), luminosity sections (when Data.splitting = 'LumiBased') or events (when Data.splitting = 'EventAwareLumiBased' or Data.splitting = 'Automatic' - see the note about "Data.splitting" below) to analyze (after applying the lumi-mask and/or run range filters).
+
+
+ lumisPerFile
+ integer
+	  When JobType.pluginName = 'PrivateMC', this parameter specifies how many luminosity sections will be present in each output file. It should be used only in very special and well motivated use cases. Note that every job starts with a fresh luminosity section, which may lead to unevenly sized luminosity sections if Data.unitsPerJob is not a multiple of this parameter. Defaults to 1.
+
+
+ useParent
+ boolean
+ Adds corresponding parent dataset in DBS as secondary input source. Allows to gain access to more data tiers than present in the current dataset. This will not check for parent dataset availability; jobs may fail with xrootd errors or due to missing dataset access. Defaults to False.
+
+
+ secondaryInputDataset
+ string
+ An extension of the Data.useParent parameter. Allows to specify any grandparent dataset in DBS (same instance as the primary dataset) as secondary input source. CRAB will internally set this dataset as the parent and will set Data.useParent = True. Therefore, Data.useParent and Data.secondaryInputDataset can not be used together a priori .
+
+
+ lumiMask (*)
+ string
+	  A lumi-mask to apply to the input dataset before analysis. Can either be a URL address or the path to a JSON file on disk. Defaults to an empty string (no lumi-sections filter).
+
+
+ runRange (*)
+ string
+ The runs and/or run ranges to process (e.g. '193093-193999,198050,199564'). It can be used together with a lumi-mask. Defaults to an empty string (no run filter).
+
+
+ outLFNDirBase (*)
+ string
+ The first part of the LFN of the output files (see Data handling in CRAB ). Accepted values are /store/user/<username>[/<subdir>*] (the trailing / after <username> can not be omitted if a subdir is not given) and /store/group/<groupname>[/<subgroupname>*] (and /store/local/<dir>[/<subdir>*] if Data.publication = False). Defaults to /store/user/<username>/. CRAB creates the outLFNDirBase path on the storage site if needed, do not create it yourself otherwise the file stage-out may fail due to permissions inconsistency. Note: even if publication is disabled, the LFN needs to be a valid LFN name for DBS. So keep in mind that 1) name of first subdir after username (or groupname ) must start with a letter, not a number 2) do not use dot as separator inside names 3) LFN must end with .root
+
+
+ publication (*)
+ boolean
+ Whether to publish or not the EDM output files (i.e. output files produced by PoolOutputModule) in DBS. Notice that for publication to be possible, the corresponding output files have to be transferred to the permanent storage element. Defaults to True.
+
+
+ publishDBS (*)
+ string
+ The URL of the DBS writer instance where to publish. The URL is of the form 'https://cmsweb.cern.ch/dbs/prod/<instance>/DBSWriter', where instance can so far only be phys03, and therefore it is set as the default, so the user doesn't have to specify this parameter. The alias phys03 in place of the whole URL is also supported.
+
+
+ outputDatasetTag (*)
+ string
+ A custom string used in both, the LFN of the output files (even if Data.publication = False) and the publication dataset name (if Data.publication = True) (see Data handling in CRAB ).
+
+
+ ignoreLocality
+ boolean
+ Defaults to False. DO NOT USE
+
+
+ userInputFiles
+ list of strings
+ This parameter serves to run an analysis over a set of input files, as opposed to run over an full dataset from DBS. Format is: Data.userInputFiles = ['file1', 'file2', 'etc']. When this parameter is used, the only allowed splitting mode is 'FileBased'. There are two ways to use this. 1) as a fileMask, analogous to inputBlocks. If specified together with Data.inputDataset it must contain a list of LFN's and only the listed files will be processed from that Dataset. An error will be raised if some LFN does not belong to the input dataset. 2) as a pure "list of files" in case Data.inputDataset is missing. In this case 'fileN' can be an LFN (i.e. a string starting with /store/), or PFN (i.e. a string starting with protocol-prefix://store/ like e.g. a pointer to an xrootd redirector ref ). One could also have a local text file containing the list of input files (one file per line; don't include quotation marks nor commas) and then specify in this parameter the following: Data.userInputFiles = open('/path/to/local/file.txt').readlines(). Also, since there is no input dataset from where to extract the primary dataset name, the user must use the parameter Data.outputPrimaryDataset to define it. CRAB will not do any data discovery and user must specify the locations where to run the jobs via the Site.whitelist parameter.
+
+
+ partialDataset
+ boolean
+	  Allow to process input dataset that is only partially on disk. Normally, when CRAB finds out that some files of the input dataset are not fully replicated on disk, CRAB will issue tape recall to Rucio and wait for all files to be on disk before running the task. If partialDataset is True, CRAB will submit the task to condor immediately without requesting tape recall and process the files currently on disk.
+
+
+
+
+
+
+
+ Section Site
+
+
+
+
+ storageSite (**)
+ string
+ Site where the output files should be permanently copied to. See the note about storageSite below.
+
+
+ whitelist
+ list of strings
+ A user-specified list of sites where the jobs can run. For example: ['T2_CH_CERN','T2_IT_Bari',...]. Jobs will not be assigned to a site that is not in the white list. Note that at times this list may not be respected, see this FAQ
+
+
+ blacklist
+ list of strings
+ A user-specified list of sites where the jobs should not run. Useful to avoid jobs to run on a site where the user knows they will fail (e.g. because of temporary problems with the site). Note that at times this list may not be respected, see this FAQ
+
+
+ ignoreGlobalBlacklist
+ boolean
+ Whether or not to ignore the global site blacklist provided by the Site Status Board. Should only be used in special cases with a custom whitelist or blacklist to make sure the jobs land on the intended sites.
+
+
+ requireAccelerator
+ boolean
+ Defaults to False. Set to True to request GPU node for the jobs. Please see CMS Submission Infrastructure: GPUs monitor dashboard to check sites and GPUs availability.
+
+
+ acceleratorParams
+ dictionary
+ Defaults to {}. When Site.requireAccelerator is True, this parameter dictionary will be used to specify detailed GPU resource requirements for the jobs. Please see CMS Submission Infrastructure: GPUs monitor dashboard to check GPU's Memory, Capacity and Runtime availability. See Example parameters below.
+
+
+config.Site.acceleratorParams = {
+ "GPUMemoryMB": "4000",
+ "GPUMinimumCapability": "7.0",
+ "GPUMaximumCapability": "8.0",
+ "GPURuntime": "12.1"
+}
+
+
+
+
+
+
+
+
+ Section User
+
+
+
+
+ voGroup
+ string
+ The VO group that should be used with the proxy and under which the task should be submitted.
+
+
+ voRole
+ string
+ The VO role that should be used with the proxy and under which the task should be submitted.
+
+
+
+
+
+
+
+ Section Debug
+
+
+
+
+ oneEventMode
+ boolean
+ For experts use only.
+
+
+ asoConfig
+ list of dictionaries
+ For experts use only.
+
+
+ scheddName
+ string
+ For experts use only. NB if you select a schedd on the ITB pool, remember to change the collector accordingly!
+
+
+ extraJDL
+ list of strings
+ For experts use only.
+
+
+ collector
+ string
+ For experts use only.
+
+
+
+
Note for Data.splitting = 'EventAwareLumiBased'
+When CRAB does data discovery of the input dataset in DBS, the number of events is only known per input file (because that's the information available on DBS) and not per luminosity section. CRAB can therefore only estimate the number of events per luminosity section in a given input file as the number of events in the file divided by the number of luminosity sections in the file. Because of that,
Data.unitsPerJob and
Data.totalUnits should not be considered by the user as rigorous limits, but as limits applicable on average.
+
+
Note for maxJobRuntimeMin
+We strongly encourage every user to tune their splitting parameters aiming for jobs to run for a few hours, ideally 8-10 hours, and set the
maxJobRuntimeMin accordingly.
+Having many jobs increases the chance of failure, since the number of problems is roughly proportional to the number of run jobs. Moreover, short jobs suffer from start/end overheads resulting in poor CPU/Wall-clock ratio, which impacts negatively CMS and makes it harder to secure additional resources.
+
+
Note for pyCfgParams
+Either
JobType.pyCfgParams=["arg=value" ] or
JobType.pyCfgParams=["arg", "value" ] are fine (the latter somehow more correct), but
JobType.pyCfgParams=["arg value" ] will not work. You can use python's
shlex.split to convert args from "the way you type them when running cmsRun interactively" to the correct format. Example
+
import shlex
+args = '--arg 1 --another-arg "my name"'
+config.JobType.pyCfgParams = shlex.split(args)
+results in
+
['--arg', '1', '--another-arg', 'my name'] i.e. it correctly preserves quoted spaces etc.
+
+N.B. there has been reports years ago that params with double dashes may break things. Most likely it was due to (now very old and unsupported) old python and/or CMSSW versions. But if you get odd errors you may check for this and report.
+
+
+
+
Note for storageSite
+In CRAB3 the output files of each job are transferred first to a temporary storage element in the site where the job ran and later from there to a permanent storage element in a destination site. The transfer to the permanent storage element is done asynchronously by a service called AsyncStageOut (ASO). The destination site must be specified in the
Site.storageSite parameter in the form
'Tx_yy_zzzzz' (e.g.
'T2_IT_Bari',
'T2_US_Nebraska', etc.). The official names of CMS sites can be found in the
CRIC web page.
The user MUST have write permission in the storage site.
+
+
+
Passing CRAB configuration parameters from the command line
+
+
+It is possible to define/overwrite CRAB configuration parameters by passing them through the command line when the
crab submit command is executed. Parameters can be set with the convention
<parameter-name>=<parameter-value> and can be sequentially listed separating them with a blank space. Here is an example on how one would pass the request name and the publication name:
+
+
+crab submit -c my_crab_config_file.py General.requestName=my_request_name Data.outputDatasetTag=my_publication_name
+
+
+
Note : Currently it is only possible to overwrite the parameters that take as value a string, an integer, a float or a boolean. Parameters that take a list can not be overwritten this way.
+
+
+
Converting a CRAB2 configuration file into a CRAB3 configuration file
+
+
+CRAB3 is essentially new compared to CRAB2; it is not just a re-write. As a consequence, the configuration is different and there is no direct trivial translation that can be done automatically for every CRAB2 configuration file into a CRAB3 one. There is only a basic CRAB3 utility, called
crab2cfgTOcrab3py, meant to help the user to convert an existing CRAB2 configuration file into a CRAB3 configuration file template. The user has to provide the name of the CRAB2 configuration file he/she wants to convert and the name he/she wants to give to the CRAB3 configuration file (both arguments have default values;
crab.cfg and
crabConfig.py respectively).
+
+
+crab2cfgTOcrab3py [crab2configName.cfg] [crab3configName.py]
+
+
+Instead of blindly taking the produced CRAB3 configuration file and run it, the user should always inspect the produced file, understand what each parameter means, edit them and add other parameters that might be needed, etc.
+
+Here we give a usage example. Suppose we have the following CRAB2 configuration file with the default name
crab.cfg:
+
+
+[CRAB]
+jobtype = cmssw
+scheduler = remoteGlidein
+use_server = 0
+
+[CMSSW]
+datasetpath = /GenericTTbar/HC-CMSSW_5_3_1_START53_V5-v1/GEN-SIM-RECO
+dbs_url = global
+pset = my_CMSSW_config.py
+number_of_jobs = 100
+events_per_job = 20
+output_file = output.root
+
+[GRID]
+se_white_list = T2_IT_Bari
+se_black_list = T2_IT_Legnaro
+data_location_override = T2_IT_Bari
+
+[USER]
+ui_working_dir = my_CRAB_project_directory
+return_data = 0
+copy_data = 1
+storage_element = T2_IT_Legnaro
+user_remote_dir = my_remote_directory
+publish_data = 1
+publish_data_name = my_publication_name
+dbs_url_for_publication = phys03
+
+
+If we run the tool without specifying any input parameters:
+
+
+crab2cfgTOcrab3py
+
+
+it will create a file
crabConfig.py with the following content:
+
+
+from WMCore.Configuration import Configuration
+config = Configuration()
+config.section_('General')
+config.General.transferOutputs = True
+config.General.requestName = 'my_CRAB_project_directory'
+config.section_('JobType')
+config.JobType.psetName = 'my_CMSSW_config.py'
+config.JobType.pluginName = 'Analysis'
+config.JobType.outputFiles = ['output.root']
+config.section_('Data')
+config.Data.inputDataset = '/GenericTTbar/HC-CMSSW_5_3_1_START53_V5-v1/GEN-SIM-RECO'
+config.Data.publication = True
+config.Data.unitsPerJob = 20
+config.Data.publishDBS = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
+config.Data.splitting = 'EventBased'
+config.Data.inputDBS = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader/'
+config.Data.outputDatasetTag = 'my_publication_name'
+config.section_('Site')
+config.Site.blacklist = ['T2_IT_Legnaro']
+config.Site.whitelist = ['T2_IT_Bari']
+config.Site.storageSite = 'T2_IT_Legnaro'
+
+
+and it will show the following screen output:
+
+
+Convertion done!
+crab2cfgTOcrab3py report:
+CRAB2 parameters not YET supported in CRAB3:
+ data_location_override,user_remote_dir
+CRAB2 parameters obsolete in CRAB3:
+ return_data,jobtype,scheduler,use_server
+
+
+As we already emphasized, the template configuration file produced by the
crab2cfgTOcrab3py utility should not be used before carefully looking into its content. Along this line, one can see for example that the parameter
JobType.outputFiles was set to
['output.root']. If
output.root is defined in the CMSSW parameter-set configuration file in an output module, then it doesn't have to be included in the
JobType.outputFiles list (although it doesn't harm).
+
+
+
+
+--
AndresTanasijczuk - 02 Oct 2014
+
+
+
+
+
+
+
+
+
+
+
Copyright © 2008-2026 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
or Ideas, requests, problems regarding TWiki? use
Discourse or
Send feedback
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/unit/test_twiki_parser.py b/tests/unit/test_twiki_parser.py
new file mode 100644
index 000000000..044899616
--- /dev/null
+++ b/tests/unit/test_twiki_parser.py
@@ -0,0 +1,19 @@
+# tests/unit/test_twiki_parser.py
+from pathlib import Path
+from scrapy.http import HtmlResponse, Request
+from src.data_manager.collectors.scrapers.spiders.twiki import parse_twiki_page
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+def fake_html_response(url: str, fixture_name: str) -> HtmlResponse:
+ body = (FIXTURES / fixture_name).read_bytes()
+ return HtmlResponse(url=url, body=body, encoding="utf-8", request=Request(url=url))
+
+class TestParseTwikiPage:
+ def test_prefers_topic_title(self):
+ response = fake_html_response(
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+ "twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html",
+ )
+ item = next(parse_twiki_page(response))
+ assert item["title"] == "CRAB3ConfigurationFile"
From 8e9ac3711e897443cd9f7344e50ab327d13eaf5d Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 22:33:34 +0100
Subject: [PATCH 07/55] scrapers resource adapter, scrapy Item -> Archi's
ScrapedResource.
---
src/data_manager/collectors/scrapers/items.py | 3 +-
.../collectors/scrapers/resource_adapter.py | 59 ++++++++++
.../collectors/scrapers/spiders/twiki.py | 12 +-
tests/unit/test_scrapers_resource_adapter.py | 105 ++++++++++++++++++
4 files changed, 169 insertions(+), 10 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/resource_adapter.py
create mode 100644 tests/unit/test_scrapers_resource_adapter.py
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index d0d3b0a4f..67d775de7 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -5,7 +5,6 @@ class ArchiBaseItem(Item):
url = Field() # canonical URL of the page
content = Field() # str (HTML/Markdown/text) or bytes (PDF)
suffix = Field() # "html" | "pdf" | "md" | ...
- source_type = Field() # "web" | "sso" | "git" | ...
title = Field() # page title, may be empty
class WebPageItem(ArchiBaseItem):
@@ -17,7 +16,7 @@ class PDFItem(ArchiBaseItem):
"""Binary PDF scraped from a web URL."""
content_type = Field()
-class TestTWikiItem(WebPageItem):
+class TWikiPageItem(WebPageItem):
"""Item produced by the trivial Twiki spider."""
body_length = Field()
body_preview = Field()
diff --git a/src/data_manager/collectors/scrapers/resource_adapter.py b/src/data_manager/collectors/scrapers/resource_adapter.py
new file mode 100644
index 000000000..e188fdc5f
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/resource_adapter.py
@@ -0,0 +1,59 @@
+"""
+Single-dispatch adapter: converts Scrapy Items into ScrapedResource.
+
+Design principles:
+- Items are dumb data bags. They know nothing about ScrapedResource.
+- This is the ONLY place that knows about both schemas.
+- New sources: add a @to_scraped_resource.register block here. Touch nothing else.
+- Do NOT reconstruct ResourceMetadata — ScrapedResource.get_metadata() already
+ derives display_name, url, suffix, source_type from raw fields. Pass raw values only.
+
+Constraint: ~50 LOC of logic.
+
+Adding a new source (e.g. TwikiPageItem):
+ @to_scraped_resource.register(TwikiPageItem)
+ def _twiki(item) -> ScrapedResource:
+ ...
+
+If two sources share identical mapping logic, stack decorators:
+ @to_scraped_resource.register(WebPageItem)
+ @to_scraped_resource.register(TwikiPageItem)
+ def _html_page(item) -> ScrapedResource:
+ ...
+ Note: do NOT use union type hints (WebPageItem | TwikiPageItem) —
+ singledispatch ignores annotations, it dispatches on runtime type only.
+"""
+from __future__ import annotations
+
+from functools import singledispatch
+
+from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
+from src.data_manager.collectors.scrapers.items import WebPageItem, TWikiPageItem
+
+
+@singledispatch
+def to_scraped_resource(item) -> ScrapedResource:
+ """Raises for unregistered types — fail loudly, never silently skip."""
+ raise TypeError(
+ f"No adapter registered for item type {type(item).__name__!r}. "
+ "Add @to_scraped_resource.register(YourItemClass) in this module."
+ )
+
+@to_scraped_resource.register(WebPageItem)
+def _web(item): return _html_page(item, source_type="web")
+
+@to_scraped_resource.register(TWikiPageItem)
+def _twiki(item): return _html_page(item, source_type="twiki")
+
+def _html_page(item, source_type) -> ScrapedResource:
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "html"),
+ source_type=source_type,
+ metadata={
+ "content_type": item.get("content_type"),
+ "encoding": item.get("encoding"),
+ "title": item.get("title"),
+ },
+ )
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 5ec8bab03..969bd0476 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -3,7 +3,7 @@
from scrapy import Spider, Request
from scrapy.http import Response
from urllib.parse import urlparse
-from src.data_manager.collectors.scrapers.items import TestTWikiItem
+from src.data_manager.collectors.scrapers.items import TWikiPageItem
from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
logger = logging.getLogger(__name__)
@@ -35,12 +35,9 @@ async def start(self):
url=start_url,
callback=self.parse,
errback=self.errback,
- meta={
- "source_type": "web",
- },
)
- def parse(self, response: Response) -> Iterator[TestTWikiItem | Request]:
+ def parse(self, response: Response) -> Iterator[TWikiPageItem | Request]:
"""
Twiki pages render their main content inside #patternMain or .twikiMain.
@@ -62,7 +59,7 @@ def errback(self, failure):
repr(failure.value),
)
-def parse_twiki_page(response: Response) -> Iterator[TestTWikiItem]:
+def parse_twiki_page(response: Response) -> Iterator[TWikiPageItem]:
# Twiki-specific selectors
title = (
response.css("#topic-title::text").get()
@@ -77,11 +74,10 @@ def parse_twiki_page(response: Response) -> Iterator[TestTWikiItem]:
logger.info("Found title: %r", title)
- yield TestTWikiItem(
+ yield TWikiPageItem(
url=response.url,
title=title,
body_length=len(body_text),
body_preview=body_text[:300],
- source_type=response.meta.get("source_type"),
content_type=get_content_type(response)
)
diff --git a/tests/unit/test_scrapers_resource_adapter.py b/tests/unit/test_scrapers_resource_adapter.py
new file mode 100644
index 000000000..4524f4c2a
--- /dev/null
+++ b/tests/unit/test_scrapers_resource_adapter.py
@@ -0,0 +1,105 @@
+import pytest
+
+from src.data_manager.collectors.scrapers.resource_adapter import to_scraped_resource
+from src.data_manager.collectors.scrapers.items import WebPageItem, TWikiPageItem
+from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
+
+
+# ---------------------------------------------------------------------------
+# WebPageItem adapter
+# ---------------------------------------------------------------------------
+
+# class TestWebAdapter:
+# def _make_item(self, **overrides) -> WebPageItem:
+# base = {
+# "url": "https://example.com/page",
+# "content": "hello",
+# "suffix": "html",
+# "content_type": "text/html; charset=utf-8",
+# "encoding": "utf-8",
+# "title": "Example Page",
+# }
+# return WebPageItem({**base, **overrides})
+#
+# def test_returns_scraped_resource(self):
+# result = to_scraped_resource(self._make_item())
+# assert isinstance(result, ScrapedResource)
+#
+# def test_url_passthrough(self):
+# item = self._make_item(url="https://example.com/foo")
+# assert to_scraped_resource(item).url == "https://example.com/foo"
+#
+# def test_content_passthrough(self):
+# item = self._make_item(content="hi
")
+# assert to_scraped_resource(item).content == "hi
"
+#
+# def test_suffix(self):
+# assert to_scraped_resource(self._make_item(suffix="pdf")).suffix == "pdf"
+#
+# def test_source_type_is_web(self):
+# assert to_scraped_resource(self._make_item()).source_type == "web"
+#
+# def test_metadata_content_type(self):
+# item = self._make_item(content_type="text/html")
+# assert to_scraped_resource(item).metadata["content_type"] == "text/html"
+#
+# def test_metadata_encoding(self):
+# item = self._make_item(encoding="utf-8")
+# assert to_scraped_resource(item).metadata["encoding"] == "utf-8"
+#
+# def test_metadata_title(self):
+# item = self._make_item(title="My Title")
+# assert to_scraped_resource(item).metadata["title"] == "My Title"
+#
+# def test_optional_fields_absent(self):
+# """Adapter must not crash when optional fields are missing."""
+# item = WebPageItem({
+# "url": "https://example.com/page",
+# "content": "body",
+# "suffix": "html",
+# })
+# result = to_scraped_resource(item)
+# assert result.metadata.get("title") is None
+# assert result.metadata.get("encoding") is None
+#
+# def test_binary_content_passthrough(self):
+# """PDF bytes must pass through unchanged."""
+# raw = b"%PDF-1.4 binary content"
+# item = self._make_item(content=raw, suffix="pdf", content_type="application/pdf")
+# result = to_scraped_resource(item)
+# assert result.content == raw
+# assert result.is_binary
+
+
+# ---------------------------------------------------------------------------
+# TWikiPageItem adapter
+# ---------------------------------------------------------------------------
+
+class TestTWikiPageItemAdapter:
+ def _make_item(self, **overrides) -> TWikiPageItem:
+ base = {
+ "url": "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+ "content": "CRAB3ConfigurationFile",
+ "title": "CRAB3ConfigurationFile",
+ }
+ return TWikiPageItem({**base, **overrides})
+
+ def test_returns_scraped_resource(self):
+ assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
+
+ def test_default_source_type_is_twiki(self):
+ assert to_scraped_resource(self._make_item()).source_type == "twiki"
+
+# ---------------------------------------------------------------------------
+# Unregistered item type — must fail loudly
+# ---------------------------------------------------------------------------
+
+class TestUnregisteredItem:
+ def test_raises_type_error_for_unknown_item(self):
+ """Adapter must raise, never silently return None or a half-baked resource."""
+
+ class UnknownItem(dict):
+ pass
+
+ with pytest.raises(TypeError, match="No adapter registered"):
+ to_scraped_resource(UnknownItem({"url": "x", "content": "y"}))
From 94ea4daf49e104c584fba37900880ac57cd02d6b Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 22 Mar 2026 22:49:57 +0100
Subject: [PATCH 08/55] preserve source_type=web for now, rearrange
resource_adapter & unit-tests.
---
.../collectors/scrapers/resource_adapter.py | 24 +++--
.../collectors/scrapers/spiders/link.py | 4 +-
tests/unit/test_scrapers_resource_adapter.py | 98 +++++--------------
3 files changed, 44 insertions(+), 82 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/resource_adapter.py b/src/data_manager/collectors/scrapers/resource_adapter.py
index e188fdc5f..c9d6e08ac 100644
--- a/src/data_manager/collectors/scrapers/resource_adapter.py
+++ b/src/data_manager/collectors/scrapers/resource_adapter.py
@@ -28,7 +28,7 @@ def _html_page(item) -> ScrapedResource:
from functools import singledispatch
from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-from src.data_manager.collectors.scrapers.items import WebPageItem, TWikiPageItem
+from src.data_manager.collectors.scrapers.items import PDFItem, WebPageItem, TWikiPageItem
@singledispatch
@@ -40,17 +40,27 @@ def to_scraped_resource(item) -> ScrapedResource:
)
@to_scraped_resource.register(WebPageItem)
-def _web(item): return _html_page(item, source_type="web")
-
@to_scraped_resource.register(TWikiPageItem)
-def _twiki(item): return _html_page(item, source_type="twiki")
-
-def _html_page(item, source_type) -> ScrapedResource:
+def _html_page(item) -> ScrapedResource:
return ScrapedResource(
url=item["url"],
content=item["content"],
suffix=item.get("suffix", "html"),
- source_type=source_type,
+ source_type="web",
+ metadata={
+ "content_type": item.get("content_type"),
+ "encoding": item.get("encoding"),
+ "title": item.get("title"),
+ },
+ )
+
+@to_scraped_resource.register(PDFItem)
+def _pdf(item) -> ScrapedResource:
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "pdf"),
+ source_type="web",
metadata={
"content_type": item.get("content_type"),
"encoding": item.get("encoding"),
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 86bd94ab3..5ba48a989 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -45,7 +45,7 @@ def parse(self, response: Response) -> Iterator[WebPageItem | PDFItem | Request]
Extract one item per response, then yield follow Requests up to max_depth.
@url https://quotes.toscrape.com/
@returns items 1
- @scrapes url content suffix source_type title
+ @scrapes url content suffix title
"""
self.logger.info("Status %s for %s", response.status, response.url)
@@ -87,7 +87,6 @@ def _extract_item(self, response: Response) -> Iterator[WebPageItem | PDFItem]:
url=response.url,
content=response.body,
suffix="pdf",
- source_type="web",
title="",
content_type=ct,
)
@@ -100,7 +99,6 @@ def _extract_item(self, response: Response) -> Iterator[WebPageItem | PDFItem]:
url=response.url,
content=response.text,
suffix="html",
- source_type="web",
title=title,
content_type=ct,
encoding=encoding,
diff --git a/tests/unit/test_scrapers_resource_adapter.py b/tests/unit/test_scrapers_resource_adapter.py
index 4524f4c2a..b1348dd42 100644
--- a/tests/unit/test_scrapers_resource_adapter.py
+++ b/tests/unit/test_scrapers_resource_adapter.py
@@ -1,81 +1,14 @@
import pytest
from src.data_manager.collectors.scrapers.resource_adapter import to_scraped_resource
-from src.data_manager.collectors.scrapers.items import WebPageItem, TWikiPageItem
+from src.data_manager.collectors.scrapers.items import TWikiPageItem, PDFItem
from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-
-# ---------------------------------------------------------------------------
-# WebPageItem adapter
-# ---------------------------------------------------------------------------
-
-# class TestWebAdapter:
-# def _make_item(self, **overrides) -> WebPageItem:
-# base = {
-# "url": "https://example.com/page",
-# "content": "hello",
-# "suffix": "html",
-# "content_type": "text/html; charset=utf-8",
-# "encoding": "utf-8",
-# "title": "Example Page",
-# }
-# return WebPageItem({**base, **overrides})
-#
-# def test_returns_scraped_resource(self):
-# result = to_scraped_resource(self._make_item())
-# assert isinstance(result, ScrapedResource)
-#
-# def test_url_passthrough(self):
-# item = self._make_item(url="https://example.com/foo")
-# assert to_scraped_resource(item).url == "https://example.com/foo"
-#
-# def test_content_passthrough(self):
-# item = self._make_item(content="hi
")
-# assert to_scraped_resource(item).content == "hi
"
-#
-# def test_suffix(self):
-# assert to_scraped_resource(self._make_item(suffix="pdf")).suffix == "pdf"
-#
-# def test_source_type_is_web(self):
-# assert to_scraped_resource(self._make_item()).source_type == "web"
-#
-# def test_metadata_content_type(self):
-# item = self._make_item(content_type="text/html")
-# assert to_scraped_resource(item).metadata["content_type"] == "text/html"
-#
-# def test_metadata_encoding(self):
-# item = self._make_item(encoding="utf-8")
-# assert to_scraped_resource(item).metadata["encoding"] == "utf-8"
-#
-# def test_metadata_title(self):
-# item = self._make_item(title="My Title")
-# assert to_scraped_resource(item).metadata["title"] == "My Title"
-#
-# def test_optional_fields_absent(self):
-# """Adapter must not crash when optional fields are missing."""
-# item = WebPageItem({
-# "url": "https://example.com/page",
-# "content": "body",
-# "suffix": "html",
-# })
-# result = to_scraped_resource(item)
-# assert result.metadata.get("title") is None
-# assert result.metadata.get("encoding") is None
-#
-# def test_binary_content_passthrough(self):
-# """PDF bytes must pass through unchanged."""
-# raw = b"%PDF-1.4 binary content"
-# item = self._make_item(content=raw, suffix="pdf", content_type="application/pdf")
-# result = to_scraped_resource(item)
-# assert result.content == raw
-# assert result.is_binary
-
-
# ---------------------------------------------------------------------------
-# TWikiPageItem adapter
+# WebPageItem, TWikiPageItem adapter
# ---------------------------------------------------------------------------
-class TestTWikiPageItemAdapter:
+class TestWebPageItemAdapter:
def _make_item(self, **overrides) -> TWikiPageItem:
base = {
"url": "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
@@ -87,8 +20,29 @@ def _make_item(self, **overrides) -> TWikiPageItem:
def test_returns_scraped_resource(self):
assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
- def test_default_source_type_is_twiki(self):
- assert to_scraped_resource(self._make_item()).source_type == "twiki"
+ def test_default_source_type_is_web(self):
+ assert to_scraped_resource(self._make_item()).source_type == "web"
+
+# ---------------------------------------------------------------------------
+# PDFItem adapter
+# ---------------------------------------------------------------------------
+
+class TestPDFAdapter:
+ def _make_item(self, **overrides) -> PDFItem:
+ base = {
+ "url": "https://mit-teal.github.io/801/textbook/2ed_chapter01.pdf",
+ "content": b"%PDF-1.4\n%mock pdf content\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF\n",
+ "title": "mock pdf",
+ "suffix": "pdf",
+ "content_type": "application/pdf",
+ }
+ return PDFItem({**base, **overrides})
+
+ def test_returns_scraped_resource(self):
+ assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
+
+ def test_default_source_type_is_web(self):
+ assert to_scraped_resource(self._make_item()).source_type == "web"
# ---------------------------------------------------------------------------
# Unregistered item type — must fail loudly
From 1c666aa72cb2a75204053ba2f39ad5bef7e551ac Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 29 Mar 2026 14:26:50 +0200
Subject: [PATCH 09/55] generic items, adapters, pipelines which encourage OCP.
---
.../{resource_adapter.py => adapters.py} | 29 ++++--
src/data_manager/collectors/scrapers/items.py | 95 ++++++++++++++-----
.../collectors/scrapers/pipelines.py | 8 ++
3 files changed, 102 insertions(+), 30 deletions(-)
rename src/data_manager/collectors/scrapers/{resource_adapter.py => adapters.py} (71%)
create mode 100644 src/data_manager/collectors/scrapers/pipelines.py
diff --git a/src/data_manager/collectors/scrapers/resource_adapter.py b/src/data_manager/collectors/scrapers/adapters.py
similarity index 71%
rename from src/data_manager/collectors/scrapers/resource_adapter.py
rename to src/data_manager/collectors/scrapers/adapters.py
index c9d6e08ac..cb1c2e4c6 100644
--- a/src/data_manager/collectors/scrapers/resource_adapter.py
+++ b/src/data_manager/collectors/scrapers/adapters.py
@@ -28,7 +28,7 @@ def _html_page(item) -> ScrapedResource:
from functools import singledispatch
from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-from src.data_manager.collectors.scrapers.items import PDFItem, WebPageItem, TWikiPageItem
+from src.data_manager.collectors.scrapers.items import WebPageItem, IndicoPageItem
@singledispatch
@@ -39,14 +39,21 @@ def to_scraped_resource(item) -> ScrapedResource:
"Add @to_scraped_resource.register(YourItemClass) in this module."
)
+
@to_scraped_resource.register(WebPageItem)
-@to_scraped_resource.register(TWikiPageItem)
def _html_page(item) -> ScrapedResource:
+ """
+ Handles all HTML-family pages regardless of auth method.
+
+ PDFs scraped from the web also route here — the parser sets
+ suffix="pdf" and content=bytes in the item, so no branch needed.
+ The adapter passes suffix and source_type through without inspection.
+ """
return ScrapedResource(
url=item["url"],
content=item["content"],
suffix=item.get("suffix", "html"),
- source_type="web",
+ source_type=item["source_type"],
metadata={
"content_type": item.get("content_type"),
"encoding": item.get("encoding"),
@@ -54,16 +61,22 @@ def _html_page(item) -> ScrapedResource:
},
)
-@to_scraped_resource.register(PDFItem)
-def _pdf(item) -> ScrapedResource:
+
+@to_scraped_resource.register(IndicoPageItem)
+def _indico(item) -> ScrapedResource:
+ """
+ Indico items carry event_id and category as extra metadata.
+ These are the only fields that justify a separate dispatch branch.
+ """
return ScrapedResource(
url=item["url"],
content=item["content"],
- suffix=item.get("suffix", "pdf"),
- source_type="web",
+ suffix=item.get("suffix", "html"),
+ source_type=item["source_type"],
metadata={
"content_type": item.get("content_type"),
- "encoding": item.get("encoding"),
"title": item.get("title"),
+ "event_id": item.get("event_id"),
+ "category": item.get("category"),
},
)
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index 67d775de7..f7ea5e6e6 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -1,22 +1,73 @@
-from scrapy import Item, Field
-
-class ArchiBaseItem(Item):
- """Fields shared by every source type."""
- url = Field() # canonical URL of the page
- content = Field() # str (HTML/Markdown/text) or bytes (PDF)
- suffix = Field() # "html" | "pdf" | "md" | ...
- title = Field() # page title, may be empty
-
-class WebPageItem(ArchiBaseItem):
- """Item produced by the plain-Link spider."""
- content_type = Field() # value of Content-Type response header
- encoding = Field() # response encoding (e.g. "utf-8")
-
-class PDFItem(ArchiBaseItem):
- """Binary PDF scraped from a web URL."""
- content_type = Field()
-
-class TWikiPageItem(WebPageItem):
- """Item produced by the trivial Twiki spider."""
- body_length = Field()
- body_preview = Field()
+"""
+Scrapy intuition — Items as the data contract (FR-7a):
+
+ Items sit between Parser and Adapter.
+ Their field schema must be driven by what the Adapter needs
+ to construct a ScrapedResource — not by what's convenient
+ to inspect during development.
+
+ Wrong mental model: "what fields help me debug?"
+ Right mental model: "what fields does ScrapedResource.__init__ need?"
+
+ ScrapedResource fields (from scraped_resource.py):
+ url — required
+ content — required (str or bytes)
+ suffix — required
+ source_type — required ("web", "sso", "git")
+ metadata — dict, optional (title, content_type, encoding, etc.)
+ file_name — optional
+ relative_path — optional
+
+ So items carry exactly those fields.
+ Debug fields (body_preview, body_length) belong in logger calls,
+ not in the item schema — otherwise the adapter becomes a translation
+ layer for data that should never have been structured in the first place.
+
+SOLID note — Open/Closed:
+ Add new Item subclasses for new source types.
+ Do not add source-specific fields to the base class.
+ The adapter is the extension point, not the Item.
+"""
+
+import scrapy
+
+
+class BasePageItem(scrapy.Item):
+ """
+ Common fields shared across all scraped source types.
+ Maps directly to ScrapedResource constructor arguments.
+ """
+ url = scrapy.Field()
+ content = scrapy.Field() # Full text or bytes — NOT a preview
+ suffix = scrapy.Field() # "html", "pdf", "md" etc.
+    source_type = scrapy.Field()  # "web" | "sso" | "twiki" | "indico" | "discourse"
+
+ # Metadata fields — become ScrapedResource.metadata dict
+ title = scrapy.Field()
+ content_type = scrapy.Field() # HTTP Content-Type header value
+ encoding = scrapy.Field() # HTTP response encoding
+
+ # Optional — used by git/SSO scrapers for filesystem layout
+ file_name = scrapy.Field()
+ relative_path = scrapy.Field()
+
+
+class WebPageItem(BasePageItem):
+ """
+    Generic page item; works for SSO-protected and ordinary web pages.
+ No extra fields needed beyond BasePageItem.
+ Subclassing is the extension point (OCP) — Twiki quirks
+ belong in parse_twiki_page(), not in a bloated base class.
+ """
+ pass
+
+
+class IndicoPageItem(BasePageItem):
+ """
+ Indico-specific item.
+ Indico API responses carry an event_id and category — useful
+ for metadata routing in the adapter without polluting the base.
+ """
+ event_id = scrapy.Field()
+ category = scrapy.Field()
+
diff --git a/src/data_manager/collectors/scrapers/pipelines.py b/src/data_manager/collectors/scrapers/pipelines.py
new file mode 100644
index 000000000..1c1365794
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines.py
@@ -0,0 +1,8 @@
+from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
+
+class AdapterPipeline:
+ def process_item(self, item, spider):
+ resource = to_scraped_resource(item)
+ # Implicitly, set site for every pair of spider/resource.
+ resource.metadata["site"] = spider.name
+ return item
From b7fc00fc4becc7522f9e5dd27cf5396c07a91833 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Mon, 30 Mar 2026 21:19:06 +0200
Subject: [PATCH 10/55] generic LinkSpider for subclassing, clear Open/Closed
boundaries + SoC via parsers, introduce extension points parse_item,
parse_follow_links, default implementation, toscrape and twiki example.
---
.../collectors/scrapers/parsers/toscrape.py | 29 +++++
.../collectors/scrapers/parsers/twiki.py | 25 +++++
.../collectors/scrapers/spiders/link.py | 106 ++++++++----------
.../collectors/scrapers/spiders/toscrape.py | 28 +++++
.../collectors/scrapers/spiders/twiki.py | 76 +++----------
src/data_manager/collectors/scrapers/utils.py | 8 +-
6 files changed, 148 insertions(+), 124 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/parsers/toscrape.py
create mode 100644 src/data_manager/collectors/scrapers/parsers/twiki.py
create mode 100644 src/data_manager/collectors/scrapers/spiders/toscrape.py
diff --git a/src/data_manager/collectors/scrapers/parsers/toscrape.py b/src/data_manager/collectors/scrapers/parsers/toscrape.py
new file mode 100644
index 000000000..92a045933
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/parsers/toscrape.py
@@ -0,0 +1,29 @@
+from typing import Iterator
+from scrapy.http import Response, TextResponse
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.utils import get_content_type
+
+def parse_toscrape_page(response: Response) -> Iterator[WebPageItem]:
+ ct = get_content_type(response)
+
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield WebPageItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ title="",
+ content_type=ct,
+ )
+ return
+
+ title = response.css("title::text").get(default="").strip()
+ encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
+
+ yield WebPageItem(
+ url=response.url,
+ content=response.text,
+ suffix="html",
+ title=title,
+ content_type=ct,
+ encoding=encoding,
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
new file mode 100644
index 000000000..aa2da5201
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -0,0 +1,25 @@
+from typing import Iterator
+from scrapy.http import Response
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.utils import get_content_type
+
+
+def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
+ # Twiki-specific selectors
+ title = (
+ response.css("#topic-title::text").get()
+ or response.css(".patternTitle::text").get()
+ or response.css("title::text").get("").split("<")[0].strip()
+ )
+ # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
+ body_text = " ".join(
+ response.css("#twikiMainContents *::text, .patternMain *::text").getall()
+ ).strip()
+
+ yield WebPageItem(
+ url=response.url,
+ title=title,
+ body_length=len(body_text),
+ body_preview=body_text[:300],
+ content_type=get_content_type(response)
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 5ba48a989..76b8d5546 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -1,12 +1,10 @@
from typing import Iterator
from urllib.parse import urlparse
-
from scrapy import Request, Spider
-from scrapy.http import Response, TextResponse
-
-from src.data_manager.collectors.scrapers.items import PDFItem, WebPageItem
-from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
-
+from scrapy.http import Response
+from scrapy.linkextractors import LinkExtractor
+from scrapy.link import Link
+from src.data_manager.collectors.scrapers.items import WebPageItem
class LinkSpider(Spider):
"""
@@ -15,59 +13,63 @@ class LinkSpider(Spider):
"""
name = "link"
- custom_settings = {
- "DEPTH_LIMIT": 2, # safety cap; narrowed per-crawl via meta["depth"] check
- }
- def __init__(self, start_url: str = "", max_depth: int = 1, *args, **kwargs):
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ max_depth = int(kwargs.get("max_depth", 1))
+ max_page = int(kwargs.get("max_page", 0))
+ crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
+ if max_page:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_page, priority="spider")
+ return super().from_crawler(crawler, *args, **kwargs)
+
+ def __init__(self, start_urls: list[str] = None, max_depth: int = 1, max_page: int = 0, allow: list[str] = None, deny: list[str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
- self._start_url = start_url
- self._base_host = urlparse(start_url).netloc
+ if start_urls is None:
+ raise ValueError("LinkSpider requires start_urls list parameter")
+ self._start_urls = start_urls
+ self._base_host = urlparse(start_urls[0]).netloc
self._max_depth = int(max_depth)
+ self._max_page = int(max_page)
+ self._le = LinkExtractor(
+ allow=allow or [],
+ deny=deny or [],
+ allow_domains=[self._base_host],
+ deny_extensions=[".jpg", ".jpeg", ".png", ".gif",
+ ".bmp", ".svg", ".ico", ".webp"],
+ unique=True,
+ )
async def start(self):
"""
- Seed request — validates start_url at crawl time, not import time.
+ Seed requests — validates start_urls at crawl time, not import time.
Building the habit: always attach errback here, never rely on
start_urls shortcut in production spiders.
"""
- if not self._start_url:
- raise ValueError("links spider requires -a start_url=")
- yield Request(
- url=self._start_url,
- callback=self.parse,
- errback=self.errback,
- meta={"depth": 0},
- )
+ for url in self._start_urls:
+ yield Request(url=url, callback=self.parse, errback=self.errback, meta={"depth": 0})
- def parse(self, response: Response) -> Iterator[WebPageItem | PDFItem | Request]:
+ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
Extract one item per response, then yield follow Requests up to max_depth.
- @url https://quotes.toscrape.com/
- @returns items 1
- @scrapes url content suffix title
"""
- self.logger.info("Status %s for %s", response.status, response.url)
-
- yield from self._extract_item(response)
+ yield from self.parse_item(response) # Yield Items
+ yield from self.follow_links(response) # Yield Requests
+
+ def follow_links(self, response: Response) -> Iterator[Request]:
current_depth = response.meta.get("depth", 0)
if current_depth >= self._max_depth:
return
-
- shlinks = same_host_links(self._base_host, response)
- self.logger.info(
- "Found %d same-host links at depth %d", len(shlinks), current_depth
- )
-
- for url in shlinks:
+ for link in self.parse_follow_links(response):
+ self.logger.info("Following %s at depth %d", link.url, current_depth)
yield Request(
- url=url,
+ link.url,
callback=self.parse,
errback=self.errback,
meta={"depth": current_depth + 1},
)
-
+
def errback(self, failure):
self.logger.error(
"Request failed: %s — %s",
@@ -76,31 +78,11 @@ def errback(self, failure):
)
# ------------------------------------------------------------------ #
- # Private helpers — pure, unit-testable without a reactor
+ # Extension points — pure, unit-testable without a reactor
# ------------------------------------------------------------------ #
- def _extract_item(self, response: Response) -> Iterator[WebPageItem | PDFItem]:
- ct = get_content_type(response)
-
- if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
- yield PDFItem(
- url=response.url,
- content=response.body,
- suffix="pdf",
- title="",
- content_type=ct,
- )
- return
-
- title = response.css("title::text").get(default="").strip()
- encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
-
- yield WebPageItem(
- url=response.url,
- content=response.text,
- suffix="html",
- title=title,
- content_type=ct,
- encoding=encoding,
- )
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ raise NotImplementedError("parse_item must be implemented by the subclass")
+ def parse_follow_links(self, response: Response) -> Iterator[Link]:
+ yield from self._le.extract_links(response)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/toscrape.py b/src/data_manager/collectors/scrapers/spiders/toscrape.py
new file mode 100644
index 000000000..4b05fd1a8
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/toscrape.py
@@ -0,0 +1,28 @@
+from typing import Iterator
+from scrapy.http import Response
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
+from src.data_manager.collectors.scrapers.parsers.toscrape import parse_toscrape_page
+from scrapy.link import Link
+
+class ToscrapeSpider(LinkSpider):
+ """
+ Spider for scraping HTML pages from toscrape.com.
+ """
+
+ name = "toscrape"
+
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ """
+ @url https://quotes.toscrape.com/
+ @returns items 1
+ @scrapes url title
+ """
+ yield from parse_toscrape_page(response)
+
+ def parse_follow_links(self, response: Response) -> Iterator[Link]:
+ """
+ @url https://quotes.toscrape.com/
+ @returns requests 1
+ """
+ yield from super().parse_follow_links(response)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 969bd0476..0b97f7cf2 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -2,20 +2,22 @@
from typing import Iterator
from scrapy import Spider, Request
from scrapy.http import Response
-from urllib.parse import urlparse
-from src.data_manager.collectors.scrapers.items import TWikiPageItem
-from src.data_manager.collectors.scrapers.utils import get_content_type, same_host_links
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
+from src.data_manager.collectors.scrapers.parsers.twiki import parse_twiki_page
+from scrapy.link import Link
-logger = logging.getLogger(__name__)
-class TwikiSpider(Spider):
+class TwikiSpider(LinkSpider):
"""
Minimal Twiki spider against a real Twiki target.
Public page — no SSO needed — isolates lifecycle learning from auth complexity.
"""
name = "twiki"
-
+
+ _DEFAULT_START_URL = "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
+
custom_settings = {
"ROBOTSTXT_OBEY": False,
"DOWNLOAD_DELAY": 60,
@@ -23,61 +25,19 @@ class TwikiSpider(Spider):
"RETRY_TIMES": 0,
}
- async def start(self):
- """
- Seed request for the CRAB3 Twiki config page.
- Building the habit: always use start_requests() with errback attached,
- never rely on the start_urls shortcut in production spiders.
- """
- start_url = "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
- self._base_host = urlparse(start_url).netloc
- yield Request(
- url=start_url,
- callback=self.parse,
- errback=self.errback,
- )
-
- def parse(self, response: Response) -> Iterator[TWikiPageItem | Request]:
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
"""
Twiki pages render their main content inside #patternMain or .twikiMain.
-
@url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
@returns items 1 1
- @scrapes url title same_host_links_count
+ @scrapes url title
"""
- self.logger.info("Status %s for %s", response.status, response.url)
-
- yield from parse_twiki_page(response) # Yield item
- # then, follow links
- shlinks = same_host_links(self._base_host, response)
- logger.info("Found %d same-host links", len(shlinks))
-
- def errback(self, failure):
- self.logger.error(
- "Request failed: %s — %s",
- failure.request.url,
- repr(failure.value),
- )
+ yield from parse_twiki_page(response)
-def parse_twiki_page(response: Response) -> Iterator[TWikiPageItem]:
- # Twiki-specific selectors
- title = (
- response.css("#topic-title::text").get()
- or response.css(".patternTitle::text").get()
- or response.css("title::text").get("").split("<")[0].strip()
- )
- # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
- body_text = " ".join(
- response.css("#twikiMainContents *::text, .patternMain *::text").getall()
- ).strip()
-
-
- logger.info("Found title: %r", title)
-
- yield TWikiPageItem(
- url=response.url,
- title=title,
- body_length=len(body_text),
- body_preview=body_text[:300],
- content_type=get_content_type(response)
- )
+ def parse_follow_links(self, response: Response) -> Iterator[Link]:
+ """
+ Follow links to other Twiki pages.
+ @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
+ @returns requests 1
+ """
+ yield from super().parse_follow_links(response)
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
index 1e7f2e5c6..83eb6605d 100644
--- a/src/data_manager/collectors/scrapers/utils.py
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -2,20 +2,20 @@
from urllib.parse import urlparse
from scrapy.http import Response
+from src.data_manager.collectors.scrapers.types import Url
_IMAGE_EXTS = frozenset({
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".ico", ".webp"
})
-def same_host_links(base_host, response: Response) -> List[str]:
+def same_host_links(base_host: str, urls: list[Url]) -> list[Url]:
"""
- Return deduplicated same-host, non-image absolute URLs on this page.
+ Return deduplicated same-host, non-image absolute URLs preserving the original order.
"""
seen = set()
links = []
- for href in response.css("a::attr(href)").getall():
- url = response.urljoin(href)
+ for url in urls:
parsed = urlparse(url)
if parsed.netloc != base_host:
continue
From dfce70d13bcedd8d1984273ca0b0745fb8221481 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Mon, 30 Mar 2026 22:11:56 +0200
Subject: [PATCH 11/55] refactor LinkSpider; accept configurable crawler args
 accounting for real Twiki use-cases.
---
.../collectors/scrapers/spiders/link.py | 32 +++++++++++--------
.../collectors/scrapers/spiders/toscrape.py | 2 ++
.../collectors/scrapers/spiders/twiki.py | 2 +-
3 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 76b8d5546..38e7a3ecd 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -4,6 +4,7 @@
from scrapy.http import Response
from scrapy.linkextractors import LinkExtractor
from scrapy.link import Link
+from src.data_manager.collectors.scrapers.utils import _IMAGE_EXTS
from src.data_manager.collectors.scrapers.items import WebPageItem
class LinkSpider(Spider):
@@ -17,26 +18,29 @@ class LinkSpider(Spider):
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
max_depth = int(kwargs.get("max_depth", 1))
- max_page = int(kwargs.get("max_page", 0))
+ max_pages = int(kwargs.get("max_pages", 0))
crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
- if max_page:
- crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_page, priority="spider")
+ if max_pages:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ delay = kwargs.get("delay")
+ if delay is not None:
+ crawler.settings.set("DOWNLOAD_DELAY", int(delay), priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
- def __init__(self, start_urls: list[str] = None, max_depth: int = 1, max_page: int = 0, allow: list[str] = None, deny: list[str] = None, *args, **kwargs):
+ def __init__(self, start_urls: list[str] = None, max_depth: int = 1, max_pages: int = 0, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, *args, **kwargs):
super().__init__(*args, **kwargs)
- if start_urls is None:
- raise ValueError("LinkSpider requires start_urls list parameter")
- self._start_urls = start_urls
- self._base_host = urlparse(start_urls[0]).netloc
+ self._start_urls = start_urls or getattr(self, "_DEFAULT_START_URLS", [])
+ self._base_host = urlparse(start_urls[0]).netloc if start_urls else None
self._max_depth = int(max_depth)
- self._max_page = int(max_page)
+ # Stored for introspection only — enforcement is via Scrapy settings set in from_crawler.
+ self._max_pages = int(max_pages)
+ self._delay = int(delay) if delay is not None else None
self._le = LinkExtractor(
allow=allow or [],
deny=deny or [],
- allow_domains=[self._base_host],
- deny_extensions=[".jpg", ".jpeg", ".png", ".gif",
- ".bmp", ".svg", ".ico", ".webp"],
+ allow_domains=[self._base_host] if self._base_host else [],
+ deny_extensions=list(_IMAGE_EXTS),
+ canonicalize=canonicalize,
unique=True,
)
@@ -46,6 +50,8 @@ async def start(self):
Building the habit: always attach errback here, never rely on
start_urls shortcut in production spiders.
"""
+ if not self._start_urls:
+ raise ValueError("LinkSpider requires start_urls to be set")
for url in self._start_urls:
yield Request(url=url, callback=self.parse, errback=self.errback, meta={"depth": 0})
@@ -78,7 +84,7 @@ def errback(self, failure):
)
# ------------------------------------------------------------------ #
- # Extension points — pure, unit-testable without a reactor
+ # Extension points — pure, unit-testable/checkable without a reactor
# ------------------------------------------------------------------ #
def parse_item(self, response: Response) -> Iterator[WebPageItem]:
diff --git a/src/data_manager/collectors/scrapers/spiders/toscrape.py b/src/data_manager/collectors/scrapers/spiders/toscrape.py
index 4b05fd1a8..c615c034c 100644
--- a/src/data_manager/collectors/scrapers/spiders/toscrape.py
+++ b/src/data_manager/collectors/scrapers/spiders/toscrape.py
@@ -12,6 +12,8 @@ class ToscrapeSpider(LinkSpider):
name = "toscrape"
+ _DEFAULT_START_URLS = ["https://quotes.toscrape.com/"]
+
def parse_item(self, response: Response) -> Iterator[WebPageItem]:
"""
@url https://quotes.toscrape.com/
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 0b97f7cf2..a72d8d680 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -16,7 +16,7 @@ class TwikiSpider(LinkSpider):
name = "twiki"
- _DEFAULT_START_URL = "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
+ _DEFAULT_START_URLS = ["https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"]
custom_settings = {
"ROBOTSTXT_OBEY": False,
From 649b5c72775332e89af6b407079bb40f24466145 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Mon, 30 Mar 2026 22:25:24 +0200
Subject: [PATCH 12/55] cleaner: put the scrapy contracts under parse; scrapy
 check passes on both the toscrape and twiki examples.
---
.../collectors/scrapers/parsers/twiki.py | 6 ++---
.../collectors/scrapers/spiders/toscrape.py | 14 +++++-------
.../collectors/scrapers/spiders/twiki.py | 17 +++++---------
src/data_manager/collectors/scrapers/utils.py | 22 -------------------
4 files changed, 15 insertions(+), 44 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index aa2da5201..71f17030e 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -19,7 +19,7 @@ def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
yield WebPageItem(
url=response.url,
title=title,
- body_length=len(body_text),
- body_preview=body_text[:300],
- content_type=get_content_type(response)
+ content=body_text,
+ suffix="html",
+ content_type=get_content_type(response),
)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/toscrape.py b/src/data_manager/collectors/scrapers/spiders/toscrape.py
index c615c034c..67af9e815 100644
--- a/src/data_manager/collectors/scrapers/spiders/toscrape.py
+++ b/src/data_manager/collectors/scrapers/spiders/toscrape.py
@@ -1,4 +1,5 @@
from typing import Iterator
+from scrapy import Request
from scrapy.http import Response
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
@@ -14,17 +15,14 @@ class ToscrapeSpider(LinkSpider):
_DEFAULT_START_URLS = ["https://quotes.toscrape.com/"]
- def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
@url https://quotes.toscrape.com/
@returns items 1
+ @returns requests 1
@scrapes url title
"""
- yield from parse_toscrape_page(response)
+ yield from super().parse(response)
- def parse_follow_links(self, response: Response) -> Iterator[Link]:
- """
- @url https://quotes.toscrape.com/
- @returns requests 1
- """
- yield from super().parse_follow_links(response)
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ yield from parse_toscrape_page(response)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index a72d8d680..8c6151749 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -1,6 +1,5 @@
-import logging
from typing import Iterator
-from scrapy import Spider, Request
+from scrapy import Request
from scrapy.http import Response
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
@@ -25,19 +24,15 @@ class TwikiSpider(LinkSpider):
"RETRY_TIMES": 0,
}
- def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
Twiki pages render their main content inside #patternMain or .twikiMain.
@url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
@returns items 1 1
@scrapes url title
- """
- yield from parse_twiki_page(response)
-
- def parse_follow_links(self, response: Response) -> Iterator[Link]:
- """
- Follow links to other Twiki pages.
- @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
@returns requests 1
"""
- yield from super().parse_follow_links(response)
+ yield from super().parse(response)
+
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ yield from parse_twiki_page(response)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
index 83eb6605d..a290ae30d 100644
--- a/src/data_manager/collectors/scrapers/utils.py
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -1,31 +1,9 @@
-from typing import List
-from urllib.parse import urlparse
-
from scrapy.http import Response
-from src.data_manager.collectors.scrapers.types import Url
_IMAGE_EXTS = frozenset({
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".ico", ".webp"
})
-def same_host_links(base_host: str, urls: list[Url]) -> list[Url]:
- """
- Return deduplicated same-host, non-image absolute URLs preserving the original order.
- """
-
- seen = set()
- links = []
- for url in urls:
- parsed = urlparse(url)
- if parsed.netloc != base_host:
- continue
- if any(parsed.path.lower().endswith(e) for e in _IMAGE_EXTS):
- continue
- if url not in seen:
- seen.add(url)
- links.append(url)
- return links
-
def get_content_type(response: Response) -> str:
"""Decode the Content-Type header bytes to str."""
raw: bytes = response.headers.get("Content-Type", b"") or b""
From daba5c257fe6438c091920902b47978b181104b7 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Mon, 30 Mar 2026 23:02:34 +0200
Subject: [PATCH 13/55] refactor to proper crawler engine settings; safe
 default values for toscrape and twiki.
---
.../collectors/scrapers/spiders/link.py | 37 +++++++++----------
.../collectors/scrapers/spiders/twiki.py | 12 ++++--
2 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 38e7a3ecd..5c798775a 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -15,26 +15,29 @@ class LinkSpider(Spider):
name = "link"
+ custom_settings = {
+ "DEPTH_LIMIT": 1, # Default max depth
+ "DOWNLOAD_DELAY": 2, # Default (download) delay
+ "CLOSESPIDER_PAGECOUNT": 500 # Default max pages
+ }
+
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
- max_depth = int(kwargs.get("max_depth", 1))
- max_pages = int(kwargs.get("max_pages", 0))
- crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
- if max_pages:
- crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ max_depth = kwargs.get("max_depth")
+ max_pages = kwargs.get("max_pages")
delay = kwargs.get("delay")
- if delay is not None:
- crawler.settings.set("DOWNLOAD_DELAY", int(delay), priority="spider")
+ if max_depth:
+ crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
+ if max_pages:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ if delay:
+ crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
- def __init__(self, start_urls: list[str] = None, max_depth: int = 1, max_pages: int = 0, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, *args, **kwargs):
+ def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, *args, **kwargs):
super().__init__(*args, **kwargs)
self._start_urls = start_urls or getattr(self, "_DEFAULT_START_URLS", [])
self._base_host = urlparse(start_urls[0]).netloc if start_urls else None
- self._max_depth = int(max_depth)
- # Stored for introspection only — enforcement is via Scrapy settings set in from_crawler.
- self._max_pages = int(max_pages)
- self._delay = int(delay) if delay is not None else None
self._le = LinkExtractor(
allow=allow or [],
deny=deny or [],
@@ -65,16 +68,12 @@ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
def follow_links(self, response: Response) -> Iterator[Request]:
current_depth = response.meta.get("depth", 0)
- if current_depth >= self._max_depth:
+ if current_depth >= self.settings.get("DEPTH_LIMIT"):
+ self.logger.info("Reached max depth %d", self.settings.get("DEPTH_LIMIT"))
return
for link in self.parse_follow_links(response):
self.logger.info("Following %s at depth %d", link.url, current_depth)
- yield Request(
- link.url,
- callback=self.parse,
- errback=self.errback,
- meta={"depth": current_depth + 1},
- )
+ yield Request(link.url, callback=self.parse, errback=self.errback, meta={"depth": current_depth + 1})
def errback(self, failure):
self.logger.error(
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 8c6151749..74411d210 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -15,13 +15,17 @@ class TwikiSpider(LinkSpider):
name = "twiki"
- _DEFAULT_START_URLS = ["https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"]
-
+ _DEFAULT_START_URLS = [
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
+ ]
+
custom_settings = {
"ROBOTSTXT_OBEY": False,
- "DOWNLOAD_DELAY": 60,
"DOWNLOAD_TIMEOUT": 120,
- "RETRY_TIMES": 0,
+        "RETRY_TIMES": 0,  # very safe: no retries
+ "DEPTH_LIMIT": 1, # Default max depth
+ "DOWNLOAD_DELAY": 60, # Default (download) delay
+ "CLOSESPIDER_PAGECOUNT": 1 # Very Safe Default max pages
}
def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
From 2ab074ad7e697a80f7cdb1cdacea3080c94878f3 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Mon, 30 Mar 2026 23:41:55 +0200
Subject: [PATCH 14/55] cleanest way to normalize url via LinkExtractor's
 process_value; real-world twiki crawling contracts with safest default
values.
---
.../collectors/scrapers/spiders/link.py | 15 ++++++----
.../collectors/scrapers/spiders/twiki.py | 30 +++++++++++++++----
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 5c798775a..c93543a9f 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -1,7 +1,7 @@
-from typing import Iterator
+from typing import Iterator, Callable
from urllib.parse import urlparse
-from scrapy import Request, Spider
-from scrapy.http import Response
+from scrapy import Spider
+from scrapy.http import Response, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.link import Link
from src.data_manager.collectors.scrapers.utils import _IMAGE_EXTS
@@ -34,16 +34,19 @@ def from_crawler(cls, crawler, *args, **kwargs):
crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
- def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, *args, **kwargs):
+ def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, process_value: Callable[[str], str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._start_urls = start_urls or getattr(self, "_DEFAULT_START_URLS", [])
- self._base_host = urlparse(start_urls[0]).netloc if start_urls else None
+ self._base_host = urlparse(self._start_urls[0]).netloc if self._start_urls else None
+ default_deny = getattr(self, "_DEFAULT_DENY", [])
+ default_process_value = getattr(self, "_DEFAULT_PROCESS_VALUE", None)
self._le = LinkExtractor(
allow=allow or [],
- deny=deny or [],
+ deny=(deny or []) + default_deny,
allow_domains=[self._base_host] if self._base_host else [],
deny_extensions=list(_IMAGE_EXTS),
canonicalize=canonicalize,
+ process_value=process_value or default_process_value,
unique=True,
)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 74411d210..7d33f8833 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -1,10 +1,9 @@
from typing import Iterator
-from scrapy import Request
-from scrapy.http import Response
+from urllib.parse import urlparse
+from scrapy.http import Response, Request
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
from src.data_manager.collectors.scrapers.parsers.twiki import parse_twiki_page
-from scrapy.link import Link
class TwikiSpider(LinkSpider):
@@ -16,7 +15,19 @@ class TwikiSpider(LinkSpider):
name = "twiki"
_DEFAULT_START_URLS = [
- "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile"
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide"
+ ]
+
+ _DEFAULT_DENY = [
+ "/bin/edit",
+ "/bin/logon",
+ "/bin/oops",
+ "/bin/attach",
+ "/bin/search",
+ "/bin/rdiff",
+ "/bin/history",
+ "/bin/raw",
+ "/LeftBarLeftBar",
]
custom_settings = {
@@ -28,13 +39,20 @@ class TwikiSpider(LinkSpider):
"CLOSESPIDER_PAGECOUNT": 1 # Very Safe Default max pages
}
+ @staticmethod
+ def _normalize_url(url: str) -> str:
+ """Keep TWiki URLs clean: only scheme + netloc + path — drop query params and fragment."""
+ return urlparse(url)._replace(query="", fragment="").geturl() # type: ignore
+
+ _DEFAULT_PROCESS_VALUE = _normalize_url
+
def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
Twiki pages render their main content inside #patternMain or .twikiMain.
- @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile
+ @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
@returns items 1 1
@scrapes url title
- @returns requests 1
+ @returns requests 110 110
"""
yield from super().parse(response)
From b4e519265597ce26914a8efc86de7349b8dc156a Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 00:10:52 +0200
Subject: [PATCH 15/55] add more generic twiki default_deny patterns.
---
.../collectors/scrapers/spiders/twiki.py | 35 +++++++++++++------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 7d33f8833..d044b0e1f 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -19,15 +19,30 @@ class TwikiSpider(LinkSpider):
]
_DEFAULT_DENY = [
- "/bin/edit",
- "/bin/logon",
- "/bin/oops",
- "/bin/attach",
- "/bin/search",
- "/bin/rdiff",
- "/bin/history",
- "/bin/raw",
- "/LeftBarLeftBar",
+        # CGI endpoints — no content; effectively we allow only /bin/view/ or /bin/viewauth/
+ r"/bin/edit",
+ r"/bin/logon",
+ r"/bin/oops",
+ r"/bin/attach",
+ r"/bin/search",
+ r"/bin/rdiff",
+ r"/bin/history",
+ r"/bin/raw",
+ r"/bin/genpdf", # PDF generation — not content
+ r"/bin/view/Main", # user profile pages, not content
+ # Navigation/structural pages
+ r"LeftBarLeftBar",
+ r"/bin/view/[^/]+/WebLeftBar", # sidebar navigation template
+ r"/bin/view/[^/]+/WebTopBar", # top navigation bar
+ r"/bin/view/[^/]+/WebChanges", # recent changes — floods with links
+ r"/bin/view/[^/]+/WebIndex", # alphabetical index — floods with links
+ r"/bin/view/[^/]+/WebStatistics", # statistics pages
+ r"/bin/view/[^/]+/WebNotify", # notification subscriptions
+ r"/bin/view/[^/]+/WebPreferences", # wiki preferences
+ # Discard Topic List page, too many links in https://twiki.cern.ch/twiki/bin/view/CMSPublic/WebTopicList
+        r"/bin/view/[^/]+/WebTopicList",  # too many links; should have been added as seed URLs instead.
+ r"/bin/view/[^/]+/WebSearch", # search page — floods with links
+        r"/bin/view/[^/]+/WebChanges",  # NOTE(review): duplicate of the WebChanges pattern above — redundant
]
custom_settings = {
@@ -52,7 +67,7 @@ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
@url https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
@returns items 1 1
@scrapes url title
- @returns requests 110 110
+ @returns requests 1 100
"""
yield from super().parse(response)
From 742811acf27dbc71fb7a52065655532fef62dd47 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 03:33:09 +0200
Subject: [PATCH 16/55] refactored AuthProvider, Middlewares, with clear OCP,
SoC boundary, works in HeavyIon use-cases.
---
.../collectors/scrapers/auth/__init__.py | 0
.../collectors/scrapers/auth/base.py | 96 +++++
.../collectors/scrapers/auth/cern_sso.py | 272 +++++++++++++
.../collectors/scrapers/middlewares.py | 372 ++++++++++++++++++
.../collectors/scrapers/settings.py | 13 +-
.../collectors/scrapers/spiders/twiki.py | 10 +-
6 files changed, 759 insertions(+), 4 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/auth/__init__.py
create mode 100644 src/data_manager/collectors/scrapers/auth/base.py
create mode 100644 src/data_manager/collectors/scrapers/auth/cern_sso.py
create mode 100644 src/data_manager/collectors/scrapers/middlewares.py
diff --git a/src/data_manager/collectors/scrapers/auth/__init__.py b/src/data_manager/collectors/scrapers/auth/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/auth/base.py b/src/data_manager/collectors/scrapers/auth/base.py
new file mode 100644
index 000000000..f85830868
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/auth/base.py
@@ -0,0 +1,96 @@
+"""
+Base auth contract: Credentials value object + AuthProvider ABC.
+
+Scrapy SoC note
+---------------
+Providers are *credential factories only*. They know how to acquire,
+validate, and refresh credentials. They have zero knowledge of Scrapy
+Requests, Responses, spiders, or pipelines. The middleware decides *when*
+to call the provider; the provider decides *how* to produce valid credentials.
+
+Credential lifecycle (owned by AuthDownloaderMiddleware):
+ 1. acquire(url) — full login flow, called lazily on the first request
+ 2. inject — middleware stamps cookies/headers onto the Request
+ 3. is_valid() — middleware may pre-check before each request (optional)
+ 4. refresh(url) — called on 401/403 or detected login-redirect
+ 5. invalidate() — marks credentials stale; next request triggers refresh
+ 6. close() — release browser/driver resources on spider_closed signal
+"""
+from __future__ import annotations
+
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class Credentials:
+ """Immutable value object carrying whatever the downloader needs.
+
+ Either ``cookies`` (session-based SSO) or ``headers`` (bearer token) or both.
+ Never mutated after creation — callers call provider.refresh() to get a new one.
+
+ ``acquired_at`` and ``ttl_seconds`` are optional hints. If the provider
+ knows the session lifetime (e.g. from a Set-Cookie Max-Age), it sets them
+ so the middleware can pre-emptively refresh before a request fails rather
+ than waiting for a 401.
+
+ ``_valid`` is an internal flag; use invalidate() / is_valid() rather than
+ touching it directly.
+ """
+
+ cookies: List[Dict] = field(default_factory=list)
+ headers: Dict[str, str] = field(default_factory=dict)
+ acquired_at: float = field(default_factory=time.monotonic)
+ ttl_seconds: Optional[float] = None # None = unknown / infinite
+ _valid: bool = field(default=True, repr=False, init=False)
+
+ def is_empty(self) -> bool:
+ return not self.cookies and not self.headers
+
+ def is_valid(self) -> bool:
+ """Return False if explicitly invalidated or if TTL has elapsed."""
+ if not self._valid:
+ return False
+ if self.ttl_seconds is not None:
+ return (time.monotonic() - self.acquired_at) < self.ttl_seconds
+ return True
+
+ def invalidate(self) -> None:
+        """Mark these credentials as stale. Safe under Twisted's single-threaded reactor."""
+ self._valid = False
+
+
+class AuthProvider(ABC):
+ """Abstract base for all auth providers.
+
+ Instantiated once per crawl inside AuthDownloaderMiddleware.from_crawler()
+ so providers can be swapped for test fakes without touching any spider.
+
+ Concrete implementations must be importable via their dotted class path
+ registered in settings.SPIDER_AUTH_PROVIDERS.
+ """
+
+ @abstractmethod
+ def acquire(self, url: str) -> Optional[Credentials]:
+ """Full authentication flow. Returns Credentials or None on failure."""
+
+ def refresh(self, url: str) -> Optional[Credentials]:
+ """Re-authenticate. Default: delegates to acquire().
+
+ Override for providers that have a cheaper refresh path (e.g. a
+ /token/refresh endpoint that doesn't need a full browser login).
+ """
+ return self.acquire(url)
+
+ def is_session_expired(self, response) -> bool:
+ """Return True if response indicates session expiry.
+ Default checks only explicit HTTP auth codes via the middleware's
+ failure_codes list. Override for providers whose SSO signals
+ expiry via a 302→200 poison-pill (CERN) or a JSON error body (APIs).
+ """
+ return False
+
+ def close(self) -> None:
+ """Release resources (browser context, HTTP session, etc.)."""
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/auth/cern_sso.py b/src/data_manager/collectors/scrapers/auth/cern_sso.py
new file mode 100644
index 000000000..79a3e67e0
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/auth/cern_sso.py
@@ -0,0 +1,272 @@
+"""
+CERN SSO auth provider — Playwright implementation.
+
+Why Playwright over Selenium (legacy SSOScraper used Selenium)
+--------------------------------------------------------------
+The legacy SSOScraper mixed browser lifecycle, cookie collection, crawling, and
+link extraction into one class. Now that auth is a pure credential factory
+(Boundary B from the spec), the browser only needs to log in and hand back
+cookies — Playwright's sync API is less boilerplate for this narrow use case:
+
+ - No geckodriver binary management (Playwright installs its own browsers)
+ - BrowserContext.cookies() returns the exact dict format Scrapy expects
+ - context.clear_cookies() + re-login is cheaper than quitting/restarting
+ a WebDriver session — critical for mid-crawl refresh without stalling the
+ Twisted reactor for a long time
+ - storage_state() lets us persist/restore auth state across restarts if we
+ ever want that (Phase 2 enhancement)
+
+Design
+------
+One ``Browser`` instance lives for the lifetime of the crawl (created lazily).
+Each acquire/refresh operates on a fresh ``BrowserContext`` so sessions never
+bleed between attempts. The old context is closed before opening a new one.
+
+Invalidation
+------------
+The middleware calls ``credentials.invalidate()`` and then ``provider.refresh()``
+when it detects a 401, 403, or a login-page redirect. ``refresh()`` here does:
+
+ 1. close the existing BrowserContext (clearing all cookies server-side too)
+ 2. open a new BrowserContext
+ 3. navigate to the target URL (which triggers the SSO redirect)
+ 4. fill in credentials and submit
+ 5. return a fresh Credentials object
+
+The Browser process itself is NOT restarted on refresh — only the context,
+which is a lightweight operation (~200ms vs ~2s for a full browser restart).
+"""
+from __future__ import annotations
+
+import re
+from typing import Dict, List, Optional
+
+from playwright.sync_api import (
+ Browser,
+ BrowserContext,
+ Page,
+ Playwright,
+ sync_playwright,
+)
+
+from src.utils.env import read_secret
+from src.utils.logging import get_logger
+from .base import AuthProvider, Credentials
+
+logger = get_logger(__name__)
+
+# Keycloak login form element IDs (CERN SSO uses standard Keycloak)
+_USERNAME_SELECTOR = "#username"
+_PASSWORD_SELECTOR = "#password"
+_SUBMIT_SELECTOR = "#kc-login"
+_LOGIN_TIMEOUT_MS = 20_000 # ms — Playwright uses milliseconds
+
+# URL patterns that indicate we landed on a login page instead of content.
+# Used by the middleware to detect the SSO poison-pill (302 → /login → 200 OK).
+LOGIN_URL_PATTERNS: List[str] = [
+ r"auth\.cern\.ch",
+ r"/login",
+ r"/sso/",
+ r"keycloak",
+]
+_LOGIN_RE = re.compile("|".join(LOGIN_URL_PATTERNS), re.IGNORECASE)
+
+
+def looks_like_login_page(url: str) -> bool:
+ """Return True if *url* matches known CERN SSO login page patterns.
+
+ Exported so the middleware can call it from process_response without
+ importing the whole provider.
+ """
+ return bool(_LOGIN_RE.search(url))
+
+
+class CERNSSOProvider(AuthProvider):
+ """Acquires CERN SSO session cookies via a headless Playwright browser.
+
+ Args:
+ username: CERN SSO username. Falls back to SSO_USERNAME secret.
+ password: CERN SSO password. Falls back to SSO_PASSWORD secret.
+ headless: Run browser headlessly (default True).
+ browser_type: 'chromium' | 'firefox' | 'webkit' (default 'chromium').
+ Chromium is faster for headless cookie extraction.
+ slow_mo_ms: Playwright slow-motion delay in ms. 0 in production,
+ useful for debugging (e.g. 500).
+ """
+
+ def __init__(
+ self,
+ username: Optional[str] = None,
+ password: Optional[str] = None,
+ headless: bool = True,
+ browser_type: str = "chromium",
+ slow_mo_ms: int = 0,
+ ) -> None:
+ self.username: str = username or read_secret("SSO_USERNAME") or ""
+ self.password: str = password or read_secret("SSO_PASSWORD") or ""
+ self.headless = headless
+ self.browser_type = browser_type
+ self.slow_mo_ms = slow_mo_ms
+
+ if not self.username or not self.password:
+ raise ValueError(
+ "CERNSSOProvider requires SSO_USERNAME and SSO_PASSWORD. "
+ "Set them as secrets or pass them explicitly."
+ )
+
+ # Lazily initialised — browser starts only when acquire() is first called.
+ self._playwright: Optional[Playwright] = None
+ self._browser: Optional[Browser] = None
+ self._context: Optional[BrowserContext] = None
+
+ logger.info(
+ "CERNSSOProvider ready (browser=%s, headless=%s)",
+ browser_type,
+ headless,
+ )
+
+ # ------------------------------------------------------------------
+ # AuthProvider contract
+ # ------------------------------------------------------------------
+
+ def acquire(self, url: str) -> Optional[Credentials]:
+ """Full CERN SSO login flow. Returns cookies as Credentials."""
+ self._ensure_browser()
+ self._open_fresh_context()
+ return self._login_and_extract(url)
+
    def refresh(self, url: str) -> Optional[Credentials]:
        """Refresh by wiping the existing context and re-logging in.

        Reuses the running Browser process — only the BrowserContext is
        discarded, which is fast (~200 ms) and avoids stalling the Twisted
        reactor for a full browser restart.

        NOTE(review): assumes acquire() has run at least once —
        _open_fresh_context asserts that a browser is already launched.

        Args:
            url: Target page to re-authenticate against.

        Returns:
            Fresh Credentials, or None if the re-login failed.
        """
        logger.info("CERNSSOProvider: refreshing session for %s", url)
        self._close_context()  # drop the stale cookie jar
        self._open_fresh_context()  # blank slate
        return self._login_and_extract(url)
+
+ def is_session_expired(self, response) -> bool:
+ return looks_like_login_page(response.url)
+
+ def close(self) -> None:
+ """Quit the browser process. Called by middleware on spider_closed."""
+ self._close_context()
+ if self._browser:
+ try:
+ self._browser.close()
+ except Exception as exc:
+ logger.debug("CERNSSOProvider: browser.close() raised: %s", exc)
+ finally:
+ self._browser = None
+ if self._playwright:
+ try:
+ self._playwright.stop()
+ except Exception as exc:
+ logger.debug("CERNSSOProvider: playwright.stop() raised: %s", exc)
+ finally:
+ self._playwright = None
+
+ # ------------------------------------------------------------------
+ # Internal helpers
+ # ------------------------------------------------------------------
+
+ def _ensure_browser(self) -> None:
+ if self._playwright is None:
+ self._playwright = sync_playwright().start()
+ if self._browser is None:
+ launcher = getattr(self._playwright, self.browser_type)
+ self._browser = launcher.launch(
+ headless=self.headless,
+ slow_mo=self.slow_mo_ms,
+ )
+ logger.info(
+ "CERNSSOProvider: %s browser started (headless=%s)",
+ self.browser_type,
+ self.headless,
+ )
+
    def _open_fresh_context(self) -> None:
        """Close any existing context and open a blank new one.

        Requires a live browser — acquire() runs _ensure_browser() first;
        the assert guards against refresh() being called before any acquire().
        """
        self._close_context()
        assert self._browser is not None
        self._context = self._browser.new_context(
            # Tolerate self-signed / internal certificates during the SSO
            # redirect chain. NOTE(review): the previous comment here talked
            # about cookie domains, but this flag only disables TLS
            # certificate verification.
            ignore_https_errors=True,
        )
+
+ def _close_context(self) -> None:
+ if self._context:
+ try:
+ self._context.close()
+ except Exception as exc:
+ logger.debug("CERNSSOProvider: context.close() raised: %s", exc)
+ finally:
+ self._context = None
+
    def _login_and_extract(self, url: str) -> Optional[Credentials]:
        """Navigate to *url*, complete SSO login, return Credentials.

        Flow:
          1. Load *url*; if no SSO redirect happens, the page is public and
             the current (possibly empty) cookie jar is returned as-is.
          2. Otherwise fill and submit the Keycloak login form.
          3. Wait until the browser leaves the login URL, then revisit *url*
             so cookies for the target domain itself are also set.

        Returns:
            Credentials on success; None on any failure (form not found,
            timeout, navigation error) — the error is logged with traceback.
        """
        assert self._context is not None
        page: Page = self._context.new_page()
        try:
            page.goto(url, wait_until="networkidle", timeout=30_000)

            # Public page: loaded directly without SSO redirect — return whatever
            # cookies the browser has (may be empty, that's fine for public pages).
            if not looks_like_login_page(page.url):
                raw_cookies = self._context.cookies()
                logger.info("CERNSSOProvider: no SSO redirect for %s, returning browser cookies", url)
                return Credentials(cookies=raw_cookies)

            if not self._fill_login_form(page):
                return None

            # After submit, wait for navigation away from the login page.
            page.wait_for_url(
                lambda u: not looks_like_login_page(u),
                timeout=_LOGIN_TIMEOUT_MS,
            )

            # Navigate back to the original URL so all domain cookies are set.
            page.goto(url, wait_until="networkidle", timeout=30_000)

            raw_cookies: List[Dict] = self._context.cookies()
            logger.debug(
                "CERNSSOProvider: acquired %d cookies for %s",
                len(raw_cookies),
                url,
            )
            return Credentials(cookies=raw_cookies)

        except Exception as exc:
            logger.error(
                "CERNSSOProvider: login flow failed for %s: %s",
                url,
                exc,
                exc_info=True,
            )
            return None
        finally:
            # Always close the tab; the context (and its cookie jar) stays alive.
            try:
                page.close()
            except Exception:
                pass
+
+ def _fill_login_form(self, page: Page) -> bool:
+ """Fill in and submit the Keycloak login form.
+
+ Returns True if the submit was reached without timeout.
+ """
+ try:
+ page.wait_for_selector(_USERNAME_SELECTOR, timeout=_LOGIN_TIMEOUT_MS)
+ page.fill(_USERNAME_SELECTOR, self.username)
+ page.fill(_PASSWORD_SELECTOR, self.password)
+ page.click(_SUBMIT_SELECTOR)
+ logger.info("CERNSSOProvider: login form submitted")
+ return True
+ except Exception as exc:
+ logger.error(
+ "CERNSSOProvider: could not find/fill login form: %s", exc
+ )
+ return False
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/middlewares.py b/src/data_manager/collectors/scrapers/middlewares.py
new file mode 100644
index 000000000..db35989e0
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/middlewares.py
@@ -0,0 +1,372 @@
+"""
+AuthDownloaderMiddleware — the single place where auth intersects Scrapy's
+request/response lifecycle.
+
+Everything else (spiders, parsers, pipelines) is auth-blind.
+
+Middleware ordering (FR-3a — must be documented here per spec)
+--------------------------------------------------------------
+Request path (outbound):
+ 500 AuthDownloaderMiddleware ← injects cookies/tokens FIRST
+ 550 RetryMiddleware ← retries after credentials are attached
+ 600 RedirectMiddleware ← follows 302s last
+
+Response path (inbound — reversed order):
+ 600 RedirectMiddleware ← resolves 302, re-queues new URL
+ 550 RetryMiddleware ← handles transport errors
+ 500 AuthDownloaderMiddleware ← sees the FINAL response (200 / 401 / 403)
+ or catches the SSO poison-pill 200
+
+Why auth before retry?
+ If RetryMiddleware ran before auth on the *request* path, retried requests
+ would carry no credentials and immediately receive another 401. The retry
+ counter exhausts before auth can refresh. Placing auth at 500 ensures
+ every outbound request carries valid credentials before retry even fires.
+
+Why we do NOT handle 302 directly
+ Scrapy's RedirectMiddleware (600) follows 302s before our middleware sees
+ the response — we receive the final destination status. *However*, CERN
+ SSO signals session expiry with a silent 302 → /login → 200 OK chain.
+ The final 200 looks healthy but contains a login page. We detect this
+ in process_response via ``_is_login_redirect(response)``.
+
+Required settings (settings.py)
+--------------------------------
+ DOWNLOADER_MIDDLEWARES = {
+ "src.data_manager.collectors.scrapers.middlewares.AuthDownloaderMiddleware": 500,
+ "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
+ # RedirectMiddleware stays at its default 600
+ }
+
+ SPIDER_AUTH_PROVIDERS = {
+ "cern_sso": {
+ "class": "src.data_manager.collectors.scrapers.auth.cern_sso.CERNSSOProvider",
+ "kwargs": {"headless": True},
+ },
+ "indico": {
+ "class": "src.data_manager.collectors.scrapers.auth.indico_bearer.IndicoBearerAuthProvider",
+ "kwargs": {},
+ },
+ }
+
+ AUTH_FAILURE_CODES = [401, 403] # optional; this is the default
+
+Spider contract
+---------------
+A spider opts into auth by declaring:
+
+ auth_provider_name = "cern_sso" # matches a key in SPIDER_AUTH_PROVIDERS
+
+Spiders without this attribute are public and completely bypass this middleware.
+"""
+from __future__ import annotations
+
+import importlib
+from typing import Dict, Optional, TYPE_CHECKING
+
+from scrapy import signals
+from scrapy.exceptions import IgnoreRequest
+from scrapy.http import Request, Response
+from twisted.internet.threads import deferToThread
+
+from src.utils.logging import get_logger
+from src.data_manager.collectors.scrapers.auth.base import AuthProvider, Credentials
+
+if TYPE_CHECKING:
+ from scrapy import Spider
+ from scrapy.crawler import Crawler
+
+logger = get_logger(__name__)
+
+# Meta key that marks a request as a post-refresh retry.
+# Prevents infinite refresh loops: if a retried request also fails auth,
+# the middleware closes the spider instead of refreshing again.
+_AUTH_RETRY_META_KEY = "_auth_retry"
+
+
class AuthDownloaderMiddleware:
    """Injects auth credentials and handles mid-crawl session expiry.

    Auth-provider-agnostic: resolves which provider to use from
    ``spider.auth_provider_name`` + ``settings.SPIDER_AUTH_PROVIDERS``.
    """

    def __init__(
        self,
        auth_providers_config: Dict,
        auth_failure_codes: list,
    ) -> None:
        # Raw SPIDER_AUTH_PROVIDERS mapping: name -> {"class": ..., "kwargs": ...}.
        self._config = auth_providers_config
        # Set for O(1) membership tests in _detect_auth_failure.
        self._failure_codes = set(auth_failure_codes)
        # Keyed by provider name. Populated lazily on first use.
        self._providers: Dict[str, AuthProvider] = {}
        # Last credentials handed out per provider; absent/None means
        # "never acquired" or "acquisition failed".
        self._credentials: Dict[str, Optional[Credentials]] = {}

    # ------------------------------------------------------------------
    # Scrapy classmethod + signal wiring
    # ------------------------------------------------------------------

    @classmethod
    def from_crawler(cls, crawler: "Crawler") -> "AuthDownloaderMiddleware":
        """Build the middleware from crawler settings and hook spider_closed."""
        mw = cls(
            auth_providers_config=crawler.settings.getdict(
                "SPIDER_AUTH_PROVIDERS", {}
            ),
            auth_failure_codes=crawler.settings.getlist(
                "AUTH_FAILURE_CODES", [401, 403]
            ),
        )
        # Ensure provider resources (e.g. a Playwright browser) are released
        # when the spider finishes, whatever the close reason.
        crawler.signals.connect(mw._on_spider_closed, signal=signals.spider_closed)
        return mw

    def _on_spider_closed(self, spider: "Spider", reason: str) -> None:
        # Best-effort teardown of every provider ever instantiated;
        # one provider failing to close must not block the others.
        for name, provider in self._providers.items():
            try:
                provider.close()
                logger.debug("AuthMiddleware: closed provider %r", name)
            except Exception as exc:
                logger.warning(
                    "AuthMiddleware: error closing provider %r: %s", name, exc
                )

    # ------------------------------------------------------------------
    # process_request — inject credentials before the request is sent
    # ------------------------------------------------------------------

    def process_request(self, request: Request, spider: "Spider") -> None:
        """Inject credentials. No-op for spiders without auth_provider_name."""
        provider_name = getattr(spider, "auth_provider_name", None)
        if not provider_name:
            return

        # Cache hit — no thread needed, inject inline
        cached = self._credentials.get(provider_name)
        if cached is not None and cached.is_valid():
            _inject(request, cached)
            return None

        # Cold start or stale — acquire blocks (Playwright), run in thread pool
        # NOTE(review): this returns a twisted Deferred; Scrapy's middleware
        # manager chains deferred results, but the documented return types for
        # process_request are None/Response/Request — confirm against the
        # Scrapy version in use.
        return deferToThread(self._blocking_acquire_and_inject, request, provider_name, spider)


    def _blocking_acquire_and_inject(self, request: Request, provider_name: str, spider: "Spider") -> None:
        """Runs in a thread — safe for sync Playwright."""
        creds = self._get_valid_credentials(provider_name, request.url, spider)
        if creds is None:
            logger.error(
                "AuthMiddleware: could not acquire credentials for %r — "
                "closing spider.", provider_name
            )
            # Stop the whole crawl, then drop this request via IgnoreRequest
            # (propagated through the Deferred's errback).
            self._close_spider(spider, "auth_acquisition_failed")
            raise IgnoreRequest("Auth acquisition failed -- no credentials found")

        _inject(request, creds)

    # ------------------------------------------------------------------
    # process_response — detect auth failure, refresh once, then close
    # ------------------------------------------------------------------

    def process_response(
        self, request: Request, response: Response, spider: "Spider"
    ) -> Response | Request:
        """Detect 401/403 and SSO login-redirect poison pill."""
        provider_name = getattr(spider, "auth_provider_name", None)
        if not provider_name:
            return response

        provider = self._resolve_provider(provider_name)
        # If the provider cannot be resolved, treat the response as healthy —
        # there is nothing we could refresh anyway.
        failure_reason = self._detect_auth_failure(response, provider) if provider else None
        if failure_reason is None:
            return response  # healthy response — pass through

        if request.meta.get(_AUTH_RETRY_META_KEY):
            # Already refreshed once. A second failure means the session is
            # broken beyond repair; do not retry again.
            logger.error(
                "AuthMiddleware: auth failure persists after refresh "
                "(%s, url=%s). Closing spider.",
                failure_reason,
                request.url,
            )
            self._close_spider(spider, "auth_expired")
            return response

        logger.warning(
            "AuthMiddleware: %s detected for %s — refreshing credentials.",
            failure_reason,
            request.url,
        )

        # Invalidate the cached credentials so _get_valid_credentials knows
        # they're stale before the next process_request call.
        cached = self._credentials.get(provider_name)
        if cached:
            cached.invalidate()

        # refresh also runs Playwright — thread pool
        # NOTE(review): returns a Deferred firing a Request; see the note in
        # process_request about Deferred support in middleware hooks.
        return deferToThread(self._blocking_refresh_and_retry, request, provider_name, spider, failure_reason)

    def _blocking_refresh_and_retry(self, request: Request, provider_name: str, spider: "Spider", failure_reason: str) -> Request:
        """Runs in a thread — safe for sync Playwright."""
        fresh = self._do_refresh(provider_name, request.url, spider)
        if fresh is None:
            self._close_spider(spider, "auth_expired")
            raise IgnoreRequest("auth refresh failed")
        retry = request.copy()
        # Mark the copy so a second auth failure is terminal (see above).
        retry.meta[_AUTH_RETRY_META_KEY] = True
        # dont_filter: the dupe filter has already seen this URL once.
        retry = retry.replace(dont_filter=True)
        _inject(retry, fresh)
        return retry

    # ------------------------------------------------------------------
    # process_exception — log transport errors; let RetryMiddleware handle
    # ------------------------------------------------------------------

    def process_exception(
        self, request: Request, exception: Exception, spider: "Spider"
    ) -> None:
        provider_name = getattr(spider, "auth_provider_name", None)
        if provider_name:
            logger.warning(
                "AuthMiddleware: transport error [provider=%r] %s — %s",
                provider_name,
                request.url,
                exception,
            )
        # Return None → other middlewares (RetryMiddleware) handle it.

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _detect_auth_failure(self, response, provider: AuthProvider):
        """Return a failure label or None if the response looks healthy.

        Checks two failure modes:
        1. Explicit HTTP auth codes (401, 403).
        2. SSO poison-pill: a 200 OK whose final URL is a login page.
           CERN SSO sometimes redirects expired sessions to /login and returns
           a 200 with the login form HTML. This is invisible to RetryMiddleware
           because the status code is 200 — only URL inspection reveals the trap.
        """
        if response.status in self._failure_codes:
            return f"HTTP {response.status}"
        if provider.is_session_expired(response):
            return "session-expired (provider-detected)"
        return None


    def _get_valid_credentials(
        self, provider_name: str, url: str, spider: "Spider"
    ) -> Optional[Credentials]:
        """Return cached credentials if still valid, or acquire fresh ones."""
        cached = self._credentials.get(provider_name)
        if cached is not None and cached.is_valid():
            return cached

        # Cache miss or explicitly invalidated / TTL expired — acquire fresh.
        logger.info(
            "AuthMiddleware: acquiring credentials via %r for %s",
            provider_name, url,
        )
        provider = self._resolve_provider(provider_name)
        if provider is None:
            return None

        # A failed acquire stores None, so the next request tries again.
        fresh = provider.acquire(url)
        self._credentials[provider_name] = fresh
        return fresh

    def _do_refresh(
        self, provider_name: str, url: str, spider: "Spider"
    ) -> Optional[Credentials]:
        """Delegate refresh to the provider and update the cache."""
        provider = self._resolve_provider(provider_name)
        if provider is None:
            return None
        fresh = provider.refresh(url)
        self._credentials[provider_name] = fresh
        return fresh

    def _resolve_provider(self, name: str) -> Optional[AuthProvider]:
        """Return a cached provider, instantiating it on first call."""
        if name in self._providers:
            return self._providers[name]

        entry = self._config.get(name)
        if not entry:
            logger.error(
                "AuthMiddleware: no SPIDER_AUTH_PROVIDERS entry for %r. "
                "Check settings.py.", name
            )
            return None

        # Dotted-path instantiation, mirroring Scrapy's load_object pattern.
        class_path: str = entry["class"]
        kwargs: dict = entry.get("kwargs", {})
        try:
            module_path, class_name = class_path.rsplit(".", 1)
            module = importlib.import_module(module_path)
            provider_cls = getattr(module, class_name)
            provider: AuthProvider = provider_cls(**kwargs)
        except Exception as exc:
            logger.error(
                "AuthMiddleware: could not instantiate %r: %s",
                class_path, exc, exc_info=True,
            )
            return None

        self._providers[name] = provider
        return provider

    @staticmethod
    def _close_spider(spider: "Spider", reason: str) -> None:
        # Engine access via spider.crawler; guarded because this may run
        # from a worker thread while the engine is already shutting down.
        logger.error("AuthMiddleware: closing spider (reason=%r)", reason)
        try:
            spider.crawler.engine.close_spider(spider, reason)
        except Exception as exc:
            logger.error("AuthMiddleware: engine.close_spider failed: %s", exc)
+
+
+# ---------------------------------------------------------------------------
+# Standalone helper — lives outside the class so it's testable without
+# constructing the full middleware.
+# ---------------------------------------------------------------------------
+
+def _inject(request: Request, credentials: Credentials) -> None:
+ """Stamp cookies and/or auth headers onto a Scrapy Request in-place.
+
+ Scrapy's Request.cookies accepts a list[dict] (same format Playwright's
+ context.cookies() returns) or a plain dict. We always normalise to
+ list[dict] and merge rather than replace, so existing cookies (e.g. from
+ a previous inject or a spider-level cookies= argument) are preserved.
+
+ Headers are set directly on request.headers which is mutable.
+
+ Note: Request.cookies is read-only after construction; we use
+ request.replace(cookies=...) to produce a new Request object, then
+ update the reference via the caller. But since Scrapy passes Request
+ objects by reference and the middleware hooks return None (pass-through)
+ or a new Request, we instead mutate headers (mutable) and re-build the
+ cookie jar using the internal _cookies attribute that Scrapy exposes.
+ This is the idiomatic approach used by Scrapy's own cookie middleware.
+ """
+ if credentials.cookies:
+ cookie_header = "; ".join(
+ f"{c['name']}={c['value']}" for c in credentials.cookies
+ )
+ request.headers["Cookie"] = cookie_header
+ request.meta["dont_merge_cookies"] = True
+ # # Merge new cookies over existing ones (last write wins per name).
+ # existing: list = list(request.cookies) if isinstance(request.cookies, list) else [
+ # {"name": k, "value": v} for k, v in (request.cookies or {}).items()
+ # ]
+ # merged: Dict[str, dict] = {c["name"]: c for c in existing}
+ # for cookie in credentials.cookies:
+ # merged[cookie["name"]] = cookie
+ # Replace is safe here — process_request returns None so Scrapy uses
+ # the same object; we mutate via internal attribute.
+ # request._cookies = list(merged.values())
+
+ if credentials.headers:
+ for key, value in credentials.headers.items():
+ request.headers[key] = value
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index 39b964b2d..546b09f53 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -79,7 +79,18 @@
# ---------------------------------------------------------------------------
# Middlewares, Pipelines and Extensions Priorities
# ---------------------------------------------------------------------------
-DOWNLOADER_MIDDLEWARES = { }
# Auth runs at a LOWER number than RetryMiddleware so it touches outbound
# requests FIRST (retried requests therefore already carry credentials) and
# sees inbound responses LAST, after RedirectMiddleware (default 600) has
# resolved any 302 chain.
DOWNLOADER_MIDDLEWARES = {
    "src.data_manager.collectors.scrapers.middlewares.AuthDownloaderMiddleware": 500,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    # RedirectMiddleware stays at its default 600 — no entry needed
}

# Provider registry consumed by AuthDownloaderMiddleware: a spider opts in
# by setting auth_provider_name to one of these keys; "class" is a dotted
# path instantiated lazily with "kwargs".
SPIDER_AUTH_PROVIDERS = {
    "cern_sso": {
        "class": "src.data_manager.collectors.scrapers.auth.cern_sso.CERNSSOProvider",
        "kwargs": {"headless": True},
    },
}
ITEM_PIPELINES = { }
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index d044b0e1f..7c8b569d8 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -9,13 +9,16 @@
class TwikiSpider(LinkSpider):
"""
Minimal Twiki spider against a real Twiki target.
- Public page — no SSO needed — isolates lifecycle learning from auth complexity.
+ Support CERN SSO authentication.
"""
name = "twiki"
+ auth_provider_name = "cern_sso"
+
_DEFAULT_START_URLS = [
- "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide"
+ "https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons", # private page
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide", # public page
]
_DEFAULT_DENY = [
@@ -51,7 +54,8 @@ class TwikiSpider(LinkSpider):
"RETRY_TIMES": 0, # Very Safe no retries
"DEPTH_LIMIT": 1, # Default max depth
"DOWNLOAD_DELAY": 60, # Default (download) delay
- "CLOSESPIDER_PAGECOUNT": 1 # Very Safe Default max pages
+ "CLOSESPIDER_PAGECOUNT": 1, # Very Safe Default max pages
+ "COOKIES_ENABLED": False, # disable CookiesMiddleware jar
}
@staticmethod
From 7fa9073fbb3ff6b08f1accd3587d7821812612a8 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 05:05:55 +0200
Subject: [PATCH 17/55] decouple git collection from scrapers; add GitResource
and GitManager.
---
src/data_manager/collectors/git_manager.py | 307 ++++++++++++++++++
src/data_manager/collectors/git_resource.py | 61 ++++
src/data_manager/collectors/scrapers/items.py | 2 +-
3 files changed, 369 insertions(+), 1 deletion(-)
create mode 100644 src/data_manager/collectors/git_manager.py
create mode 100644 src/data_manager/collectors/git_resource.py
diff --git a/src/data_manager/collectors/git_manager.py b/src/data_manager/collectors/git_manager.py
new file mode 100644
index 000000000..ab8a1603b
--- /dev/null
+++ b/src/data_manager/collectors/git_manager.py
@@ -0,0 +1,307 @@
+# src/data_manager/collectors/git_manager.py
+from __future__ import annotations
+
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+
+from git import Repo
+
+from src.data_manager.collectors.git_resource import GitResource
+from src.data_manager.collectors.persistence import PersistenceService
+from src.utils.config_access import get_global_config
+from src.utils.env import read_secret
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+_DEFAULT_CODE_SUFFIXES = {
+ ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
+ ".c", ".cpp", ".h", ".hpp", ".sh", ".sql",
+ ".json", ".yaml", ".yml", ".toml", ".md", ".txt",
+}
+_DEFAULT_EXCLUDE_DIRS = {
+ ".git", "node_modules", ".venv", "venv", "__pycache__",
+ ".idea", ".vscode", "dist", "build",
+}
+
+
class GitManager:
    """
    Collects git repositories (MkDocs docs + code files) into the shared data path.

    Interface mirrors LocalFileManager — instantiate with dm_config, then call
    collect_all_from_config(persistence) or collect(urls, persistence) directly.
    """

    def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
        """Read the ``sources.git`` section of *dm_config* and prepare dirs.

        Args:
            dm_config: data-manager config dict; only ``sources.git`` is read.
        """
        global_config = get_global_config()
        self.data_path = Path(global_config["DATA_PATH"])

        sources_config = (dm_config or {}).get("sources", {}) or {}
        self.config: Dict[str, Any] = (
            dict(sources_config.get("git", {}))
            if isinstance(sources_config, dict)
            else {}
        )

        self.enabled = self.config.get("enabled", True)
        # Scratch area for clones; each repo is deleted again after harvesting.
        self.git_dir = self.data_path / "raw_git_repos"
        self.git_dir.mkdir(parents=True, exist_ok=True)

        self.code_suffixes = {
            s.lower()
            for s in self.config.get("code_suffixes", _DEFAULT_CODE_SUFFIXES)
        }
        self.exclude_dirs = set(
            self.config.get("exclude_dirs", _DEFAULT_EXCLUDE_DIRS)
        )
        self.max_file_size_bytes = int(
            self.config.get("max_file_size_bytes", 1_000_000)
        )

        self.git_username = read_secret("GIT_USERNAME")
        self.git_token = read_secret("GIT_TOKEN")
        self._credentials_available = bool(self.git_username and self.git_token)
        if not self._credentials_available:
            logger.info("No git credentials supplied; will attempt public repo cloning.")

    # ── Public interface (mirrors LocalFileManager) ───────────────────────────

    def collect_all_from_config(self, persistence: PersistenceService) -> None:
        """Collect every URL configured under ``sources.git.urls``."""
        if not self.enabled:
            logger.info("Git source disabled; skipping")
            return
        urls: List[str] = self.config.get("urls", [])
        if not urls:
            logger.info("No git URLs configured; skipping")
            return
        self.collect(urls, persistence)

    def collect(self, git_urls: List[str], persistence: PersistenceService) -> None:
        """Collect a list of git URLs and persist each harvested file.

        Per-URL failures (bad URL, clone error) are logged and skipped so one
        broken repo cannot abort the whole run; the clone directory is always
        removed afterwards, even if harvesting raised.
        """
        if not git_urls:
            logger.warning("No git URLs provided; skipping")
            return

        for url in git_urls:
            try:
                repo_info = self._prepare_repository(url)
            except ValueError as exc:
                # Unparseable URL — deliberate config error, info level.
                logger.info("%s", exc)
                continue
            except Exception as exc:
                logger.error("Failed to clone %s: %s", url, exc)
                continue

            try:
                target_dir = self.data_path / "git" / repo_info["repo_name"]
                for resource in self._harvest_repository(repo_info):
                    self._persist_one(resource, persistence, target_dir)
            finally:
                shutil.rmtree(repo_info["repo_path"], ignore_errors=True)

        logger.info("Git collection complete")

    # ── Internal harvest ──────────────────────────────────────────────────────

    def _harvest_repository(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
        """Yield all resources: MkDocs markdown first, then code files."""
        yield from self._harvest_mkdocs(repo_info)
        yield from self._harvest_code(repo_info)

    def _harvest_mkdocs(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
        """Yield one GitResource per non-empty ``docs/**/*.md`` file."""
        repo_path: Path = repo_info["repo_path"]
        docs_dir = repo_path / "docs"
        if not docs_dir.exists():
            logger.info("Skipping MkDocs harvest for %s; no docs/ dir", repo_path)
            return

        ref: str = repo_info["ref"]
        repo_name: str = repo_info["repo_name"]
        repo_url: str = repo_info["repo_url"]

        # NOTE(review): a per-page web URL (mkdocs site_url or blob URL) used
        # to be computed here but was never attached to the yielded resource;
        # GitResource carries repo_url + file_path instead. The dead
        # computation has been removed — re-add it if GitResource ever grows
        # a page-URL field.
        for md_path in docs_dir.rglob("*.md"):
            text = md_path.read_text(encoding="utf-8", errors="ignore")
            if not text.strip():
                logger.info("Skipping empty doc: %s", md_path)
                continue

            yield GitResource(
                repo_url=repo_url,
                file_path=str(Path(repo_name) / md_path.relative_to(repo_path)),
                content=text,
                source_type="git",
                branch=repo_info.get("branch", ""),
                ref=ref,
                # "getting_started.md" -> "Getting Started"
                title=md_path.stem.replace("_", " ").replace("-", " ").title(),
            )

    def _harvest_code(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
        """Yield one GitResource per code/text file passing all filters."""
        repo_path: Path = repo_info["repo_path"]
        ref: str = repo_info["ref"]
        repo_name: str = repo_info["repo_name"]
        repo_url: str = repo_info["repo_url"]

        for file_path in self._iter_code_files(repo_path):
            rel = file_path.relative_to(repo_path)

            # avoid overlap with _harvest_mkdocs
            if rel.parts and rel.parts[0] == "docs" and file_path.suffix.lower() == ".md":
                continue

            if not self._is_allowed_suffix(file_path):
                continue

            try:
                if file_path.stat().st_size > self.max_file_size_bytes:
                    logger.warning("Skipping %s — exceeds size limit", file_path)
                    continue
            except OSError:
                # File vanished or is unreadable — skip silently.
                continue

            if self._looks_binary(file_path):
                continue

            try:
                text = file_path.read_text(encoding="utf-8", errors="ignore")
            except Exception:
                continue

            if not text.strip():
                continue

            yield GitResource(
                repo_url=repo_url,
                file_path=str(Path(repo_name) / rel),
                content=text,
                source_type="git",
                branch=repo_info.get("branch", ""),
                ref=ref,
                title=None,
            )

    # ── Repository preparation ────────────────────────────────────────────────

    def _prepare_repository(self, url: str) -> Dict[str, Any]:
        """Clone *url* and return everything the harvesters need about it."""
        url_dict = self._parse_url(url)
        repo_path = self._clone_repo(url_dict)
        return {
            "repo_path": repo_path,
            "repo_name": url_dict["repo_name"],
            "repo_url": url_dict["original_url"],
            "branch": url_dict["branch"] or "",
            "mkdocs_site_url": self._read_mkdocs_site_url(repo_path),
            "ref": self._determine_ref(repo_path, url_dict["branch"]),
            "web_base_url": self._compute_web_base_url(url_dict["original_url"]),
        }

    def _parse_url(self, url: str) -> Dict[str, Any]:
        """Split *url* into clone URL, repo name and optional branch.

        Handles GitHub (``/tree/<branch>``) and GitLab (``/-/tree/<branch>``)
        browse URLs, and embeds credentials into the clone URL when available.

        Raises:
            ValueError: if the URL is not a recognisable github/gitlab URL.
        """
        match = re.search(
            r"(?:github|gitlab)\.[\w.]+\/[^\/]+\/([\w.-]+)(?:\.git|\/|$)",
            url, re.IGNORECASE,
        )
        if not match:
            raise ValueError(f"Git URL does not match expected format: {url}")
        # The greedy group can swallow a trailing ".git" (the `$` alternative
        # of the tail then matches) — strip it so directory names, target
        # paths and display names stay clean.
        repo_name = match.group(1).removesuffix(".git")

        if self._credentials_available:
            # Replaces the FIRST occurrence of the host keyword, which is the
            # scheme-adjacent host in well-formed URLs.
            if "gitlab" in url:
                clone_url = url.replace("gitlab", f"{self.git_username}:{self.git_token}@gitlab")
            elif "github" in url:
                clone_url = url.replace("github", f"{self.git_username}:{self.git_token}@github")
            else:
                clone_url = url
        else:
            clone_url = url

        branch = None
        parts = re.split(r"/(?:-/)?tree/", clone_url, maxsplit=1)
        if len(parts) > 1:
            branch = parts[1].strip("/") or None
            clone_url = parts[0].rstrip("/")

        return {"original_url": url, "clone_url": clone_url, "repo_name": repo_name, "branch": branch}

    def _clone_repo(self, url_dict: Dict[str, Any]) -> Path:
        """Clone into the scratch dir and return the local repo path."""
        repo_path = self.git_dir / url_dict["repo_name"]
        # A leftover clone from a crashed earlier run would make clone_from
        # fail on the non-empty directory — clear it first.
        shutil.rmtree(repo_path, ignore_errors=True)
        logger.info("Cloning %s …", url_dict["repo_name"])
        kwargs = {}
        if url_dict["branch"]:
            kwargs["branch"] = url_dict["branch"]
        Repo.clone_from(url_dict["clone_url"], repo_path, **kwargs)
        return repo_path

    def _read_mkdocs_site_url(self, repo_path: Path) -> Optional[str]:
        """Return the mkdocs ``site_url`` (with trailing slash) or None."""
        mkdocs_file = repo_path / "mkdocs.yml"
        if not mkdocs_file.exists():
            return None
        try:
            # Local import: mkdocs is optional and only needed for this probe.
            from mkdocs.utils.yaml import yaml_load
            with mkdocs_file.open() as f:
                data = yaml_load(f)
            site_url = data.get("site_url")
            if not site_url:
                return None
            return site_url if site_url.endswith("/") else site_url + "/"
        except Exception:
            # Missing mkdocs package or malformed YAML — treat as "no site".
            return None

    def _compute_web_base_url(self, url: str) -> str:
        """Strip credentials, ``/tree/...`` suffix and ``.git`` from *url*."""
        sanitized = re.sub(r"//[^@/]+@", "//", url)
        sanitized = re.split(r"/(?:-/)?tree/", sanitized, maxsplit=1)[0]
        return sanitized.rstrip("/").removesuffix(".git")

    def _determine_ref(self, repo_path: Path, branch: Optional[str]) -> str:
        """Best-effort ref: explicit branch > active branch > short SHA > 'main'."""
        if branch:
            return branch
        try:
            return Repo(repo_path).active_branch.name
        except Exception:
            # Detached HEAD has no active branch — fall back to the commit.
            try:
                return Repo(repo_path).head.commit.hexsha[:7]
            except Exception:
                return "main"

    def _build_blob_url(self, base_url: str, ref: str, rel: Path) -> str:
        """Web URL for *rel* at *ref* (GitLab uses ``/-/blob/``, GitHub ``/blob/``)."""
        base = base_url.rstrip("/")
        if "gitlab" in base:
            return f"{base}/-/blob/{ref}/{rel.as_posix()}"
        return f"{base}/blob/{ref}/{rel.as_posix()}"

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _iter_code_files(self, repo_path: Path) -> Iterator[Path]:
        """Walk the repo, pruning excluded directories in-place."""
        for root, dirs, files in os.walk(repo_path):
            dirs[:] = [d for d in dirs if d not in self.exclude_dirs]
            for name in files:
                yield Path(root) / name

    def _is_allowed_suffix(self, path: Path) -> bool:
        """True when the file extension is in the configured allow-list."""
        return path.suffix.lower() in self.code_suffixes

    def _looks_binary(self, path: Path) -> bool:
        """Heuristic: NUL byte in the first 8 KB ⇒ binary; unreadable ⇒ binary.

        Uses a context manager so the file handle is always closed (the
        previous version leaked the handle to the garbage collector).
        """
        try:
            with path.open("rb") as fh:
                return b"\0" in fh.read(8000)
        except Exception:
            return True

    def _persist_one(
        self,
        resource: GitResource,
        persistence: PersistenceService,
        target_dir: Path,
    ) -> None:
        """Persist one resource; log-and-continue on failure."""
        try:
            persistence.persist_resource(resource, target_dir)
        except Exception as exc:
            logger.warning("Failed to persist %s: %s", resource.file_path, exc)
\ No newline at end of file
diff --git a/src/data_manager/collectors/git_resource.py b/src/data_manager/collectors/git_resource.py
new file mode 100644
index 000000000..678f0982c
--- /dev/null
+++ b/src/data_manager/collectors/git_resource.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Union
+
+from src.data_manager.collectors.resource_base import BaseResource
+from src.data_manager.collectors.utils.metadata import ResourceMetadata
+
+
@dataclass
class GitResource(BaseResource):
    """Representation of a single file harvested from a git repository."""

    repo_url: str  # canonical remote URL, credentials stripped
    file_path: str  # path within repo, e.g. "docs/guide.md"
    content: Union[str, bytes]
    source_type: str = "git"
    branch: str = ""
    ref: str = ""  # commit SHA or tag; used in blob URLs
    title: Optional[str] = None

    def get_hash(self) -> str:
        """
        Stable hash on (repo_url, file_path) so re-harvests overwrite in-place.

        Intentionally excludes ref/branch: the same file at a new commit
        is still the same resource — it should update the catalog entry,
        not create an orphan.
        """
        identity = f"{self.repo_url}::{self.file_path}".encode("utf-8", errors="ignore")
        return hashlib.md5(identity).hexdigest()[:12]

    def get_filename(self) -> str:
        """Bare file name, without any directory components."""
        return Path(self.file_path).name

    def get_file_path(self, target_dir: Path) -> Path:
        """Preserve the repo directory tree under target_dir."""
        return target_dir / self.file_path

    def get_content(self) -> Union[str, bytes]:
        """Raw harvested file content, text or bytes as captured."""
        return self.content

    def get_metadata(self) -> ResourceMetadata:
        """Catalog metadata; branch/ref/title only appear when non-empty."""
        extra: dict[str, str] = {
            "source_type": self.source_type,
            "repo_url": self.repo_url,
            "file_path": self.file_path,
            "suffix": Path(self.file_path).suffix.lstrip(".") or "",
            "display_name": self.file_path,
        }
        optional_fields = {"branch": self.branch, "ref": self.ref, "title": self.title}
        extra.update({key: val for key, val in optional_fields.items() if val})

        return ResourceMetadata(file_name=self.get_filename(), extra=extra)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index f7ea5e6e6..c084f9ebf 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -40,7 +40,7 @@ class BasePageItem(scrapy.Item):
url = scrapy.Field()
content = scrapy.Field() # Full text or bytes — NOT a preview
suffix = scrapy.Field() # "html", "pdf", "md" etc.
- source_type = scrapy.Field() # "web" | "sso" | "twiki" | 'indico" | "discourse"
+ source_type = scrapy.Field() # "web" | "twiki" | "indico" | "discourse"
# Metadata fields — become ScrapedResource.metadata dict
title = scrapy.Field()
From 6128af4f448c89a4f91850c921a1d056f93de31f Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 05:33:52 +0200
Subject: [PATCH 18/55] add scraper_manager at collectors level, e2e wired from
legacy interface & instantiations to GitManager and Scrapy's ScraperManager.
---
src/bin/service_data_manager.py | 2 +-
src/data_manager/collectors/git_manager.py | 13 +-
.../collectors/scraper_manager.py | 219 ++++++++++++++++++
src/data_manager/data_manager.py | 7 +-
src/interfaces/uploader_app/app.py | 7 +-
5 files changed, 242 insertions(+), 6 deletions(-)
create mode 100644 src/data_manager/collectors/scraper_manager.py
diff --git a/src/bin/service_data_manager.py b/src/bin/service_data_manager.py
index 2a1eb6c1c..e102d9080 100644
--- a/src/bin/service_data_manager.py
+++ b/src/bin/service_data_manager.py
@@ -75,7 +75,7 @@ def trigger_update() -> None:
schedule_map: Dict[str, Callable[[Optional[str]], None]] = {
"local_files": lambda last_run=None: data_manager.localfile_manager.schedule_collect_local_files(data_manager.persistence, last_run=last_run),
"links": lambda last_run=None: data_manager.scraper_manager.schedule_collect_links(data_manager.persistence, last_run=last_run),
- "git": lambda last_run=None: data_manager.scraper_manager.schedule_collect_git(data_manager.persistence, last_run=last_run),
+ "git": lambda last_run=None: data_manager.git_manager.schedule_collect_git(data_manager.persistence, last_run=last_run),
"sso": lambda last_run=None: data_manager.scraper_manager.schedule_collect_sso(data_manager.persistence, last_run=last_run),
"jira": lambda last_run=None: data_manager.ticket_manager.schedule_collect_jira(data_manager.persistence, last_run=last_run),
"redmine": lambda last_run=None: data_manager.ticket_manager.schedule_collect_redmine(data_manager.persistence, last_run=last_run),
diff --git a/src/data_manager/collectors/git_manager.py b/src/data_manager/collectors/git_manager.py
index ab8a1603b..0978d2697 100644
--- a/src/data_manager/collectors/git_manager.py
+++ b/src/data_manager/collectors/git_manager.py
@@ -72,7 +72,6 @@ def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
def collect_all_from_config(self, persistence: PersistenceService) -> None:
if not self.enabled:
- logger.info("Git source disabled; skipping")
return
urls: List[str] = self.config.get("urls", [])
if not urls:
@@ -80,6 +79,18 @@ def collect_all_from_config(self, persistence: PersistenceService) -> None:
return
self.collect(urls, persistence)
+ def schedule_collect_git(
+ self, persistence: PersistenceService, last_run: Optional[str] = None
+ ) -> None:
+ """Re-harvest all repos known to the catalog (config + dynamically added)."""
+ metadata = persistence.catalog.get_metadata_by_filter(
+ "source_type", source_type="git", metadata_keys=["repo_url"]
+ )
+ urls = list({m[1]["repo_url"] for m in metadata if m[1].get("repo_url")})
+ if not urls:
+ return
+ self.collect(urls, persistence)
+
def collect(self, git_urls: List[str], persistence: PersistenceService) -> None:
"""Collect a list of git URLs and persist each harvested file."""
if not git_urls:
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
new file mode 100644
index 000000000..ee59404b4
--- /dev/null
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Type
+
+from scrapy.crawler import CrawlerProcess, Crawler
+from scrapy.utils.project import get_project_settings
+
+from src.data_manager.collectors.persistence import PersistenceService
+from src.utils.config_access import get_global_config
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+# Spider registry — add new spider classes here, nothing else changes
+_SPIDER_REGISTRY: Dict[str, str] = {
+ "link": "src.data_manager.collectors.scrapers.spiders.link.LinkSpider",
+ "twiki": "src.data_manager.collectors.scrapers.spiders.twiki.TwikiSpider",
+}
+
+
+def _import_spider(dotted_path: str):
+ module_path, cls_name = dotted_path.rsplit(".", 1)
+ import importlib
+ return getattr(importlib.import_module(module_path), cls_name)
+
+
class ScraperManager:
    """
    Coordinates all web crawls as a single CrawlerProcess run.

    One CrawlerProcess → one Twisted reactor → all spiders run concurrently.
    Git collection is GitManager's responsibility.
    SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
    """

    def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
        """Read the ``sources.links`` config block, tolerating odd shapes (None, non-dict)."""
        global_config = get_global_config()
        self.data_path = Path(global_config["DATA_PATH"])

        sources_config = (dm_config or {}).get("sources", {}) or {}
        links_config = sources_config.get("links", {}) if isinstance(sources_config, dict) else {}

        self.config = links_config if isinstance(links_config, dict) else {}
        self.enabled = self.config.get("enabled", True)
        self.input_lists: List[str] = self.config.get("input_lists", [])

        # Per-spider kwargs forwarded from config
        self.max_depth: Optional[int] = self.config.get("max_depth")
        self.max_pages: Optional[int] = self.config.get("max_pages")
        self.delay: Optional[int] = self.config.get("download_delay")

    # ── Public interface ──────────────────────────────────────────────────────

    def collect_all_from_config(self, persistence: PersistenceService) -> None:
        """Crawl every URL from the configured weblists (both plain and SSO)."""
        if not self.enabled:
            logger.info("Web scraping disabled; skipping")
            return

        link_urls, sso_urls = self._collect_urls_from_lists_by_type(self.input_lists)
        self._run_crawl(persistence, link_urls=link_urls, sso_urls=sso_urls)

    def collect_links(
        self,
        persistence: PersistenceService,
        link_urls: Optional[List[str]] = None,
    ) -> None:
        """Crawl the given plain (non-SSO) URLs; no-op on empty input."""
        if not link_urls:
            return
        self._run_crawl(persistence, link_urls=link_urls)

    def collect_sso(
        self,
        persistence: PersistenceService,
        sso_urls: Optional[List[str]] = None,
    ) -> None:
        """Crawl the given SSO-protected URLs; no-op on empty input."""
        if not sso_urls:
            return
        self._run_crawl(persistence, sso_urls=sso_urls)

    def schedule_collect_links(
        self, persistence: PersistenceService, last_run: Optional[str] = None
    ) -> None:
        """Re-crawl every 'web' URL already known to the catalog."""
        metadata = persistence.catalog.get_metadata_by_filter(
            "source_type", source_type="web", metadata_keys=["url"]
        )
        self.collect_links(persistence, link_urls=self._unique_urls(metadata))

    def schedule_collect_sso(
        self, persistence: PersistenceService, last_run: Optional[str] = None
    ) -> None:
        """Re-crawl every 'sso' URL already known to the catalog."""
        metadata = persistence.catalog.get_metadata_by_filter(
            "source_type", source_type="sso", metadata_keys=["url"]
        )
        self.collect_sso(persistence, sso_urls=self._unique_urls(metadata))

    @staticmethod
    def _unique_urls(metadata) -> List[str]:
        """
        Extract URLs from catalog metadata rows, stripped, de-duplicated
        (order-preserving via dict.fromkeys) and with blank entries dropped.
        Mirrors GitManager.schedule_collect_git, which also dedupes before
        re-collection, so a catalog with duplicates doesn't trigger duplicate crawls.
        """
        stripped = (m[1].get("url", "").strip() for m in metadata if m[1].get("url"))
        return list(dict.fromkeys(u for u in stripped if u))

    # ── CrawlerProcess wiring ─────────────────────────────────────────────────

    def _run_crawl(
        self,
        persistence: PersistenceService,
        link_urls: Optional[List[str]] = None,
        sso_urls: Optional[List[str]] = None,
    ) -> None:
        """Build one CrawlerProcess, add all spiders, start the reactor (blocking)."""
        websites_dir = self.data_path / "websites"
        websites_dir.mkdir(parents=True, exist_ok=True)

        scrapy_settings = get_project_settings()
        process = CrawlerProcess(scrapy_settings)

        if link_urls:
            self._add_crawler(
                process,
                spider_key="link",
                persistence=persistence,
                output_dir=websites_dir,
                start_urls=link_urls,
            )

        if sso_urls:
            self._add_crawler(
                process,
                spider_key="link",
                persistence=persistence,
                output_dir=websites_dir / "sso",
                start_urls=sso_urls,
                auth_provider_name="cern_sso",
            )

        # `crawlers` is CrawlerRunner's public accessor; it is populated by
        # process.crawl() inside _add_crawler. Don't reach into `_crawlers`.
        if not process.crawlers:
            logger.info("No URLs to crawl; skipping reactor start")
            return

        logger.info("Starting CrawlerProcess with %d spider(s)", len(process.crawlers))
        process.start()  # blocks until all spiders finish
        logger.info("CrawlerProcess finished")

    def _add_crawler(
        self,
        process: CrawlerProcess,
        spider_key: str,
        persistence: PersistenceService,
        output_dir: Path,
        **spider_kwargs,
    ) -> None:
        """
        Create a Crawler for spider_key, inject PersistencePipeline settings,
        and register it with the process.
        """
        SpiderClass = _import_spider(_SPIDER_REGISTRY[spider_key])
        crawler: Crawler = process.create_crawler(SpiderClass)

        # Inject persistence objects — live Python instances, must be priority="spider"
        crawler.settings.set("PERSISTENCE_SERVICE", persistence, priority="spider")
        crawler.settings.set("PERSISTENCE_OUTPUT_DIR", output_dir, priority="spider")
        crawler.settings.set(
            "ITEM_PIPELINES",
            {"src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300},
            priority="spider",
        )

        # Forward crawl tuning args if configured; setdefault keeps any
        # per-call override passed by the caller.
        if self.max_depth is not None:
            spider_kwargs.setdefault("max_depth", self.max_depth)
        if self.max_pages is not None:
            spider_kwargs.setdefault("max_pages", self.max_pages)
        if self.delay is not None:
            spider_kwargs.setdefault("delay", self.delay)

        process.crawl(crawler, **spider_kwargs)

    # ── URL list parsing ──────────────────────────────────────────────────────

    def _collect_urls_from_lists_by_type(
        self, input_lists: List[str]
    ) -> tuple[List[str], List[str]]:
        """
        Parse weblists and split by prefix.
        sso- prefix → SSO-protected URLs (AuthDownloaderMiddleware handles auth)
        no prefix → standard link URLs
        git- prefix → ignored here (GitManager's responsibility)
        """
        link_urls: List[str] = []
        sso_urls: List[str] = []

        for raw_url in self._collect_urls_from_lists(input_lists):
            if raw_url.startswith("git-"):
                continue  # GitManager owns these
            if raw_url.startswith("sso-"):
                sso_urls.append(raw_url.split("sso-", 1)[1])
            else:
                link_urls.append(raw_url)

        return link_urls, sso_urls

    def _collect_urls_from_lists(self, input_lists: List[str]) -> List[str]:
        """Concatenate URLs from every named weblist file under ./weblists."""
        urls: List[str] = []
        if not input_lists:
            return urls
        for list_name in input_lists:
            # Path(...).name guards against path traversal in the list name.
            list_path = Path("weblists") / Path(list_name).name
            if not list_path.exists():
                logger.warning("Input list not found: %s", list_path)
                continue
            urls.extend(self._extract_urls_from_file(list_path))
        return urls

    def _extract_urls_from_file(self, path: Path) -> List[str]:
        """
        Read one weblist file: one URL per line, '#' lines are comments,
        anything after the first comma on a line is ignored (annotations).
        """
        urls: List[str] = []
        with path.open("r") as f:
            for line in f:
                stripped = line.strip()
                if not stripped or stripped.startswith("#"):
                    continue
                urls.append(stripped.split(",")[0].strip())
        return urls
\ No newline at end of file
diff --git a/src/data_manager/data_manager.py b/src/data_manager/data_manager.py
index 1f4b01a32..8dbf22a78 100644
--- a/src/data_manager/data_manager.py
+++ b/src/data_manager/data_manager.py
@@ -2,9 +2,10 @@
from typing import Callable, Optional
from src.data_manager.collectors.persistence import PersistenceService
-from src.data_manager.collectors.scrapers.scraper_manager import ScraperManager
+from src.data_manager.collectors.scraper_manager import ScraperManager
from src.data_manager.collectors.tickets.ticket_manager import TicketManager
from src.data_manager.collectors.localfile_manager import LocalFileManager
+from src.data_manager.collectors.git_manager import GitManager
from src.data_manager.vectorstore.manager import VectorStoreManager
from src.utils.config_access import get_full_config
from src.utils.config_service import ConfigService
@@ -36,6 +37,7 @@ def __init__(self, *, run_ingestion: bool = True, factory=None):
self.config["data_manager"]["sources"] = static_config.sources_config
self.localfile_manager = LocalFileManager(dm_config=self.config["data_manager"])
+ self.git_manager = GitManager(dm_config=self.config["data_manager"])
self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"])
self.ticket_manager = TicketManager(dm_config=self.config["data_manager"])
@@ -61,7 +63,8 @@ def run_ingestion(self, progress_callback: Optional[Callable[[str], None]] = Non
"""Execute initial ingestion and vectorstore update."""
source_aggregation = [
("Copying configured local files", lambda: self.localfile_manager.collect_all_from_config(self.persistence)),
- ("Scraping documents onto filesystem", lambda: self.scraper_manager.collect_all_from_config(self.persistence)),
+ ("Collecting git repos", lambda: self.git_manager.collect_all_from_config(self.persistence)),
+ ("Scraping web sources onto filesystem", lambda: self.scraper_manager.collect_all_from_config(self.persistence)),
("Fetching ticket data onto filesystem", lambda: self.ticket_manager.collect_all_from_config(self.persistence)),
]
diff --git a/src/interfaces/uploader_app/app.py b/src/interfaces/uploader_app/app.py
index f7fd20cc8..d340978fa 100644
--- a/src/interfaces/uploader_app/app.py
+++ b/src/interfaces/uploader_app/app.py
@@ -14,7 +14,8 @@
from src.data_manager.collectors.persistence import PersistenceService
from src.data_manager.collectors.localfile_manager import LocalFileManager
-from src.data_manager.collectors.scrapers.scraper_manager import ScraperManager
+from src.data_manager.collectors.scraper_manager import ScraperManager
+from src.data_manager.collectors.git_manager import GitManager
from src.data_manager.collectors.utils.catalog_postgres import PostgresCatalogService
from src.data_manager.collectors.tickets.ticket_manager import TicketManager
from src.data_manager.vectorstore.loader_utils import load_text_from_path
@@ -78,6 +79,7 @@ def __init__(
logger.warning("UPLOADER_SALT not set; account checks may fail.")
self.scraper_manager = ScraperManager(dm_config=self.config.get("data_manager"))
+ self.git_manager = GitManager(dm_config=self.config.get("data_manager"))
self.ticket_manager = TicketManager(dm_config=self.config.get("data_manager"))
self.localfile_manager = LocalFileManager(dm_config=self.config.get("data_manager"))
self.post_update_hook = post_update_hook
@@ -174,7 +176,8 @@ def add_git_repo(self):
return jsonify({"error": "missing_repo_url"}), 400
try:
- self.scraper_manager.collect_git(self.persistence, [repo_url.strip()])
+ self.git_manager.collect([repo_url.strip()], self.persistence)
+ self.persistence.flush_index()
self._update_source_status("git", state="idle", last_run=self._now_iso())
self._notify_update()
return jsonify({"status": "ok"})
From c4da024a193a9e26aefb2def3f4304814d2a7a7b Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 05:43:22 +0200
Subject: [PATCH 19/55] remove deprecated scrapers and dead code before
interface revision.
---
.../scrapers/integrations/__init__.py | 0
.../scrapers/integrations/git_scraper.py | 353 -------------
.../scrapers/integrations/sso_scraper.py | 466 ------------------
.../collectors/scrapers/scraped_resource.py | 11 -
.../collectors/scrapers/scraper.py | 314 ------------
.../collectors/scrapers/scraper_manager.py | 366 --------------
src/interfaces/uploader_app/app.py | 3 +-
7 files changed, 1 insertion(+), 1512 deletions(-)
delete mode 100644 src/data_manager/collectors/scrapers/integrations/__init__.py
delete mode 100644 src/data_manager/collectors/scrapers/integrations/git_scraper.py
delete mode 100644 src/data_manager/collectors/scrapers/integrations/sso_scraper.py
delete mode 100644 src/data_manager/collectors/scrapers/scraper.py
delete mode 100644 src/data_manager/collectors/scrapers/scraper_manager.py
diff --git a/src/data_manager/collectors/scrapers/integrations/__init__.py b/src/data_manager/collectors/scrapers/integrations/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/data_manager/collectors/scrapers/integrations/git_scraper.py b/src/data_manager/collectors/scrapers/integrations/git_scraper.py
deleted file mode 100644
index 7d73fd37a..000000000
--- a/src/data_manager/collectors/scrapers/integrations/git_scraper.py
+++ /dev/null
@@ -1,353 +0,0 @@
-import os
-import re
-import shutil
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-from git import Repo
-from mkdocs.utils.yaml import yaml_load
-
-from src.utils.config_access import get_global_config
-from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
- from src.data_manager.collectors.scrapers.scraper_manager import \
- ScraperManager
-
-global_config = get_global_config()
-
-class GitScraper:
- """Scraper integration that clones Git repositories and indexes MkDocs sites and code files."""
-
- def __init__(self, manager: "ScraperManager", git_config: Optional[Dict[str, Any]] = None) -> None:
- self.manager = manager
- self.config = git_config or {}
-
- # where we clone our repos to
- self.data_path = global_config["DATA_PATH"]
- self.git_dir = Path(self.data_path) / "raw_git_repos"
- self.git_dir.mkdir(parents=True, exist_ok=True)
-
- self.code_suffixes = {
- suffix.lower()
- for suffix in (
- self.config.get(
- "code_suffixes",
- [
- ".py",
- ".js",
- ".ts",
- ".tsx",
- ".jsx",
- ".java",
- ".go",
- ".rs",
- ".c",
- ".cpp",
- ".h",
- ".hpp",
- ".sh",
- ".sql",
- ".json",
- ".yaml",
- ".yml",
- ".toml",
- ".md",
- ".txt",
- ],
- )
- or []
- )
- }
- self.exclude_dirs = {
- dir_name
- for dir_name in (
- self.config.get(
- "exclude_dirs",
- [
- ".git",
- "node_modules",
- ".venv",
- "venv",
- "__pycache__",
- ".idea",
- ".vscode",
- "dist",
- "build",
- ],
- )
- or []
- )
- }
- self.max_file_size_bytes = int(self.config.get("max_file_size_bytes", 1_000_000))
-
- self.git_username = read_secret("GIT_USERNAME")
- self.git_token = read_secret("GIT_TOKEN")
- self._credentials_available = bool(self.git_username and self.git_token)
- if not self._credentials_available:
- logger.info("No git credentials supplied; will attempt public repo cloning.")
-
- def collect(self, git_urls: List[str]) -> List[ScrapedResource]:
- if not git_urls:
- logger.warning("No git URLs provided for scraping; skipping git scraper.")
- return []
-
- harvested: List[ScrapedResource] = []
-
- for url in git_urls:
- try:
- repo_info = self._prepare_repository(url)
- except ValueError as exc:
- logger.info(f"{exc}")
- continue
- except Exception as exc:
- logger.error(f"Failed to clone {url}: {exc}")
- continue
-
- try:
- harvested.extend(self._harvest_repository(repo_info))
- finally:
- shutil.rmtree(repo_info["repo_path"], ignore_errors=True)
-
- if harvested:
- logger.info("Git scraping was completed successfully")
-
- return harvested
-
- def _prepare_repository(self, url: str) -> Dict[str, Any]:
- url_dict = self._parse_url(url)
- repo_path = self._clone_repo(url_dict)
- mkdocs_site_url = self._read_mkdocs_site_url(repo_path)
- ref = self._determine_ref(repo_path, url_dict["branch"])
- web_base_url = self._compute_web_base_url(url_dict["original_url"])
-
- return {
- "repo_path": repo_path,
- "repo_name": url_dict["repo_name"],
- "mkdocs_site_url": mkdocs_site_url,
- "ref": ref,
- "web_base_url": web_base_url,
- }
-
- def _harvest_repository(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- resources: List[ScrapedResource] = []
- resources.extend(self._harvest_mkdocs(repo_info))
- resources.extend(self._harvest_code(repo_info))
- return resources
-
- def _harvest_mkdocs(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- repo_path = repo_info["repo_path"]
- mkdocs_site_url = repo_info["mkdocs_site_url"]
- base_url = repo_info["web_base_url"]
- ref = repo_info["ref"]
- docs_dir = repo_path / "docs"
- if not docs_dir.exists():
- logger.info(f"Skipping MkDocs harvesting for {repo_path}; missing docs directory")
- return []
-
- resources: List[ScrapedResource] = []
- parent_repo = repo_info["repo_name"]
- used_blob_links = False
- for markdown_path in docs_dir.rglob("*.md"):
- if mkdocs_site_url:
- current_url = mkdocs_site_url + markdown_path.relative_to(docs_dir).with_suffix("").as_posix()
- else:
- current_url = self._build_blob_url(base_url, ref, markdown_path.relative_to(repo_path))
- used_blob_links = True
- logger.info(f"Indexing Git doc: {current_url}")
- text_content = markdown_path.read_text(encoding="utf-8")
- relative_path = Path(parent_repo) / markdown_path.relative_to(repo_path)
- resource = ScrapedResource(
- url=current_url,
- content=text_content,
- suffix=markdown_path.suffix.lstrip(".") or "txt",
- source_type="git",
- metadata={
- "repo_path": str(markdown_path.relative_to(repo_path)),
- "title": markdown_path.stem.replace("_", " ").replace("-", " ").title(),
- "parent": parent_repo,
- },
- file_name=markdown_path.name,
- relative_path=str(relative_path),
- )
- if resource.content:
- resources.append(resource)
- else:
- logger.info(f"Resource {current_url} is empty. Skipping...")
-
- if used_blob_links and not mkdocs_site_url:
- logger.info(f"Used repository blob URLs for MkDocs content in {repo_path} (site_url missing)")
-
- return resources
-
- def _harvest_code(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- repo_path = repo_info["repo_path"]
- ref = repo_info["ref"]
- base_url = repo_info["web_base_url"]
- repo_name = repo_info["repo_name"]
-
- resources: List[ScrapedResource] = []
- for file_path in self._iter_code_files(repo_path):
- logger.debug(file_path)
- rel_path = file_path.relative_to(repo_path)
-
- # avoid overlap wtih _harvest_mkdocs
- if rel_path.parts and rel_path.parts[0] == "docs" and file_path.suffix.lower() == ".md":
- continue
-
- try:
- if file_path.stat().st_size > self.max_file_size_bytes:
- logger.warning(f"Skipping {file_path} due to file size")
- continue
- except OSError:
- continue
-
- if not self._is_allowed_suffix(file_path):
- logger.warning(f"Skipping {file_path} due to disallowed suffix")
- continue
-
- if self._looks_binary(file_path):
- logger.warning(f"Skipping {file_path} due to likely binary content")
- continue
-
- try:
- text_content = file_path.read_text(encoding="utf-8", errors="ignore")
- except Exception:
- continue
-
- if not text_content.strip():
- continue
-
- resource_url = self._build_blob_url(base_url, ref, rel_path)
- relative_path = Path(repo_name) / rel_path
- resource = ScrapedResource(
- url=resource_url,
- content=text_content,
- suffix=file_path.suffix.lstrip("."),
- source_type="git",
- metadata={
- "repo_path": str(rel_path),
- "parent": repo_name,
- "ref": ref,
- },
- file_name=file_path.name,
- relative_path=str(relative_path),
- )
- resources.append(resource)
-
- return resources
-
- def _parse_url(self, url: str) -> dict:
- branch_name = None
-
- regex_repo_name = r"(?:github|gitlab)\.[\w.]+\/[^\/]+\/([\w.-]+)(?:\.git|\/|$)"
- match = re.search(regex_repo_name, url, re.IGNORECASE)
- if not match:
- raise ValueError(f"The git url {url} does not match the expected format.")
-
- repo_name = match.group(1)
-
- # Only inject credentials if available (for private repos)
- if self._credentials_available:
- if "gitlab" in url:
- clone_from_url = url.replace("gitlab", f"{self.git_username}:{self.git_token}@gitlab")
- elif "github" in url:
- clone_from_url = url.replace("github", f"{self.git_username}:{self.git_token}@github")
- else:
- # For other hosts, try without credentials
- clone_from_url = url
- else:
- # No credentials - use URL as-is (for public repos)
- clone_from_url = url
-
- branch_split = re.split(r"/(?:-/)?tree/", clone_from_url, maxsplit=1)
- if len(branch_split) > 1:
- branch_name = branch_split[1].strip("/") or None
- clone_from_url = branch_split[0].rstrip("/")
-
- return {
- "original_url": url,
- "clone_url": clone_from_url,
- "repo_name": repo_name,
- "branch": branch_name,
- }
-
- def _clone_repo(self, url_dict: dict) -> Path:
- clone_url = url_dict["clone_url"]
- branch = url_dict["branch"]
- repo_name = url_dict["repo_name"]
-
- logger.info(f"Cloning repository {repo_name}...")
-
- repo_path = self.git_dir / repo_name
- if branch is None:
- Repo.clone_from(clone_url, repo_path)
- else:
- Repo.clone_from(clone_url, repo_path, branch=branch)
-
- return repo_path
-
- def _read_mkdocs_site_url(self, repo_path: Path) -> Optional[str]:
- mkdocs_file = repo_path / "mkdocs.yml"
- if not mkdocs_file.exists():
- return None
- try:
- with mkdocs_file.open("r") as file:
- data = yaml_load(file)
- site_url = data.get("site_url")
- if not site_url:
- return None
- return site_url if site_url.endswith("/") else site_url + "/"
- except Exception:
- logger.info(f"Could not read mkdocs.yml in {repo_path}")
- return None
-
- def _compute_web_base_url(self, original_url: str) -> str:
- sanitized = re.sub(r"//[^@/]+@", "//", original_url)
- sanitized = re.split(r"/(?:-/)?tree/", sanitized, maxsplit=1)[0]
- if sanitized.endswith(".git"):
- sanitized = sanitized[:-4]
- return sanitized.rstrip("/")
-
- def _determine_ref(self, repo_path: Path, requested_branch: Optional[str]) -> str:
- if requested_branch:
- return requested_branch
- repo: Optional[Repo] = None
- try:
- repo = Repo(repo_path)
- return repo.active_branch.name
- except Exception:
- try:
- repo = repo or Repo(repo_path)
- return repo.head.commit.hexsha[:7]
- except Exception:
- return "main"
-
- def _iter_code_files(self, repo_path: Path):
- for root, dirs, files in os.walk(repo_path):
- dirs[:] = [d for d in dirs if d not in self.exclude_dirs]
- for filename in files:
- file_path = Path(root) / filename
- yield file_path
-
- def _is_allowed_suffix(self, file_path: Path) -> bool:
- return file_path.suffix.lower() in self.code_suffixes
-
- def _looks_binary(self, file_path: Path) -> bool:
- try:
- with file_path.open("rb") as file:
- sample = file.read(8000)
- return b"\0" in sample
- except Exception:
- return True
-
- def _build_blob_url(self, base_url: str, ref: str, rel_path: Path) -> str:
- base = base_url.rstrip("/")
- rel = rel_path.as_posix()
- if "gitlab" in base:
- return f"{base}/-/blob/{ref}/{rel}"
- return f"{base}/blob/{ref}/{rel}"
diff --git a/src/data_manager/collectors/scrapers/integrations/sso_scraper.py b/src/data_manager/collectors/scrapers/integrations/sso_scraper.py
deleted file mode 100644
index d03877bfb..000000000
--- a/src/data_manager/collectors/scrapers/integrations/sso_scraper.py
+++ /dev/null
@@ -1,466 +0,0 @@
-import hashlib
-import importlib
-import json
-import os
-import re
-import time
-import urllib.parse
-from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple
-
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.common.exceptions import TimeoutException
-
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource, BrowserIntermediaryResult
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-class SSOScraper(ABC):
- """Generic base class for SSO-authenticated web scrapers."""
-
- def __init__(self, username=None, password=None, headless=True, site_type="generic", max_depth=2, selenium_url=None):
- """Initialize the SSO scraper with credentials and browser settings.
-
- Args:
- username (str, optional): SSO username. If None, will try to get from env vars.
- password (str, optional): SSO password. If None, will try to get from env vars.
- headless (bool): Whether to run the browser in headless mode.
- site_type (str): Type of site to scrape ('generic' or 'mkdocs')
- max_depth (int): Maximum number of levels to crawl per page.
- """
- self.username = username or self.get_username_from_env()
- self.password = password or self.get_password_from_env()
- self.headless = headless
- self.max_depth = max_depth
- self.site_type = site_type
- self.driver = None
- self.visited_urls = set()
- self.selenium_url = selenium_url
-
- if self.username:
- logger.info(f"Using username: {self.username}")
-
- def _is_image_url(self, url: str) -> bool:
- """Check if URL points to an image file."""
- image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.webp')
- parsed_url = urllib.parse.urlparse(url)
- path = parsed_url.path.lower()
- return any(path.endswith(ext) for ext in image_extensions)
-
- @abstractmethod
- def get_username_from_env(self):
- """Get username from environment variables. Override in subclasses."""
- pass
-
- @abstractmethod
- def get_password_from_env(self):
- """Get password from environment variables. Override in subclasses."""
- pass
-
- @abstractmethod
- def login(self):
- """Login to SSO with the provided credentials. Override in subclasses."""
- pass
-
- def setup_driver(self):
- """Configure and initialize the Firefox WebDriver."""
- firefox_options = FirefoxOptions()
- if self.headless:
- firefox_options.add_argument("--headless")
-
- # Additional options for better performance in containers
- firefox_options.add_argument("--no-sandbox")
- firefox_options.add_argument("--disable-dev-shm-usage")
- firefox_options.add_argument("--disable-gpu")
- firefox_options.add_argument("--window-size=1920,1080")
-
- # Create Firefox profile with preferences
- firefox_profile = webdriver.FirefoxProfile()
- firefox_profile.set_preference("dom.disable_open_during_load", False)
- firefox_profile.set_preference("browser.download.folderList", 2)
- firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
- firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
-
- # Initialize the driver with options
- if self.selenium_url:
- self.driver = webdriver.Remote(command_executor=self.selenium_url,options=firefox_options)
- else:
- self.driver = webdriver.Firefox(options=firefox_options)
- self.driver.set_page_load_timeout(30)
- logger.info(f"Starting Firefox browser in {'headless' if self.headless else 'visible'} mode...")
- return self.driver
-
- def navigate_to(self, url, wait_time=1):
- """Navigate to specified URL and wait for page to load."""
- if not self.driver:
- raise RuntimeError("WebDriver not initialized. Call setup_driver() first.")
-
- self.driver.get(url)
- time.sleep(wait_time) # Enable wait time for page loading
- logger.info(f"Navigated to {url}")
- logger.info(f"Page title: {self.driver.title}")
- return self.driver.title
-
- def get_links_with_same_hostname(self, base_url):
- """Extract all links from the current page that have the same hostname as base_url."""
- base_hostname = urllib.parse.urlparse(base_url).netloc
- links = []
-
- # Find all anchor tags
- if self.site_type == "mkdocs":
- # For MkDocs, prioritize navigation links
- anchors = self.driver.find_elements(By.CSS_SELECTOR, ".md-nav__link, .md-content a")
- else:
- anchors = self.driver.find_elements(By.TAG_NAME, "a")
-
- for anchor in anchors:
- try:
- href = anchor.get_attribute("href")
- if href and href.strip():
- parsed_url = urllib.parse.urlparse(href)
- # Check if the link has the same hostname and is not a fragment
- if parsed_url.netloc == base_hostname and parsed_url.scheme in ('http', 'https'):
- # Normalize the URL to prevent duplicates
- normalized_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
- if parsed_url.query:
- normalized_url += f"?{parsed_url.query}"
-
- # this works for CMS twiki but should be generalized
- normalized_url = normalized_url.split("?")[0]
- if 'bin/rdiff' in normalized_url or 'bin/edit' in normalized_url or 'bin/oops' in normalized_url or 'bin/attach' in normalized_url or 'bin/genpdf' in normalized_url or '/WebIndex' in normalized_url:
- continue
-
- if not self._clear_url(normalized_url):
- continue
-
- # Skip image files
- if self._is_image_url(normalized_url):
- logger.debug(f"Skipping image URL: {normalized_url}")
- continue
-
- links.append(normalized_url)
-
- except Exception as e:
- logger.error(f"Error extracting link: {e}")
-
- return list(set(links)) # Remove duplicates
-
- def extract_page_data(self, current_url):
- """Return the raw HTML payload for the current page."""
- if not self.driver:
- raise RuntimeError("WebDriver not initialized. Call setup_driver() first.")
-
- title = self.driver.title or ""
- content = self.driver.page_source or ""
-
- return {
- "url": current_url,
- "title": title,
- "content": content,
- "suffix": "html",
- }
-
- def crawl(self, start_url):
- """Crawl pages starting from the given URL, storing title and content of each page.
-
- Args:
- start_url (str): The URL to start crawling from
-
- Returns:
- List[Dict]: A list of dictionaries describing each visited page.
- """
- max_depth = self.max_depth
- depth = 0
-
- if not self.driver:
- self.setup_driver()
-
- # Reset crawling state
- self.visited_urls = set()
- self.page_data = []
- to_visit = [start_url]
- level_links = []
-
- # First authenticate through the start URL
- self.authenticate_and_navigate(start_url)
-
- base_hostname = urllib.parse.urlparse(start_url).netloc
- logger.info(f"Base hostname for crawling: {base_hostname}")
- logger.info(f"Site type: {self.site_type}")
-
- # History record
- pages_visited = 0
- self.visited_urls = set()
-
- while to_visit and depth < max_depth:
- current_url = to_visit.pop(0)
-
- # Skip if we've already visited this URL
- if current_url in self.visited_urls:
- continue
-
- # Skip image files
- if self._is_image_url(current_url):
- logger.debug(f"Skipping image URL: {current_url}")
- self.visited_urls.add(current_url)
- continue
-
- logger.info(f"Crawling page {depth + 1}/{max_depth}: {current_url}")
-
- try:
- # Navigate to the page
- self.navigate_to(current_url, wait_time=2)
-
- # Mark as visited
- self.visited_urls.add(current_url)
- pages_visited += 1
-
- # Extract and store page data
- page_data = self.extract_page_data(current_url)
- self.page_data.append(page_data)
- logger.info(f"Extracted data from {current_url} ({len(page_data['content'])} chars)")
-
- # Get links to follow
- new_links = self.get_links_with_same_hostname(current_url)
- logger.info(f"Found {len(new_links)} links on the page (nv: {pages_visited})")
-
- # Add new links to visit
- for link in new_links:
- if link not in self.visited_urls and link not in to_visit and link not in level_links:
- logger.info(f"Found new link: {link} (nv: {pages_visited})")
- level_links.append(link)
-
- # Scan next level if to_visit is empty
- if not to_visit:
- to_visit.extend(level_links)
- level_links = []
- depth += 1
-
- except Exception as e:
- logger.info(f"Error crawling {current_url}: {e}", exc_info=True)
- self.visited_urls.add(current_url) # Mark as visited to avoid retrying
-
- logger.info(f"Crawling complete. Visited {pages_visited} pages.")
- return list(self.page_data)
-
- def _clear_url(self, url: str) -> bool:
- """Basic filtering for duplicate or fragment-only URLs."""
- if not url:
- return False
-
- # Ignore pure fragments or JavaScript links
- if url.startswith("javascript:"):
- return False
-
- return True
-
- def close(self):
- """Close the browser and clean up resources."""
- if self.driver:
- logger.info("Closing browser...")
- self.driver.quit()
- self.driver = None
-
- def authenticate_and_navigate(self, url):
- """Complete authentication flow and navigate to target URL."""
-
- if not self.driver:
- self.setup_driver()
-
- try:
- # First navigate to trigger SSO
- self.driver.get(url)
-
- # Login
- if self.login():
- # Navigate back to target page
- title = self.navigate_to(url)
- return title
- else:
- return None
- except Exception as e:
- logger.warning(f"Error during authentication: {e}", exc_info=True)
- return None
-
- def authenticate(self, url):
- """Complete authentication flow and navigate to target URL."""
- try:
- if not self.driver:
- self.setup_driver()
-
- # First navigate to trigger SSO
- self.driver.get(url)
-
- # Login
- if self.login():
- # Navigate back to target page
- return self.driver.get_cookies()
- else:
- return None
- except Exception as e:
- logger.warning(f"Error during authentication: {e}", exc_info=True)
- return None
-
- def __enter__(self):
- """Context manager entry point."""
- self.setup_driver()
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- """Context manager exit point."""
- self.close()
-
-
-class CERNSSOScraper(SSOScraper):
- """A scraper to handle CERN SSO authentication and page navigation."""
-
- def get_username_from_env(self):
- """Get CERN SSO username from environment variables."""
- return read_secret("SSO_USERNAME")
-
- def get_password_from_env(self):
- """Get CERN SSO password from environment variables."""
- return read_secret("SSO_PASSWORD")
-
- def login(self):
- """Login to CERN SSO with the provided credentials."""
- if not self.username or not self.password:
- raise ValueError("Missing credentials for CERN SSO")
-
- try:
- wait = WebDriverWait(self.driver, 20)
-
- # Wait for login form to appear
- username_input = wait.until(
- EC.presence_of_element_located((By.ID, "username"))
- )
- username_input.send_keys(self.username)
- # time.sleep(1) # Optional sleep to ensure the input is registered
-
- password_input = wait.until(EC.presence_of_element_located((By.ID, "password")))
- password_input.send_keys(self.password)
- # time.sleep(1) # Optional sleep to ensure the input is registered
-
- sign_in = wait.until(EC.presence_of_element_located((By.ID, "kc-login")))
- sign_in.click()
-
- logger.info("Login credentials submitted")
- return True
- except TimeoutException as e:
- logger.error(f"Could not find username or password fields in due time: {e}", exc_info=True)
- except Exception as e:
- logger.error(f"Error during login: {e}",exc_info=True)
- return False
-
-
-class SSOCollector:
- """Collects resources behind SSO-protected URLs using configured scrapers."""
-
- def __init__(self, selenium_config: Dict[str, Dict]) -> None:
- self._config = selenium_config or {}
- self._enabled = self._config.get("enabled", False)
- self._class_name = self._config.get("selenium_class", "")
- self._class_map = self._config.get("selenium_class_map", {})
-
- def collect(self, url: str) -> List[ScrapedResource]:
- if not self._enabled:
- logger.error("SSO is disabled or not configured")
- return []
-
- scraper_class, scraper_kwargs = self._resolve_scraper()
- if scraper_class is None:
- return []
-
- try:
- with scraper_class(**scraper_kwargs) as scraper:
- payload = scraper.crawl(url)
- resources = self._extract_resources(scraper, payload)
- if not resources:
- logger.warning(f"No content extracted from SSO crawl for {url}")
- return resources
- except Exception as exc: # pragma: no cover - defensive catch
- logger.error(f"SSO scraping failed for {url}: {exc}")
- return []
-
- def _resolve_scraper(self):
- entry = self._class_map.get(self._class_name)
- if not entry:
- logger.error(f"SSO class {self._class_name} not configured")
- return None, {}
-
- scraper_class = entry.get("class")
- if isinstance(scraper_class, str):
- module_name = entry.get(
- "module",
- "src.data_manager.collectors.scrapers.integrations.sso_scraper",
- )
- module = importlib.import_module(module_name)
- scraper_class = getattr(module, scraper_class)
-
- scraper_kwargs = entry.get("kwargs", {})
- return scraper_class, scraper_kwargs
-
- def _extract_resources(self, scraper, payload) -> List[ScrapedResource]:
- resources: List[ScrapedResource] = []
-
- page_data = getattr(scraper, "page_data", None)
- if isinstance(page_data, list):
- for page in page_data:
- if not isinstance(page, dict):
- continue
- page_url = page.get("url")
- content = page.get("content")
- if not page_url or content is None:
- continue
-
- resources.append(
- ScrapedResource(
- url=page_url,
- content=content,
- suffix=page.get("suffix", "html"),
- source_type="sso",
- metadata={
- "title": page.get("title"),
- },
- )
- )
-
- elif isinstance(payload, list):
- for item in payload:
- if not isinstance(item, dict):
- continue
- page_url = item.get("url")
- content = item.get("content")
- if not page_url or content is None:
- continue
- resources.append(
- ScrapedResource(
- url=page_url,
- content=content,
- suffix=item.get("suffix", "html"),
- source_type="sso",
- metadata={
- "visible": str(self._visible).lower(),
- },
- )
- )
-
- elif isinstance(payload, dict):
- for page_url in payload.values():
- logger.warning(
- f"SSO scraper returned mapping without page content; skipping {page_url}"
- )
-
- elif payload is not None:
- logger.warning(
- f"Unsupported SSO payload type {type(payload).__name__}"
- )
-
- return resources
diff --git a/src/data_manager/collectors/scrapers/scraped_resource.py b/src/data_manager/collectors/scrapers/scraped_resource.py
index 357eaaf41..080e4cbb7 100644
--- a/src/data_manager/collectors/scrapers/scraped_resource.py
+++ b/src/data_manager/collectors/scrapers/scraped_resource.py
@@ -74,14 +74,3 @@ def _safe_relative_path(self) -> Optional[Path]:
if rel_path.is_absolute() or ".." in rel_path.parts:
return None
return rel_path
-
-@dataclass
-class BrowserIntermediaryResult:
- """
- this class is meant to provide a layer of abstraction for browser based scrapers (i.e selenium)
- it will format everything into a single class so that more complicated scraping results which may hit
- multiple tabs or pages at once can be handled in a uniform way by the LinkScraper class.
- """
-
- artifacts: List[Dict] # list of scraper results for each page produced by a seelnium navigation
- links: List[str] # links reached
diff --git a/src/data_manager/collectors/scrapers/scraper.py b/src/data_manager/collectors/scrapers/scraper.py
deleted file mode 100644
index 7fe1ef0e3..000000000
--- a/src/data_manager/collectors/scrapers/scraper.py
+++ /dev/null
@@ -1,314 +0,0 @@
-import requests
-import re
-
-from typing import Dict, Iterator, List, Optional
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urljoin, urldefrag
-
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-class LinkScraper:
- """
- Single scraper for all our link needs that handles Selenium and requests.
- This class explicitly handles requests, but if selenium scraping is enabled for a link
- everything is passed through to the driver including how the page data is collected and
- how the next level of links are found. This class DOESNT own the selenium driver, that is
- owned by the scraper manager class.
- """
-
- def __init__(self, verify_urls: bool = True, enable_warnings: bool = True) -> None:
- self.verify_urls = verify_urls
- self.enable_warnings = enable_warnings
- # seen_urls tracks anything queued/visited; visited_urls tracks pages actually crawled.
- self.visited_urls = set()
- self.seen_urls = set()
-
- def _is_image_url(self, url: str) -> bool:
- """Check if URL points to an image file."""
- image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.webp')
- parsed_url = urlparse(url)
- path = parsed_url.path.lower()
- return any(path.endswith(ext) for ext in image_extensions)
-
- def reap(self, response, current_url: str, selenium_scrape: bool = False, authenticator = None):
- """
- probably the most complicated method here and most volatile in terms of maybe later needing a rewrite
-
- this method is here to deal with any result that it gets back. for a selenium resource it expects results as a
- BrowserIntermediaryResult, otherwhise it will handle it as a normal http response. it handles getting the next set
- of links and updating the page data gathered
-
- Args:
- response (BrowserIntermediaryResult | requests.response): whatever has been collected for the current_url by the scraper
- selenium_scrape (bool): whether or not selenium was used to scrape this content
- authenticator (SSOAuthenticator | None): client being used to crawl websites or just for auth
-
- Return (tuple[list[str], list[ScrapedResource]]): next links to crawl and resources collected
- """
-
- # mark as visited
- self._mark_visited(current_url)
-
- source_type = "web" if (authenticator is None) else "sso"
-
- resources = []
-
- if selenium_scrape: # deals with a selenium response (should work for both non authenitcated and authenticated sites in principle)
- assert(authenticator is not None) ## this shouldnt be tripped
-
- # For selenium scraping, we expect a simple dict from extract_page_data
- # containing url, title, content, suffix
- content = response.get("content", "")
- title = response.get("title", "")
- suffix = response.get("suffix", "html")
-
- resource = ScrapedResource(
- url=current_url,
- content=content,
- suffix=suffix,
- source_type=source_type,
- metadata={
- "title": title,
- "content_type": "rendered_html",
- "renderer": "selenium",
- },
- )
- res = authenticator.get_links_with_same_hostname(current_url)
- resources.append(resource)
-
- else: # deals with http response
- content_type = response.headers.get("Content-type")
-
- if current_url.lower().endswith(".pdf"):
- resource = ScrapedResource(
- url=current_url,
- content=response.content,
- suffix="pdf",
- source_type=source_type,
- metadata={"content_type": content_type},
- )
- else:
- resource = ScrapedResource(
- url=current_url,
- content=response.text,
- suffix="html",
- source_type=source_type,
- metadata={
- "content_type": content_type,
- "encoding": response.encoding,
- },
- )
- res = self.get_links_with_same_hostname(current_url, resource)
- resources.append(resource)
-
- return res, resources # either collected via http or via authenticators method
-
-
- def crawl(
- self,
- start_url: str,
- browserclient = None,
- max_depth: int = 1,
- selenium_scrape: bool = False,
- max_pages: Optional[int] = None,
- ):
- """
- crawl pages from a given starting url up to a given depth either using basic http or a provided browser client
-
- Args :
- start_url (str): Url to start crawling from
- authenticator (SSOAuthenticator): class used for handling authenticatoin for web resources
- max_depth (int): max depth of links to descend from the start url
- selenium_scrape (bool): tracks whether or not the page should be scraped through selenium or not
- max_pages (int | None): cap on total pages to visit before stopping
-
- Returns: List[ScrapedResource]
-
- """
- # Consume the iterator so page_data is populated for callers of crawl().
- for _ in self.crawl_iter(
- start_url,
- browserclient=browserclient,
- max_depth=max_depth,
- selenium_scrape=selenium_scrape,
- max_pages=max_pages,
- collect_page_data=True,
- ):
- pass
- return list(self.page_data)
-
- def crawl_iter(
- self,
- start_url: str,
- browserclient = None,
- max_depth: int = 1,
- selenium_scrape: bool = False,
- max_pages: Optional[int] = None,
- collect_page_data: bool = False,
- ) -> Iterator[ScrapedResource]:
- """
- crawl pages from a given starting url up to a given depth either using basic http or a provided browser client
-
- Args :
- start_url (str): Url to start crawling from
- authenticator (SSOAuthenticator): class used for handling authenticatoin for web resources
- max_depth (int): max depth of links to descend from the start url
- selenium_scrape (bool): tracks whether or not the page should be scraped through selenium or not
- max_pages (int | None): cap on total pages to visit before stopping
- collect_page_data (bool): whether to store resources on the scraper instance
-
- Returns: Iterator[ScrapedResource]
-
- """
-
- if not self.enable_warnings:
- import urllib3
- urllib3.disable_warnings()
-
- depth = 0
- self.visited_urls = set()
- self.seen_urls = set()
- self.page_data = []
- normalized_start_url = self._normalize_url(start_url)
- if not normalized_start_url:
- logger.error(f"Failed to crawl: {start_url}, could not normalize URL")
- return
- to_visit = [normalized_start_url]
- self.seen_urls.add(normalized_start_url)
- level_links = []
- pages_visited = 0
-
- base_hostname = urlparse(normalized_start_url).netloc
- logger.info(f"Base hostname for crawling: {base_hostname}")
-
- # session either stays none or becomes a requests.Session object if not selenium scraping
- session = None
-
- if selenium_scrape: # scrape page with pure selenium
- if browserclient is None:
- logger.error(f"Failed to crawl: {start_url}, auth is needed but no browser clilent was passed through")
- return []
- browserclient.authenticate_and_navigate(normalized_start_url)
-
- elif not selenium_scrape and browserclient is not None: # use browser client for auth but scrape with http request
- session = requests.Session()
- cookies = browserclient.authenticate(normalized_start_url)
- if cookies is not None:
- for cookie_args in cookies:
- cookie = requests.cookies.create_cookie(name=cookie_args['name'],
- value=cookie_args['value'],
- domain=cookie_args.get('domain'),
- path=cookie_args.get('path', '/'),
- expires=cookie_args.get('expires'),
- secure=cookie_args.get('secure', False))
- session.cookies.set_cookie(cookie)
-
- else: # pure html no browser client needed
- session = requests.Session()
-
- while to_visit and depth < max_depth:
- if max_pages is not None and pages_visited >= max_pages:
- logger.info(f"Reached max_pages={max_pages}; stopping crawl early.")
- break
- current_url = to_visit.pop(0)
-
- # Skip if we've already visited this URL
- if current_url in self.visited_urls:
- continue
-
- # Skip image files
- if self._is_image_url(current_url):
- logger.debug(f"Skipping image URL: {current_url}")
- self._mark_visited(current_url)
- continue
-
- logger.info(f"Crawling depth {depth + 1}/{max_depth}: {current_url}")
-
- try:
-
- # grab the page content
- if not selenium_scrape:
- assert (session is not None) # REMOVELATER
- response = session.get(current_url, verify = self.verify_urls)
- response.raise_for_status()
- else:
- assert (browserclient is not None) # REMOVELATER
- browserclient.navigate_to(current_url, wait_time = 2)
- response = browserclient.extract_page_data(current_url) # see the BrowserIntermediaryResult class to see what comes back here
-
-
- # Mark as visited and store content
- pages_visited += 1
- new_links, resources = self.reap(response, current_url, selenium_scrape, browserclient)
- for resource in resources:
- if collect_page_data:
- self.page_data.append(resource)
- yield resource
-
- for link in new_links:
- normalized_link = self._normalize_url(link)
- if not normalized_link:
- continue
- if normalized_link in self.seen_urls:
- continue
- logger.info(f"Found new link: {normalized_link} (nv: {pages_visited})")
- self.seen_urls.add(normalized_link)
- level_links.append(normalized_link)
-
- except Exception as e:
- logger.info(f"Error crawling {current_url}: {e}")
- self._mark_visited(current_url) # Mark as visited to avoid retrying
-
- if not to_visit:
- to_visit.extend(level_links)
- level_links = []
- depth += 1
-
- logger.info(f"Crawling complete. Visited {pages_visited} pages.")
- return
-
- def _normalize_url(self, url: str) -> Optional[str]:
- if not url:
- return None
-
- normalized, _ = urldefrag(url)
- parsed = urlparse(normalized)
- if not parsed.scheme:
- return normalized
- return parsed._replace(
- scheme=parsed.scheme.lower(),
- netloc=parsed.netloc.lower(),
- ).geturl()
-
- def _mark_visited(self, url: str) -> None:
- normalized = self._normalize_url(url)
- if not normalized:
- return
- self.visited_urls.add(normalized)
- self.seen_urls.add(normalized)
-
- def get_links_with_same_hostname(self, url: str, page_data: ScrapedResource):
- """Return all links on the page that share the same hostname as `url`. For now does not support PDFs"""
-
- base_url = self._normalize_url(url) or url
- base_hostname = urlparse(base_url).netloc
- links = set()
- a_tags = []
-
- if (page_data.suffix == "html"):
- soup = BeautifulSoup(page_data.content, "html.parser")
- a_tags = soup.find_all("a", href=True)
-
- # how many links found on the first level
- for tag in a_tags:
- full = urljoin(base_url, tag["href"])
- normalized = self._normalize_url(full)
- if not normalized:
- continue
- if urlparse(normalized).netloc == base_hostname:
- links.add(normalized)
- return list(links)
diff --git a/src/data_manager/collectors/scrapers/scraper_manager.py b/src/data_manager/collectors/scrapers/scraper_manager.py
deleted file mode 100644
index 1904f7f11..000000000
--- a/src/data_manager/collectors/scrapers/scraper_manager.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import os
-import importlib
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
-
-from src.data_manager.collectors.persistence import PersistenceService
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource
-from src.data_manager.collectors.scrapers.scraper import LinkScraper
-from src.utils.config_access import get_global_config
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
- from src.data_manager.collectors.scrapers.integrations.git_scraper import \
- GitScraper
-
-
-class ScraperManager:
- """Coordinates scraper integrations and centralises persistence logic."""
-
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
- global_config = get_global_config()
-
- sources_config = (dm_config or {}).get("sources", {}) or {}
- links_config = sources_config.get("links", {}) if isinstance(sources_config, dict) else {}
- selenium_config = links_config.get("selenium_scraper", {}) if isinstance(sources_config, dict) else {}
-
- git_config = sources_config.get("git", {}) if isinstance(sources_config, dict) else {}
- sso_config = sources_config.get("sso", {}) if isinstance(sources_config, dict) else {}
- self.base_depth = links_config.get('base_source_depth', 5)
- logger.debug(f"Using base depth of {self.base_depth} for weblist URLs")
-
- scraper_config = {}
- if isinstance(links_config, dict):
- scraper_config = links_config.get("html_scraper", {}) or {}
- self.config = scraper_config
- raw_max_pages = links_config.get("max_pages")
- self.max_pages = None
- if raw_max_pages not in (None, ""):
- try:
- self.max_pages = int(raw_max_pages)
- except (TypeError, ValueError):
- logger.warning(f"Invalid max_pages value {raw_max_pages}; ignoring.")
-
- self.links_enabled = True
- self.git_enabled = git_config.get("enabled", False) if isinstance(git_config, dict) else True
- self.git_config = git_config if isinstance(git_config, dict) else {}
- self.selenium_config = selenium_config or {}
- self.selenium_enabled = self.selenium_config.get("enabled", False)
- self.scrape_with_selenium = self.selenium_config.get("use_for_scraping", False)
-
- self.sso_enabled = bool(sso_config.get("enabled", False))
-
- self.data_path = Path(global_config["DATA_PATH"])
- self.input_lists = links_config.get("input_lists", [])
- self.git_dir = self.data_path / "git"
-
- self.data_path.mkdir(parents=True, exist_ok=True)
-
- self.web_scraper = LinkScraper(
- verify_urls=self.config.get("verify_urls", False), # Default to False for broader compatibility
- enable_warnings=self.config.get("enable_warnings", False),
- )
- self._git_scraper: Optional["GitScraper"] = None
-
- def collect_all_from_config(
- self, persistence: PersistenceService
- ) -> None:
- """Run the configured scrapers and persist their output."""
- link_urls, git_urls, sso_urls = self._collect_urls_from_lists_by_type(self.input_lists)
-
- if git_urls:
- self.git_enabled = True
- if sso_urls:
- self.sso_enabled = True
- self._ensure_sso_defaults()
-
- self.collect_links(persistence, link_urls=link_urls)
- self.collect_sso(persistence, sso_urls=sso_urls)
- self.collect_git(persistence, git_urls=git_urls)
-
- logger.info("Web scraping was completed successfully")
-
- def collect_links(
- self,
- persistence: PersistenceService,
- link_urls: List[str] = [],
- max_depth: Optional[int] = None,
- ) -> int:
- """Collect only standard link sources. Returns count of resources scraped."""
- if not self.links_enabled:
- logger.info("Links disabled, skipping link scraping")
- return 0
- if not link_urls:
- return 0
- websites_dir = persistence.data_path / "websites"
- if not os.path.exists(websites_dir):
- os.makedirs(websites_dir, exist_ok=True)
- return self._collect_links_from_urls(link_urls, persistence, websites_dir, max_depth=max_depth)
-
- def collect_git(
- self,
- persistence: PersistenceService,
- git_urls: Optional[List[str]] = None,
- ) -> None:
- """Collect only git sources."""
- if not self.git_enabled:
- logger.info("Git disabled, skipping git scraping")
- return
- if not git_urls:
- return
- git_dir = persistence.data_path / "git"
- if not os.path.exists(git_dir):
- os.makedirs(git_dir, exist_ok=True)
- self._collect_git_resources(git_urls, persistence, git_dir)
-
- def collect_sso(
- self,
- persistence: PersistenceService,
- sso_urls: Optional[List[str]] = None,
- ) -> None:
- """Collect only SSO sources."""
- if not self.sso_enabled:
- logger.info("SSO disabled, skipping SSO scraping")
- return
- self._ensure_sso_defaults()
- if not sso_urls:
- return
- sso_dir = persistence.data_path / "sso"
- if not os.path.exists(sso_dir):
- os.makedirs(sso_dir, exist_ok=True)
- self._collect_sso_from_urls(sso_urls, persistence, sso_dir)
-
- def schedule_collect_links(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- """
- Scheduled collection of link sources.
- For now, this behaves the same as a full collection, overriding last_run depending on the persistence layer.
- """
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="web", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "").strip() for m in metadata]
- catalog_urls = [u for u in catalog_urls if u]
- logger.info("Scheduled links collection found %d URL(s) in catalog", len(catalog_urls))
- self.collect_links(persistence, link_urls=catalog_urls)
-
- def schedule_collect_git(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="git", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "") for m in metadata]
- self.collect_git(persistence, git_urls=catalog_urls)
-
- def schedule_collect_sso(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="sso", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "") for m in metadata]
- self.collect_sso(persistence, sso_urls=catalog_urls)
-
- def _collect_links_from_urls(
- self,
- urls: List[str],
- persistence: PersistenceService,
- output_dir: Path,
- max_depth: Optional[int] = None,
- ) -> int:
- """Collect links from URLs and return total count of resources scraped."""
- # Initialize authenticator if selenium is enabled
- authenticator = None
- if self.selenium_enabled:
- authenticator_class, kwargs = self._resolve_scraper()
- if authenticator_class is not None:
- authenticator = authenticator_class(**kwargs)
-
- total_count = 0
- try:
- for url in urls:
- # For standard link collection, don't use selenium for scraping
- # (SSO urls are handled separately via collect_sso)
- count = self._handle_standard_url(
- url,
- persistence,
- output_dir,
- max_depth=max_depth if max_depth is not None else self.base_depth,
- client=None,
- use_client_for_scraping=False
- )
- total_count += count
- finally:
- if authenticator is not None:
- authenticator.close() # Close the authenticator properly and free the resources
- return total_count
-
- def _collect_sso_from_urls(
- self,
- urls: List[str],
- persistence: PersistenceService,
- output_dir: Path,
- ) -> None:
- """Collect SSO-protected URLs using selenium for authentication."""
- if not self.selenium_enabled:
- logger.error("SSO scraping requires data_manager.sources.links.selenium_scraper.enabled")
- return
- if not read_secret("SSO_USERNAME") or not read_secret("SSO_PASSWORD"):
- logger.error("SSO scraping requires SSO_USERNAME and SSO_PASSWORD secrets")
- return
- authenticator = None
- if self.selenium_enabled:
- authenticator_class, kwargs = self._resolve_scraper()
- if authenticator_class is not None:
- authenticator = authenticator_class(**kwargs)
-
- if authenticator is None:
- logger.error("SSO collection requires a valid selenium scraper configuration")
- return
-
- try:
- for url in urls:
- # For SSO URLs, use selenium client for authentication
- # scrape_with_selenium determines if we use selenium for scraping too
- self._handle_standard_url(
- url,
- persistence,
- output_dir,
- max_depth=self.base_depth,
- client=authenticator,
- use_client_for_scraping=self.scrape_with_selenium
- )
- finally:
- if authenticator is not None:
- authenticator.close()
-
- def _ensure_sso_defaults(self) -> None:
- if not self.selenium_config:
- self.selenium_config = {}
-
- if not self.selenium_enabled:
- self.selenium_config["enabled"] = True
- self.selenium_enabled = True
-
- if not self.selenium_config.get("selenium_class"):
- self.selenium_config["selenium_class"] = "CERNSSOScraper"
-
- class_map = self.selenium_config.setdefault("selenium_class_map", {})
- if "CERNSSOScraper" not in class_map:
- class_map["CERNSSOScraper"] = {
- "class": "CERNSSOScraper",
- "kwargs": {
- "headless": True,
- "max_depth": 2,
- },
- }
-
- def _collect_urls_from_lists(self, input_lists) -> List[str]:
- """Collect URLs from the configured weblists."""
- # Handle case where input_lists might be None
- urls: List[str] = []
- if not input_lists:
- return urls
- for list_name in input_lists:
- list_path = Path("weblists") / Path(list_name).name
- if not list_path.exists():
- logger.warning(f"Input list {list_path} not found.")
- continue
-
- urls.extend(self._extract_urls_from_file(list_path))
-
- return urls
-
- def _collect_urls_from_lists_by_type(self, input_lists: List[str]) -> tuple[List[str], List[str], List[str]]:
- """All types of URLs are in the same input lists, separate them via prefixes"""
- link_urls: List[str] = []
- git_urls: List[str] = []
- sso_urls: List[str] = []
- for raw_url in self._collect_urls_from_lists(input_lists):
- if raw_url.startswith("git-"):
- git_urls.append(raw_url.split("git-", 1)[1])
- continue
- if raw_url.startswith("sso-"):
- sso_urls.append(raw_url.split("sso-", 1)[1])
- continue
- link_urls.append(raw_url)
- return link_urls, git_urls, sso_urls
- def _resolve_scraper(self):
- class_name = self.selenium_config.get("selenium_class")
- class_map = self.selenium_config.get("selenium_class_map", {})
- selenium_url = self.selenium_config.get("selenium_url",None)
-
- entry = class_map.get(class_name)
-
- if not entry:
- logger.error(f"Selenium class {class_name} is not defined in the configuration")
- return None, {}
-
- scraper_class = entry.get("class")
- if isinstance(scraper_class, str):
- module_name = entry.get(
- "module",
- "src.data_manager.collectors.scrapers.integrations.sso_scraper",
- )
- module = importlib.import_module(module_name)
- scraper_class = getattr(module, scraper_class)
- scraper_kwargs = entry.get("kwargs", {})
- scraper_kwargs["selenium_url"] = selenium_url
- return scraper_class, scraper_kwargs
-
-
- def _handle_standard_url(
- self,
- url: str,
- persistence: PersistenceService,
- output_dir: Path,
- max_depth: int,
- client=None,
- use_client_for_scraping: bool = False,
- ) -> int:
- """Scrape a URL and persist resources. Returns count of resources scraped."""
- count = 0
- try:
- for resource in self.web_scraper.crawl_iter(
- url,
- browserclient=client,
- max_depth=max_depth,
- selenium_scrape=use_client_for_scraping,
- max_pages=self.max_pages,
- ):
- persistence.persist_resource(
- resource, output_dir
- )
- count += 1
- logger.info(f"Scraped {count} resources from {url}")
- except Exception as exc:
- logger.error(f"Failed to scrape {url}: {exc}", exc_info=exc)
- return count
-
- def _extract_urls_from_file(self, path: Path) -> List[str]:
- """Extract URLs from file, ignoring depth specifications for now."""
- urls: List[str] = []
- with path.open("r") as file:
- for line in file:
- stripped = line.strip()
- if not stripped or stripped.startswith("#"):
- continue
- # Extract just the URL part, ignoring depth specification if present
- url_depth = stripped.split(",")
- url = url_depth[0].strip()
- urls.append(url)
- return urls
-
- def _collect_git_resources(
- self,
- git_urls: List[str],
- persistence: PersistenceService,
- git_dir: Path,
- ) -> List[ScrapedResource]:
- git_scraper = self._get_git_scraper()
- resources = git_scraper.collect(git_urls)
- for resource in resources:
- persistence.persist_resource(resource, git_dir)
- return resources
-
- def _get_git_scraper(self) -> "GitScraper":
- if self._git_scraper is None:
- from src.data_manager.collectors.scrapers.integrations.git_scraper import \
- GitScraper
-
- self._git_scraper = GitScraper(manager=self, git_config=self.git_config)
- return self._git_scraper
diff --git a/src/interfaces/uploader_app/app.py b/src/interfaces/uploader_app/app.py
index d340978fa..1d4c9aa3e 100644
--- a/src/interfaces/uploader_app/app.py
+++ b/src/interfaces/uploader_app/app.py
@@ -281,13 +281,12 @@ def upload_url(self):
return jsonify({"error": "invalid_depth"}), 400
if depth < 0:
return jsonify({"error": "invalid_depth"}), 400
- # LinkScraper currently uses max_depth >= 1 for the initial URL fetch.
if depth == 0:
depth = 1
if url:
logger.info("Uploading the following URL: %s", url)
try:
- scraped_count = self.scraper_manager.collect_links(self.persistence, link_urls=[url], max_depth=depth)
+ scraped_count = self.scraper_manager.collect_links(self.persistence, link_urls=[url])
self.persistence.flush_index()
self._update_source_status("web", state="idle", last_run=self._now_iso())
added_to_urls = True
From b62c273f24a78bfa62e43e31b3040ac82262e12a Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 06:06:03 +0200
Subject: [PATCH 20/55] introduce generic link item parser, remove toscrape
 example.
---
.../collectors/scrapers/parsers/link.py | 72 +++++++++++++++++++
.../collectors/scrapers/parsers/toscrape.py | 29 --------
.../collectors/scrapers/spiders/link.py | 13 +++-
.../collectors/scrapers/spiders/toscrape.py | 28 --------
4 files changed, 82 insertions(+), 60 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/parsers/link.py
delete mode 100644 src/data_manager/collectors/scrapers/parsers/toscrape.py
delete mode 100644 src/data_manager/collectors/scrapers/spiders/toscrape.py
diff --git a/src/data_manager/collectors/scrapers/parsers/link.py b/src/data_manager/collectors/scrapers/parsers/link.py
new file mode 100644
index 000000000..c0b9b6e49
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/parsers/link.py
@@ -0,0 +1,72 @@
+from typing import Iterator
+from scrapy.http import Response, TextResponse
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.utils import get_content_type
+# Tried in order — first non-empty match wins.
+# Covers: HTML5 semantic, ARIA landmark, common CMS patterns, final fallback.
+_CONTENT_SELECTORS = [
+ "main",
+ "article",
+ '[role="main"]',
+ "#content",
+ "#main",
+ "#main-content",
+ ".main-content", # MIT.edu Drupal wrapper
+ ".region-content", # Drupal generic region
+ ".content",
+ ".post-content",
+ ".entry-content",
+ "body",
+]
+def parse_link_page(response: Response) -> Iterator[WebPageItem]:
+ """
+ Generic page parser — works for any HTML page with no site-specific selectors.
+ Strategy:
+ - PDFs: return raw bytes, suffix="pdf".
+ - HTML: extract visible text from the first matching content container,
+      falling back through _CONTENT_SELECTORS to the <body> tag.
+ Full raw HTML is never stored — only visible text reaches the item.
+ Suitable as the default parse_item for LinkSpider subclasses that have
+ no meaningful site-specific structure to exploit.
+ """
+ ct = get_content_type(response)
+ # ── PDF ──────────────────────────────────────────────────────────────────
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield WebPageItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ source_type="web",
+ title="",
+ content_type=ct,
+ )
+ return
+ # ── HTML ─────────────────────────────────────────────────────────────────
+ title = (
+ response.css("h1::text").get()
+ or response.css("title::text").get()
+ or ""
+ ).strip()
+ body_text = _extract_main_text(response)
+ encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
+ if not body_text:
+ return # empty page — don't yield a blank item
+ yield WebPageItem(
+ url=response.url,
+ content=body_text,
+ suffix="html",
+ source_type="web",
+ title=title,
+ content_type=ct,
+ encoding=encoding,
+ )
+def _extract_main_text(response: Response) -> str:
+ """
+ Try content selectors in priority order.
+ Returns the first non-empty joined text, or empty string.
+ """
+ for selector in _CONTENT_SELECTORS:
+ text = " ".join(response.css(f"{selector} *::text").getall()).strip()
+ if text:
+ return text
+ return ""
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/parsers/toscrape.py b/src/data_manager/collectors/scrapers/parsers/toscrape.py
deleted file mode 100644
index 92a045933..000000000
--- a/src/data_manager/collectors/scrapers/parsers/toscrape.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from typing import Iterator
-from scrapy.http import Response, TextResponse
-from src.data_manager.collectors.scrapers.items import WebPageItem
-from src.data_manager.collectors.scrapers.utils import get_content_type
-
-def parse_toscrape_page(response: Response) -> Iterator[WebPageItem]:
- ct = get_content_type(response)
-
- if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
- yield WebPageItem(
- url=response.url,
- content=response.body,
- suffix="pdf",
- title="",
- content_type=ct,
- )
- return
-
- title = response.css("title::text").get(default="").strip()
- encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
-
- yield WebPageItem(
- url=response.url,
- content=response.text,
- suffix="html",
- title=title,
- content_type=ct,
- encoding=encoding,
- )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index c93543a9f..7e423a0fc 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -6,6 +6,7 @@
from scrapy.link import Link
from src.data_manager.collectors.scrapers.utils import _IMAGE_EXTS
from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.parsers.link import parse_link_page
class LinkSpider(Spider):
"""
@@ -15,6 +16,8 @@ class LinkSpider(Spider):
name = "link"
+ _DEFAULT_START_URLS = ["https://quotes.toscrape.com/"]
+
custom_settings = {
"DEPTH_LIMIT": 1, # Default max depth
"DOWNLOAD_DELAY": 2, # Default (download) delay
@@ -64,8 +67,12 @@ async def start(self):
def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
Extract one item per response, then yield follow Requests up to max_depth.
+ @url https://quotes.toscrape.com/
+ @returns items 1
+ @returns requests 1
+ @scrapes url title
"""
- yield from self.parse_item(response) # Yield Items
+ yield from self.parse_item(response) # Yield Item
yield from self.follow_links(response) # Yield Requests
@@ -88,9 +95,9 @@ def errback(self, failure):
# ------------------------------------------------------------------ #
# Extension points — pure, unit-testable/checkable without a reactor
# ------------------------------------------------------------------ #
-
+
def parse_item(self, response: Response) -> Iterator[WebPageItem]:
- raise NotImplementedError("parse_item must be implemented by the subclass")
+ yield from parse_link_page(response)
def parse_follow_links(self, response: Response) -> Iterator[Link]:
yield from self._le.extract_links(response)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/toscrape.py b/src/data_manager/collectors/scrapers/spiders/toscrape.py
deleted file mode 100644
index 67af9e815..000000000
--- a/src/data_manager/collectors/scrapers/spiders/toscrape.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import Iterator
-from scrapy import Request
-from scrapy.http import Response
-from src.data_manager.collectors.scrapers.items import WebPageItem
-from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
-from src.data_manager.collectors.scrapers.parsers.toscrape import parse_toscrape_page
-from scrapy.link import Link
-
-class ToscrapeSpider(LinkSpider):
- """
- Spider for scraping HTML pages from toscrape.com.
- """
-
- name = "toscrape"
-
- _DEFAULT_START_URLS = ["https://quotes.toscrape.com/"]
-
- def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
- """
- @url https://quotes.toscrape.com/
- @returns items 1
- @returns requests 1
- @scrapes url title
- """
- yield from super().parse(response)
-
- def parse_item(self, response: Response) -> Iterator[WebPageItem]:
- yield from parse_toscrape_page(response)
From 08d1ddccf99615a0ccb91721e63cb89936b22910 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 06:17:24 +0200
Subject: [PATCH 21/55] base LinkSpider support allowed_domains.
---
src/data_manager/collectors/scrapers/spiders/link.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 7e423a0fc..8ddf79caf 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -11,7 +11,7 @@
class LinkSpider(Spider):
"""
Generic link-following spider for unauthenticated pages.
- Stays within the same hostname as start_url, up to max_depth.
+ Stays within the hostnames of all start_urls, up to max_depth.
"""
name = "link"
@@ -40,13 +40,17 @@ def from_crawler(cls, crawler, *args, **kwargs):
def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, process_value: Callable[[str], str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._start_urls = start_urls or getattr(self, "_DEFAULT_START_URLS", [])
- self._base_host = urlparse(self._start_urls[0]).netloc if self._start_urls else None
+ self._allowed_domains: set[str] = {
+ urlparse(u).netloc
+ for u in self._start_urls
+ if urlparse(u).netloc
+ }
default_deny = getattr(self, "_DEFAULT_DENY", [])
default_process_value = getattr(self, "_DEFAULT_PROCESS_VALUE", None)
self._le = LinkExtractor(
allow=allow or [],
deny=(deny or []) + default_deny,
- allow_domains=[self._base_host] if self._base_host else [],
+ allow_domains=list(self._allowed_domains),
deny_extensions=list(_IMAGE_EXTS),
canonicalize=canonicalize,
process_value=process_value or default_process_value,
From 355c128845059d28530a0555bc2fa82720e45507 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 08:19:43 +0200
Subject: [PATCH 22/55] dynamic scraper source loader, shared
PersistencePipeline among scrapers, wired to legacy interface.
---
src/bin/service_data_manager.py | 3 +-
.../collectors/scraper_manager.py | 226 ++++++------------
.../collectors/scrapers/pipelines.py | 148 +++++++++++-
.../collectors/scrapers/settings.py | 4 +-
src/data_manager/data_manager.py | 4 +-
5 files changed, 215 insertions(+), 170 deletions(-)
diff --git a/src/bin/service_data_manager.py b/src/bin/service_data_manager.py
index e102d9080..b1236ead1 100644
--- a/src/bin/service_data_manager.py
+++ b/src/bin/service_data_manager.py
@@ -74,9 +74,8 @@ def trigger_update() -> None:
schedule_map: Dict[str, Callable[[Optional[str]], None]] = {
"local_files": lambda last_run=None: data_manager.localfile_manager.schedule_collect_local_files(data_manager.persistence, last_run=last_run),
- "links": lambda last_run=None: data_manager.scraper_manager.schedule_collect_links(data_manager.persistence, last_run=last_run),
+ "web": lambda last_run=None: data_manager.scraper_manager.schedule_collect(last_run=last_run),
"git": lambda last_run=None: data_manager.git_manager.schedule_collect_git(data_manager.persistence, last_run=last_run),
- "sso": lambda last_run=None: data_manager.scraper_manager.schedule_collect_sso(data_manager.persistence, last_run=last_run),
"jira": lambda last_run=None: data_manager.ticket_manager.schedule_collect_jira(data_manager.persistence, last_run=last_run),
"redmine": lambda last_run=None: data_manager.ticket_manager.schedule_collect_redmine(data_manager.persistence, last_run=last_run),
}
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index ee59404b4..eb03b0730 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -1,29 +1,22 @@
from __future__ import annotations
from pathlib import Path
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Callable
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy.utils.project import get_project_settings
-
+from scrapy.spiderloader import SpiderLoader
+from scrapy.settings import Settings
+from scrapy import Spider
from src.data_manager.collectors.persistence import PersistenceService
from src.utils.config_access import get_global_config
from src.utils.logging import get_logger
logger = get_logger(__name__)
-# Spider registry — add new spider classes here, nothing else changes
-_SPIDER_REGISTRY: Dict[str, str] = {
- "link": "src.data_manager.collectors.scrapers.spiders.link.LinkSpider",
- "twiki": "src.data_manager.collectors.scrapers.spiders.twiki.TwikiSpider",
-}
-
-
-def _import_spider(dotted_path: str):
- module_path, cls_name = dotted_path.rsplit(".", 1)
- import importlib
- return getattr(importlib.import_module(module_path), cls_name)
-
+def _make_spider_loader(settings: Settings) -> Callable[[str], type[Spider]]:
+ """Bind settings once, return a name → SpiderClass callable."""
+ return SpiderLoader.from_settings(settings).load
class ScraperManager:
"""
@@ -34,180 +27,95 @@ class ScraperManager:
SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
"""
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None) -> None:
global_config = get_global_config()
self.data_path = Path(global_config["DATA_PATH"])
+ self.persistence = persistence
sources_config = (dm_config or {}).get("sources", {}) or {}
- links_config = sources_config.get("links", {}) if isinstance(sources_config, dict) else {}
- self.config = links_config if isinstance(links_config, dict) else {}
+ self.config = sources_config if isinstance(sources_config, dict) else {}
self.enabled = self.config.get("enabled", True)
- self.input_lists: List[str] = self.config.get("input_lists", [])
-
- # Per-spider kwargs forwarded from config
- self.max_depth: Optional[int] = self.config.get("max_depth")
- self.max_pages: Optional[int] = self.config.get("max_pages")
- self.delay: Optional[int] = self.config.get("download_delay")
# ── Public interface ──────────────────────────────────────────────────────
- def collect_all_from_config(self, persistence: PersistenceService) -> None:
- if not self.enabled:
- logger.info("Web scraping disabled; skipping")
- return
+ def collect_all_from_config(self) -> None:
+ self._run(self._config_urls)
- link_urls, sso_urls = self._collect_urls_from_lists_by_type(self.input_lists)
- self._run_crawl(persistence, link_urls=link_urls, sso_urls=sso_urls)
+ def schedule_collect(self, last_run: Optional[str] = None) -> None:
+ self._run(self._catalog_urls)
+
+ def collect(self, spider_key: str, urls: List[str]) -> None:
+ self._run(lambda key, cfg: urls if key == spider_key else [])
- def collect_links(
- self,
- persistence: PersistenceService,
- link_urls: Optional[List[str]] = None,
- ) -> None:
- if not link_urls:
- return
- self._run_crawl(persistence, link_urls=link_urls)
-
- def collect_sso(
- self,
- persistence: PersistenceService,
- sso_urls: Optional[List[str]] = None,
- ) -> None:
- if not sso_urls:
+ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
+ if not self.enabled:
+ logger.info("Web scraping disabled; skipping")
return
- self._run_crawl(persistence, sso_urls=sso_urls)
+ settings = get_project_settings()
+ process = CrawlerProcess(settings)
+ load_spider = _make_spider_loader(settings)
+ (self.data_path / "websites").mkdir(parents=True, exist_ok=True)
- def schedule_collect_links(
- self, persistence: PersistenceService, last_run: Optional[str] = None
- ) -> None:
- metadata = persistence.catalog.get_metadata_by_filter(
- "source_type", source_type="web", metadata_keys=["url"]
- )
- urls = [m[1].get("url", "").strip() for m in metadata if m[1].get("url")]
- self.collect_links(persistence, link_urls=urls)
+ for spider_key, cfg in self.config.items():
+ if not isinstance(cfg, dict):
+ continue
+ try:
+ SpiderClass = load_spider(spider_key)
+ except KeyError:
+ continue
+ urls = url_fn(spider_key, cfg)
+ if urls:
+ self._add_crawler(process, SpiderClass, urls, cfg)
- def schedule_collect_sso(
- self, persistence: PersistenceService, last_run: Optional[str] = None
- ) -> None:
- metadata = persistence.catalog.get_metadata_by_filter(
- "source_type", source_type="sso", metadata_keys=["url"]
- )
- urls = [m[1].get("url", "").strip() for m in metadata if m[1].get("url")]
- self.collect_sso(persistence, sso_urls=urls)
+ if process._crawlers:
+ process.start()
# ── CrawlerProcess wiring ─────────────────────────────────────────────────
- def _run_crawl(
- self,
- persistence: PersistenceService,
- link_urls: Optional[List[str]] = None,
- sso_urls: Optional[List[str]] = None,
- ) -> None:
- """Build one CrawlerProcess, add all spiders, start the reactor."""
- websites_dir = self.data_path / "websites"
- websites_dir.mkdir(parents=True, exist_ok=True)
-
- scrapy_settings = get_project_settings()
- process = CrawlerProcess(scrapy_settings)
-
- if link_urls:
- self._add_crawler(
- process,
- spider_key="link",
- persistence=persistence,
- output_dir=websites_dir,
- start_urls=link_urls,
- )
-
- if sso_urls:
- self._add_crawler(
- process,
- spider_key="link",
- persistence=persistence,
- output_dir=websites_dir / "sso",
- start_urls=sso_urls,
- auth_provider_name="cern_sso",
- )
-
- if not process._crawlers:
- logger.info("No URLs to crawl; skipping reactor start")
- return
-
- logger.info("Starting CrawlerProcess with %d spider(s)", len(process._crawlers))
- process.start() # blocks until all spiders finish
- logger.info("CrawlerProcess finished")
-
def _add_crawler(
self,
process: CrawlerProcess,
- spider_key: str,
- persistence: PersistenceService,
- output_dir: Path,
- **spider_kwargs,
+ spider_class: type[Spider],
+ urls: List[str],
+ cfg: Optional[Dict[str, Any]] = None,
) -> None:
"""
Create a Crawler for spider_key, inject PersistencePipeline settings,
and register it with the process.
"""
- SpiderClass = _import_spider(_SPIDER_REGISTRY[spider_key])
- crawler: Crawler = process.create_crawler(SpiderClass)
-
+ cfg = cfg or {}
+ crawler: Crawler = process.create_crawler(spider_class)
# Inject persistence objects — live Python instances, must be priority="spider"
- crawler.settings.set("PERSISTENCE_SERVICE", persistence, priority="spider")
- crawler.settings.set("PERSISTENCE_OUTPUT_DIR", output_dir, priority="spider")
- crawler.settings.set(
- "ITEM_PIPELINES",
- {"src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300},
- priority="spider",
- )
-
- # Forward crawl tuning args if configured
- if self.max_depth is not None:
- spider_kwargs.setdefault("max_depth", self.max_depth)
- if self.max_pages is not None:
- spider_kwargs.setdefault("max_pages", self.max_pages)
- if self.delay is not None:
- spider_kwargs.setdefault("delay", self.delay)
-
- process.crawl(crawler, **spider_kwargs)
-
- # ── URL list parsing ──────────────────────────────────────────────────────
-
- def _collect_urls_from_lists_by_type(
- self, input_lists: List[str]
- ) -> tuple[List[str], List[str]]:
- """
- Parse weblists and split by prefix.
- sso- prefix → SSO-protected URLs (AuthDownloaderMiddleware handles auth)
- no prefix → standard link URLs
- git- prefix → ignored here (GitManager's responsibility)
- """
- link_urls: List[str] = []
- sso_urls: List[str] = []
-
- for raw_url in self._collect_urls_from_lists(input_lists):
- if raw_url.startswith("git-"):
- continue # GitManager owns these
- if raw_url.startswith("sso-"):
- sso_urls.append(raw_url.split("sso-", 1)[1])
- else:
- link_urls.append(raw_url)
-
- return link_urls, sso_urls
-
- def _collect_urls_from_lists(self, input_lists: List[str]) -> List[str]:
- urls: List[str] = []
- if not input_lists:
- return urls
- for list_name in input_lists:
- list_path = Path("weblists") / Path(list_name).name
- if not list_path.exists():
- logger.warning("Input list not found: %s", list_path)
+ crawler.settings.set("PERSISTENCE_SERVICE", self.persistence, priority="spider")
+ crawler.settings.set("PERSISTENCE_OUTPUT_DIR", self.data_path / "websites", priority="spider")
+ process.crawl(crawler, start_urls=urls, **cfg)
+
+ # ── URL sources & list parsing ──────────────────────────────────────────────────────
+
+ def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
+ urls = list(cfg.get("urls") or [])
+ for list_path in cfg.get("input_lists") or []:
+ path = Path(list_path)
+ if not path.exists():
+ logger.warning("Input list not found: %s", path)
continue
- urls.extend(self._extract_urls_from_file(list_path))
+ urls.extend(self._extract_urls_from_file(path))
return urls
+ def _catalog_urls(self, spider_key: str, cfg: Dict) -> List[str]:
+ if not self.persistence:
+ return []
+
+ metadata = self.persistence.catalog.get_metadata_by_filter(
+ "source_type", source_type="web", metadata_keys=["url", "spider_name"]
+ )
+ return [
+ m[1].get("url", "").strip()
+ for m in metadata
+ if m[1].get("spider_name", "link") == spider_key and m[1].get("url")
+ ]
+
def _extract_urls_from_file(self, path: Path) -> List[str]:
urls: List[str] = []
with path.open("r") as f:
diff --git a/src/data_manager/collectors/scrapers/pipelines.py b/src/data_manager/collectors/scrapers/pipelines.py
index 1c1365794..f8fa138eb 100644
--- a/src/data_manager/collectors/scrapers/pipelines.py
+++ b/src/data_manager/collectors/scrapers/pipelines.py
@@ -1,8 +1,144 @@
+"""
+Persistence pipeline: converts Scrapy Items → ScrapedResource → PersistenceService.
+
+Design notes
+------------
+* Follows Scrapy's canonical ``from_crawler`` injection pattern.
+ The ``PersistenceService`` instance and output directory are set on
+ ``crawler.settings`` *programmatically* by ``ScraperManager`` before the
+ crawl starts — they are live Python objects, not serialised config values,
+ so they must never appear in settings.py or YAML.
+
+* SRP boundary: this pipeline does *two* things (adapt + persist). That is
+ intentional and acceptable because the two operations are trivially coupled
+ here (no branching logic in either). If adapter logic grows, extract it to
+ ``adapters/resource_adapter.py`` and import here.
+
+* The pipeline never raises — it logs and skips persistence on error so a single
+ bad page does not kill the crawl (OR-5 / FR-7b).
+
+Settings keys consumed
+----------------------
+PERSISTENCE_SERVICE : PersistenceService instance (required)
+PERSISTENCE_OUTPUT_DIR: pathlib.Path — where files are written (required)
+"""
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from scrapy import Spider
+from scrapy.exceptions import NotConfigured
+
from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
-class AdapterPipeline:
- def process_item(self, item, spider):
- resource = to_scraped_resource(item)
- # Implicitly, set site for every pair of spider/resource.
- resource.metadata["site"] = spider.name
- return item
+if TYPE_CHECKING:
+ from scrapy import Crawler
+ from src.data_manager.collectors.persistence import PersistenceService
+
+logger = logging.getLogger(__name__)
+
+SETTING_SERVICE = "PERSISTENCE_SERVICE"
+SETTING_OUTPUT_DIR = "PERSISTENCE_OUTPUT_DIR"
+
+class PersistencePipeline:
+ """
+ Scrapy item pipeline that persists scraped items via ``PersistenceService``.
+
+ Activation (in ScraperManager, before CrawlerProcess/Runner starts)::
+
+ crawler.settings.set(
+ "PERSISTENCE_SERVICE", persistence_service_instance, priority="spider"
+ )
+ crawler.settings.set(
+ "PERSISTENCE_OUTPUT_DIR", Path("/root/data/websites"), priority="spider"
+ )
+ crawler.settings.set(
+ "ITEM_PIPELINES",
+ {"src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300},
+ priority="spider",
+ )
+ """
+
+ def __init__(self, persistence: "PersistenceService", output_dir: Path) -> None:
+ self._persistence = persistence
+ self._output_dir = output_dir
+ self._success_count = 0
+ self._error_count = 0
+
+ # ------------------------------------------------------------------
+ # Scrapy lifecycle
+ # ------------------------------------------------------------------
+
+ @classmethod
+ def from_crawler(cls, crawler: "Crawler") -> "PersistencePipeline":
+ """Canonical Scrapy injection point — pulls service from settings."""
+ persistence = crawler.settings.get(SETTING_SERVICE)
+ output_dir = crawler.settings.get(SETTING_OUTPUT_DIR)
+
+ if persistence is None:
+ raise NotConfigured(
+ f"PersistencePipeline requires '{SETTING_SERVICE}' in crawler settings. "
+ "Set it programmatically in ScraperManager before starting the crawl."
+ )
+ if output_dir is None:
+ raise NotConfigured(
+ f"PersistencePipeline requires '{SETTING_OUTPUT_DIR}' in crawler settings."
+ )
+
+ instance = cls(persistence=persistence, output_dir=Path(output_dir))
+ return instance
+
+ def open_spider(self, spider: Spider) -> None:
+ self._output_dir.mkdir(parents=True, exist_ok=True)
+ logger.info(
+ "PersistencePipeline opened | output_dir=%s", self._output_dir
+ )
+
+ def close_spider(self, spider: Spider) -> None:
+ # Summary logged via spider_closed signal too, but belt-and-suspenders here.
+ logger.info(
+ "PersistencePipeline | spider=%s persisted=%d errors=%d",
+ spider.name,
+ self._success_count,
+ self._error_count,
+ )
+
+ def process_item(self, item, spider: Spider):
+ """
+ Convert item → ScrapedResource → persist.
+
+ Never raises; errors are logged and the failing item is not persisted.
+ Returning the item allows other downstream pipelines to receive it.
+ """
+ try:
+ resource = to_scraped_resource(item)
+ resource.source_type = "web"
+ resource.metadata["spider_name"] = spider.name
+ except Exception as exc:
+ self._error_count += 1
+ logger.warning(
+ "Adapter failed for item from %s: %s | item=%r",
+ spider.name,
+ exc,
+ dict(item),
+ exc_info=False, # keep log concise; set True for debug
+ )
+ return item # drop from persistence but don't crash
+
+ try:
+ file_path = self._persistence.persist_resource(resource, self._output_dir)
+ self._success_count += 1
+ logger.debug(
+ "Persisted %s → %s", resource.get_hash(), file_path
+ )
+ except Exception as exc:
+ self._error_count += 1
+ logger.error(
+ "PersistenceService.persist_resource failed for %s: %s",
+ getattr(resource, "url", "unknown"),
+ exc,
+ exc_info=True,
+ )
+
+ return item
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index 546b09f53..1bd678547 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -92,7 +92,9 @@
},
}
-ITEM_PIPELINES = { }
+ITEM_PIPELINES = {
+ "src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300,
+}
EXTENSIONS = {
"scrapy.extensions.closespider.CloseSpider": 500,
diff --git a/src/data_manager/data_manager.py b/src/data_manager/data_manager.py
index 8dbf22a78..4c987eec2 100644
--- a/src/data_manager/data_manager.py
+++ b/src/data_manager/data_manager.py
@@ -38,7 +38,7 @@ def __init__(self, *, run_ingestion: bool = True, factory=None):
self.localfile_manager = LocalFileManager(dm_config=self.config["data_manager"])
self.git_manager = GitManager(dm_config=self.config["data_manager"])
- self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"])
+ self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"], persistence=self.persistence)
self.ticket_manager = TicketManager(dm_config=self.config["data_manager"])
self.vector_manager = VectorStoreManager(
@@ -64,7 +64,7 @@ def run_ingestion(self, progress_callback: Optional[Callable[[str], None]] = Non
source_aggregation = [
("Copying configured local files", lambda: self.localfile_manager.collect_all_from_config(self.persistence)),
("Collecting git repos", lambda: self.git_manager.collect_all_from_config(self.persistence)),
- ("Scraping web sources onto filesystem", lambda: self.scraper_manager.collect_all_from_config(self.persistence)),
+ ("Scraping web sources onto filesystem", lambda: self.scraper_manager.collect_all_from_config()),
("Fetching ticket data onto filesystem", lambda: self.ticket_manager.collect_all_from_config(self.persistence)),
]
From 6da57f8e1323113bf5cc75460d6019eedfbed84b Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 08:29:54 +0200
Subject: [PATCH 23/55] revert to proper default w/o sso-enforced.
---
src/data_manager/collectors/scrapers/spiders/twiki.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index 7c8b569d8..b3527ec7a 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -14,11 +14,8 @@ class TwikiSpider(LinkSpider):
name = "twiki"
- auth_provider_name = "cern_sso"
-
_DEFAULT_START_URLS = [
- "https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons", # private page
- "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide", # public page
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide", # public page
]
_DEFAULT_DENY = [
From 38ca4526e05029de499ff06059420792831a3053 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 08:49:04 +0200
Subject: [PATCH 24/55] scrapers params will be under source.web, fix bugs.
---
.../collectors/scraper_manager.py | 19 +++++++++++++++----
src/interfaces/uploader_app/app.py | 4 ++--
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index eb03b0730..99a508d5e 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -34,7 +34,7 @@ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: Pers
sources_config = (dm_config or {}).get("sources", {}) or {}
- self.config = sources_config if isinstance(sources_config, dict) else {}
+ self.config = sources_config.get("web", {}) if isinstance(sources_config, dict) else {}
self.enabled = self.config.get("enabled", True)
# ── Public interface ──────────────────────────────────────────────────────
@@ -46,7 +46,17 @@ def schedule_collect(self, last_run: Optional[str] = None) -> None:
self._run(self._catalog_urls)
def collect(self, spider_key: str, urls: List[str]) -> None:
- self._run(lambda key, cfg: urls if key == spider_key else [])
+ settings = get_project_settings()
+ process = CrawlerProcess(settings)
+ try:
+ SpiderClass = _make_spider_loader(settings)(spider_key)
+ except KeyError:
+ logger.error("Unknown spider: %s", spider_key)
+ return
+ cfg = self.config.get(spider_key, {}) # use config settings if present, else defaults
+ if urls:
+ self._add_crawler(process, SpiderClass, urls, cfg)
+ process.start()
def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
if not self.enabled:
@@ -57,6 +67,7 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
load_spider = _make_spider_loader(settings)
(self.data_path / "websites").mkdir(parents=True, exist_ok=True)
+ added = False
for spider_key, cfg in self.config.items():
if not isinstance(cfg, dict):
continue
@@ -67,8 +78,8 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
urls = url_fn(spider_key, cfg)
if urls:
self._add_crawler(process, SpiderClass, urls, cfg)
-
- if process._crawlers:
+ added = True
+ if added:
process.start()
# ── CrawlerProcess wiring ─────────────────────────────────────────────────
diff --git a/src/interfaces/uploader_app/app.py b/src/interfaces/uploader_app/app.py
index 1d4c9aa3e..05c9fc77f 100644
--- a/src/interfaces/uploader_app/app.py
+++ b/src/interfaces/uploader_app/app.py
@@ -78,7 +78,7 @@ def __init__(
if not self.salt:
logger.warning("UPLOADER_SALT not set; account checks may fail.")
- self.scraper_manager = ScraperManager(dm_config=self.config.get("data_manager"))
+ self.scraper_manager = ScraperManager(dm_config=self.config.get("data_manager"), persistence=self.persistence)
self.git_manager = GitManager(dm_config=self.config.get("data_manager"))
self.ticket_manager = TicketManager(dm_config=self.config.get("data_manager"))
self.localfile_manager = LocalFileManager(dm_config=self.config.get("data_manager"))
@@ -286,7 +286,7 @@ def upload_url(self):
if url:
logger.info("Uploading the following URL: %s", url)
try:
- scraped_count = self.scraper_manager.collect_links(self.persistence, link_urls=[url])
+ scraped_count = self.scraper_manager.collect("link", [url])
self.persistence.flush_index()
self._update_source_status("web", state="idle", last_run=self._now_iso())
added_to_urls = True
From 9903f21b46642012d4d0b8add00ca9fead4e8f38 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 09:03:52 +0200
Subject: [PATCH 25/55] migrate to new interface.
---
src/cli/templates/base-config.yaml | 66 ++++++++++++++++--------------
1 file changed, 36 insertions(+), 30 deletions(-)
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index ef6d21fee..5520ffc6d 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -188,40 +188,46 @@ data_manager:
{%- for path in paths %}
- {{ path }}
{%- endfor %}
- links:
- base_source_depth: {{ data_manager.sources.links.base_source_depth | default(1, true) }}
- max_pages: {{ data_manager.sources.links.max_pages | default(null, true) }}
- enabled: {{ data_manager.sources.links.enabled | default(true, true) }}
- visible: {{ data_manager.sources.links.visible | default(true, true) }}
- schedule: '{{ data_manager.sources.links.schedule | default("", true) }}'
- input_lists:
- {%- set link_lists = data_manager.sources.links.input_lists | default([], true) %}
- {%- for input_list in link_lists %}
- - {{ input_list }}
- {%- endfor %}
- html_scraper:
- reset_data: {{ data_manager.sources.links.html_scraper.reset_data | default(true, true) }}
- verify_urls: {{ data_manager.sources.links.html_scraper.verify_urls | default(false, true) }}
- enable_warnings: {{ data_manager.sources.links.html_scraper.enable_warnings | default(false, true) }}
- selenium_scraper:
- enabled: {{ data_manager.sources.links.selenium_scraper.selenium_scraper.enabled | default(false, True) }}
- visible: {{ data_manager.sources.links.selenium_scraper.selenium_scraper.visible | default(false, true) }}
- use_for_scraping: {{ data_manager.sources.links.selenium_scraper.use_for_scraping | default(false, true) }}
- selenium_class: {{ data_manager.sources.links.selenium_scraper.selenium_class | default('CERNSSOScraper', true) }}
- selenium_url: {{ data_manager.sources.links.selenium_scraper.selenium_url | default('null', true) }}
- selenium_class_map:
- CERNSSOScraper:
- class: {{ data_manager.sources.links.selenium_scraper.selenium_class_map.CERNSSOScraper.class | default('CERNSSOScraper', true) }}
- kwargs:
- headless: {{ data_manager.sources.links.selenium_scraper.selenium_class_map.CERNSSOScraper.kwargs.headless | default(true, true) }}
+ web:
+ visible: {{ data_manager.sources.web.visible | default(true, true) }}
+ link:
+ enabled: {{ data_manager.sources.web.link.enabled | default(true, true) }}
+ auth_provider_name: {{ data_manager.sources.web.link.auth_provider_name | default("", true) }}
+ schedule: '{{ data_manager.sources.web.link.schedule | default("", true) }}'
+ max_depth: {{ data_manager.sources.web.link.max_depth | default(3, true) }}
+ max_pages: {{ data_manager.sources.web.link.max_pages | default(null, true) }}
+ delay: {{ data_manager.sources.web.link.delay | default(1, true) }}
+ allow: {{ data_manager.sources.web.link.allow | default([], true) | tojson }}
+ deny: {{ data_manager.sources.web.link.deny | default([], true) | tojson }}
+ input_lists:
+ {%- for l in data_manager.sources.web.link.input_lists | default([], true) %}
+ - {{ l }}
+ {%- endfor %}
+ urls:
+ {%- for u in data_manager.sources.web.link.urls | default([], true) %}
+ - {{ u }}
+ {%- endfor %}
+ twiki:
+ enabled: {{ data_manager.sources.web.twiki.enabled | default(false, true) }}
+ auth_provider_name: {{ data_manager.sources.web.twiki.auth_provider_name | default("", true) }}
+ schedule: '{{ data_manager.sources.web.twiki.schedule | default("", true) }}'
+ max_depth: {{ data_manager.sources.web.twiki.max_depth | default(2, true) }}
+ max_pages: {{ data_manager.sources.web.twiki.max_pages | default(100, true) }}
+ delay: {{ data_manager.sources.web.twiki.delay | default(60, true) }}
+ allow: {{ data_manager.sources.web.twiki.allow | default([], true) | tojson }}
+ deny: {{ data_manager.sources.web.twiki.deny | default([], true) | tojson }}
+ input_lists:
+ {%- for l in data_manager.sources.web.twiki.input_lists | default([], true) %}
+ - {{ l }}
+ {%- endfor %}
+ urls:
+ {%- for u in data_manager.sources.web.twiki.urls | default([], true) %}
+ - {{ u }}
+ {%- endfor %}
git:
enabled: {{ data_manager.sources.git.enabled | default(true, true) }}
visible: {{ data_manager.sources.git.visible | default(true, true) }}
schedule: '{{ data_manager.sources.git.schedule | default("", true) }}'
- sso:
- enabled: {{ data_manager.sources.sso.enabled | default(true, true) }}
- visible: {{ data_manager.sources.sso.visible | default(true, true) }}
- schedule: '{{ data_manager.sources.sso.schedule | default("", true) }}'
jira:
enabled: {{ data_manager.sources.jira.enabled | default(true, true) }}
url: {{ data_manager.sources.jira.url | default('', true) }}
From 88f0fee71d290ac86e725c0a77719d37da43566c Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 09:36:37 +0200
Subject: [PATCH 26/55] add basic-scraping example.
---
.../deployments/basic-scraping/config.yaml | 73 +++++++++++++++++++
1 file changed, 73 insertions(+)
create mode 100644 examples/deployments/basic-scraping/config.yaml
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
new file mode 100644
index 000000000..ac36fc850
--- /dev/null
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -0,0 +1,73 @@
+# Basic configuration file for an Archi deployment
+# with a chat app interface, agent, and
+# PostgreSQL with pgvector for document storage.
+# The LLM is used through an existing Ollama server.
+#
+# run with:
+# archi create --name my-archi-scraping --config examples/deployments/basic-scraping/config.yaml --services chatbot --hostmode
+
+# Deployment example for CERN data sources:
+# Twiki (with optional SSO) + public links + Git repos
+#
+# Required env vars for SSO:
+# SSO_USERNAME=xxx SSO_PASSWORD=yyy
+
+name: my_archi
+
+services:
+ data_manager:
+ port: 7872
+ chat_app:
+ agent_class: CMSCompOpsAgent
+ agents_dir: examples/agents
+ default_provider: local
+ default_model: qwen3:32b
+ providers:
+ local:
+ enabled: true
+ base_url: http://submit76.mit.edu:7870 # make sure this matches your ollama server URL!
+ mode: ollama
+ default_model: "qwen3:32b" # make sure this matches a model you have downloaded locally with ollama
+ models:
+ - "qwen3:32b"
+ trained_on: "My data"
+ port: 7868
+ external_port: 7868
+ vectorstore:
+ backend: postgres # PostgreSQL with pgvector (only supported backend)
+
+data_manager:
+ embedding_name: HuggingFaceEmbeddings
+ sources:
+ web:
+ visible: true
+ link:
+ allow:
+ - ".*/blog/.*"
+ input_lists:
+ - examples/deployments/basic-agent/miscellanea.list
+ twiki:
+ enabled: true
+ auth_provider_name: cern_sso # remove if crawling public pages only
+ urls:
+ - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
+ - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons
+ allow:
+ - ".*CRAB3.*"
+ - ".*SWGuide.*"
+ - ".*WorkBook.*"
+ - ".*Crab.*"
+ - ".*Crab3.*"
+ - ".*HeavyIons.*"
+ - ".*HICollisions.*"
+ - ".*HIRel.*"
+ deny:
+ - ".*WorkBook.*"
+ max_depth: 2
+ max_pages: 100
+ delay: 60
+ git:
+ enabled: true
+ urls:
+ - https://github.com/dmwm/CRABServer
+ - https://github.com/dmwm/CRABClient
\ No newline at end of file
From f8c8ee3c166a4263d8470aa00b75cb0121fcd437 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 09:50:56 +0200
Subject: [PATCH 27/55] workaround backward compatible, Git credential should
be optional.
---
examples/deployments/basic-scraping/config.yaml | 2 ++
src/cli/source_registry.py | 4 ++--
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index ac36fc850..e60d35444 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -39,6 +39,8 @@ services:
data_manager:
embedding_name: HuggingFaceEmbeddings
sources:
+ links: # workaround for backward compatibility
+ input_lists: [] # empty list to avoid error
web:
visible: true
link:
diff --git a/src/cli/source_registry.py b/src/cli/source_registry.py
index 8ef05e5ea..65b4dad7c 100644
--- a/src/cli/source_registry.py
+++ b/src/cli/source_registry.py
@@ -44,8 +44,8 @@ def _register_defaults(self) -> None:
self.register(
SourceDefinition(
name="git",
- description="Git repository scraping for MkDocs-based documentation",
- required_secrets=["GIT_USERNAME", "GIT_TOKEN"],
+ description="Git repository scraping for MkDocs-based documentation. Optional GIT_USERNAME/GIT_TOKEN for private repos.",
+ required_secrets=[], # was ["GIT_USERNAME", "GIT_TOKEN"]
depends_on=["links"],
)
)
From 70efb66de000964fb5649b7c92dbaec8f0b6ec4b Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Tue, 31 Mar 2026 15:39:00 +0200
Subject: [PATCH 28/55] bug fixes, spider_loader can't find scrapy settings,
sso-scraping deps, refactor interface, GitManager works, link, twiki public +
sso sources example.
---
.../deployments/basic-scraping/config.yaml | 11 +++-
.../basic-scraping/miscellanea.list | 50 +++++++++++++++++++
pyproject.toml | 3 +-
src/cli/templates/base-config.yaml | 5 ++
.../dockerfiles/Dockerfile-data-manager | 4 ++
.../collectors/scraper_manager.py | 24 ++++++---
6 files changed, 88 insertions(+), 9 deletions(-)
create mode 100644 examples/deployments/basic-scraping/miscellanea.list
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index e60d35444..012fd2b36 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -46,8 +46,15 @@ data_manager:
link:
allow:
- ".*/blog/.*"
+ - ".*quotes.*"
+ - ".*/(page|author)/.*"
+ urls: # Meant for Demo scraping
+ - https://quotes.toscrape.com/ # typically easy on robots.txt and rate limits
+ max_depth: 2
+ max_pages: 200
+ delay: 2
input_lists:
- - examples/deployments/basic-agent/miscellanea.list
+ - examples/deployments/basic-scraping/miscellanea.list
twiki:
enabled: true
auth_provider_name: cern_sso # remove if crawling public pages only
@@ -66,7 +73,7 @@ data_manager:
deny:
- ".*WorkBook.*"
max_depth: 2
- max_pages: 100
+ max_pages: 500
delay: 60
git:
enabled: true
diff --git a/examples/deployments/basic-scraping/miscellanea.list b/examples/deployments/basic-scraping/miscellanea.list
new file mode 100644
index 000000000..14abe8055
--- /dev/null
+++ b/examples/deployments/basic-scraping/miscellanea.list
@@ -0,0 +1,50 @@
+# PPC
+https://ppc.mit.edu/blog/2016/05/08/hello-world/
+https://ppc.mit.edu/
+https://ppc.mit.edu/news/
+https://ppc.mit.edu/christoph-paus/
+https://ppc.mit.edu/dmytro-kovalskyi/
+https://ppc.mit.edu/gomez-ceballos/
+https://ppc.mit.edu/blog/2024/11/23/lhc-finishes-a-record-year/
+https://ppc.mit.edu/blog/2024/12/02/felicidades-cecilia/
+https://ppc.mit.edu/blog/2015/05/21/clipboard/
+https://ppc.mit.edu/blog/2025/01/12/published-first-diboson-paper-using-run-3-lhc-data/
+https://ppc.mit.edu/blog/2025/01/23/student-fcc-workshop-at-mit-v3-2025/
+https://ppc.mit.edu/blog/2025/01/23/new-chill-in-middleton/
+https://ppc.mit.edu/blog/2025/01/24/first-linux-server-installation-for-david-and-pietro/
+https://ppc.mit.edu/blog/2025/01/26/from-cern-to-mit-for-the-fcc-workshop/
+https://ppc.mit.edu/publications/
+https://ppc.mit.edu/blog/2025/02/08/detailed-schedule-for-the-european-strategy/
+https://ppc.mit.edu/blog/2025/02/14/first-cms-week-in-2025/
+https://ppc.mit.edu/blog/2025/02/18/exploring-the-higgs-boson-in-our-latest-result/
+https://ppc.mit.edu/blog/2025/02/04/news-from-the-chamonix-meeting/
+https://ppc.mit.edu/blog/2025/02/11/cms-data-archival-at-mit/
+https://ppc.mit.edu/blog/2025/03/28/cern-gets-support-from-canada/
+https://ppc.mit.edu/blog/2025/04/08/breakthrough-prize-in-physics-2025/
+https://ppc.mit.edu/blog/2025/04/04/the-fcc-at-cern-a-feasibly-circular-collider/
+https://ppc.mit.edu/blog/2025/04/08/cleo-reached-magic-issue-number-5000/
+https://ppc.mit.edu/blog/2025/04/14/maximizing-cms-competitive-advantage/
+https://ppc.mit.edu/blog/2025/04/25/sueps-at-aps-march-april-meeting/
+https://ppc.mit.edu/blog/2025/04/18/round-three/
+https://ppc.mit.edu/blog/2025/04/14/first-beams-with-a-splash-in-2025/
+https://ppc.mit.edu/blog/2025/05/27/fcc-weak-in-vienna-building-our-future/
+https://ppc.mit.edu/blog/2025/06/04/new-paper-on-arxiv-submit-a-physics-analysis-facility-at-mit/
+https://ppc.mit.edu/blog/2025/06/16/summer-cms-week-2025/
+https://ppc.mit.edu/blog/2025/05/05/cms-records-first-2025-high-energy-collisions/
+https://ppc.mit.edu/blog/2025/06/17/long-term-vision-for-particle-physics-from-the-national-academies/
+https://ppc.mit.edu/blog/2025/06/20/conclusion-of-junes-cern-council-session-has-major-consequences-for-cms/
+https://ppc.mit.edu/blog/2025/06/20/highest-pileup-recorded-at-cms-last-night/
+https://ppc.mit.edu/blog/2025/06/25/selfie-station-at-wilson-hall/
+https://ppc.mit.edu/mariarosaria-dalfonso/
+https://ppc.mit.edu/kenneth-long-2/
+https://ppc.mit.edu/blog/2025/06/27/open-symposium-on-the-european-strategy-for-particle-physics/
+https://ppc.mit.edu/blog/2025/07/03/bridging-physics-and-computing-throughput-computing-2025/
+https://ppc.mit.edu/pietro-lugato-2/
+https://ppc.mit.edu/luca-lavezzo/
+https://ppc.mit.edu/zhangqier-wang-2/
+https://ppc.mit.edu/blog/2025/07/14/welcome-our-first-ever-in-house-masters-student/
+# A2
+https://ppc.mit.edu/a2/
+# Personnel
+https://people.csail.mit.edu/kraska
+https://physics.mit.edu/faculty/christoph-paus
diff --git a/pyproject.toml b/pyproject.toml
index d6508450e..bc4465843 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,8 @@ dependencies = [
"isort==6.0.1",
"pre-commit>=4",
"psycopg2-binary==2.9.10",
- "Scrapy>=2.14.2"
+ "Scrapy>=2.14.2",
+ "playwright>=1.49.0,<2"
]
[project.scripts]
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index 5520ffc6d..7ed90f903 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -189,6 +189,7 @@ data_manager:
- {{ path }}
{%- endfor %}
web:
+ enabled: {{ data_manager.sources.web.enabled | default(true, true) }}
visible: {{ data_manager.sources.web.visible | default(true, true) }}
link:
enabled: {{ data_manager.sources.web.link.enabled | default(true, true) }}
@@ -228,6 +229,10 @@ data_manager:
enabled: {{ data_manager.sources.git.enabled | default(true, true) }}
visible: {{ data_manager.sources.git.visible | default(true, true) }}
schedule: '{{ data_manager.sources.git.schedule | default("", true) }}'
+ urls:
+ {%- for u in data_manager.sources.git.urls | default([], true) %}
+ - {{ u }}
+ {%- endfor %}
jira:
enabled: {{ data_manager.sources.jira.enabled | default(true, true) }}
url: {{ data_manager.sources.jira.url | default('', true) }}
diff --git a/src/cli/templates/dockerfiles/Dockerfile-data-manager b/src/cli/templates/dockerfiles/Dockerfile-data-manager
index 176d1c158..7adb2c473 100644
--- a/src/cli/templates/dockerfiles/Dockerfile-data-manager
+++ b/src/cli/templates/dockerfiles/Dockerfile-data-manager
@@ -35,6 +35,10 @@ COPY pyproject.toml pyproject.toml
COPY weblists weblists
RUN pip install --upgrade pip && pip install .
+# Chromium for Python Playwright (CERN SSO in Scrapy auth middleware).
+RUN python -m playwright install-deps chromium \
+ && python -m playwright install chromium
+
RUN chmod g+rx /root; chmod -R g+w /root/archi/src/interfaces
ARG APP_VERSION=unknown
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index 99a508d5e..5c89b85c1 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Callable
@@ -31,25 +32,32 @@ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: Pers
global_config = get_global_config()
self.data_path = Path(global_config["DATA_PATH"])
self.persistence = persistence
+ self.settings = Settings()
+ self.settings.setmodule(
+ "src.data_manager.collectors.scrapers.settings",
+ priority="project",
+ )
sources_config = (dm_config or {}).get("sources", {}) or {}
+ logger.info("sources_config: %s", json.dumps(sources_config, indent=2, default=str))
self.config = sources_config.get("web", {}) if isinstance(sources_config, dict) else {}
self.enabled = self.config.get("enabled", True)
# ── Public interface ──────────────────────────────────────────────────────
def collect_all_from_config(self) -> None:
+ logger.info("collect_all_from_config")
self._run(self._config_urls)
def schedule_collect(self, last_run: Optional[str] = None) -> None:
self._run(self._catalog_urls)
def collect(self, spider_key: str, urls: List[str]) -> None:
- settings = get_project_settings()
- process = CrawlerProcess(settings)
+ process = CrawlerProcess(self.settings)
+ logger.info("project_settings: %s", json.dumps(self.settings, indent=2, default=str))
try:
- SpiderClass = _make_spider_loader(settings)(spider_key)
+ SpiderClass = _make_spider_loader(self.settings)(spider_key)
except KeyError:
logger.error("Unknown spider: %s", spider_key)
return
@@ -62,13 +70,13 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
if not self.enabled:
logger.info("Web scraping disabled; skipping")
return
- settings = get_project_settings()
- process = CrawlerProcess(settings)
- load_spider = _make_spider_loader(settings)
+ process = CrawlerProcess(self.settings)
+ load_spider = _make_spider_loader(self.settings)
(self.data_path / "websites").mkdir(parents=True, exist_ok=True)
added = False
for spider_key, cfg in self.config.items():
+ logger.info("spider_key: %s, cfg: %s", spider_key, json.dumps(cfg, indent=2, default=str))
if not isinstance(cfg, dict):
continue
try:
@@ -76,9 +84,11 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
except KeyError:
continue
urls = url_fn(spider_key, cfg)
+ logger.info("urls: %s", urls)
if urls:
self._add_crawler(process, SpiderClass, urls, cfg)
added = True
+ logger.info("added: %s", added)
if added:
process.start()
@@ -106,6 +116,7 @@ def _add_crawler(
def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
urls = list(cfg.get("urls") or [])
+ logger.info("cfg_urls: urls: %s", urls)
for list_path in cfg.get("input_lists") or []:
path = Path(list_path)
if not path.exists():
@@ -118,6 +129,7 @@ def _catalog_urls(self, spider_key: str, cfg: Dict) -> List[str]:
if not self.persistence:
return []
+ logger.info("catalog_urls: spider_key: %s, cfg: %s", spider_key, json.dumps(cfg, indent=2, default=str))
metadata = self.persistence.catalog.get_metadata_by_filter(
"source_type", source_type="web", metadata_keys=["url", "spider_name"]
)
From ad7e7b3e63fb0b3513fa879890e33a027a535013 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 14:41:14 +0200
Subject: [PATCH 29/55] fix bugs: twiki non-text response leak, playwright
 handler cannot start outside the main thread, scraper_manager not aware of
 the enabled flag.
---
.../deployments/basic-scraping/config.yaml | 2 --
.../basic-scraping/data/ingestion_status.json | 1 +
.../collectors/scraper_manager.py | 22 +++++++++----------
.../collectors/scrapers/parsers/twiki.py | 5 +++++
4 files changed, 17 insertions(+), 13 deletions(-)
create mode 100644 examples/deployments/basic-scraping/data/ingestion_status.json
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 012fd2b36..5f0f6d088 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -56,7 +56,6 @@ data_manager:
input_lists:
- examples/deployments/basic-scraping/miscellanea.list
twiki:
- enabled: true
auth_provider_name: cern_sso # remove if crawling public pages only
urls:
- https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
@@ -76,7 +75,6 @@ data_manager:
max_pages: 500
delay: 60
git:
- enabled: true
urls:
- https://github.com/dmwm/CRABServer
- https://github.com/dmwm/CRABClient
\ No newline at end of file
diff --git a/examples/deployments/basic-scraping/data/ingestion_status.json b/examples/deployments/basic-scraping/data/ingestion_status.json
new file mode 100644
index 000000000..9e26dfeeb
--- /dev/null
+++ b/examples/deployments/basic-scraping/data/ingestion_status.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index 5c89b85c1..355b4a7ba 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -19,6 +19,11 @@ def _make_spider_loader(settings: Settings) -> Callable[[str], type[Spider]]:
"""Bind settings once, return a name → SpiderClass callable."""
return SpiderLoader.from_settings(settings).load
+def _spider_section_enabled(cfg: Dict[str, Any]) -> bool:
+ """Respect web.<spider>.enabled; missing or null → enabled (on)."""
+ v = cfg.get("enabled", True)
+ return bool(v) if v is not None else True
+
class ScraperManager:
"""
Coordinates all web crawls as a single CrawlerProcess run.
@@ -40,7 +45,6 @@ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: Pers
sources_config = (dm_config or {}).get("sources", {}) or {}
- logger.info("sources_config: %s", json.dumps(sources_config, indent=2, default=str))
self.config = sources_config.get("web", {}) if isinstance(sources_config, dict) else {}
self.enabled = self.config.get("enabled", True)
@@ -62,9 +66,10 @@ def collect(self, spider_key: str, urls: List[str]) -> None:
logger.error("Unknown spider: %s", spider_key)
return
cfg = self.config.get(spider_key, {}) # use config settings if present, else defaults
- if urls:
+ if urls and _spider_section_enabled(cfg):
self._add_crawler(process, SpiderClass, urls, cfg)
- process.start()
+ # Avoid Twisted/Scrapy installing OS signal handlers (SIGINT/SIGTERM), which fails when running in a worker thread
+ process.start(install_signal_handlers=False)
def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
if not self.enabled:
@@ -76,7 +81,6 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
added = False
for spider_key, cfg in self.config.items():
- logger.info("spider_key: %s, cfg: %s", spider_key, json.dumps(cfg, indent=2, default=str))
if not isinstance(cfg, dict):
continue
try:
@@ -84,13 +88,11 @@ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
except KeyError:
continue
urls = url_fn(spider_key, cfg)
- logger.info("urls: %s", urls)
- if urls:
+ if urls and _spider_section_enabled(cfg):
self._add_crawler(process, SpiderClass, urls, cfg)
added = True
- logger.info("added: %s", added)
if added:
- process.start()
+ process.start(install_signal_handlers=False)
# ── CrawlerProcess wiring ─────────────────────────────────────────────────
@@ -116,7 +118,6 @@ def _add_crawler(
def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
urls = list(cfg.get("urls") or [])
- logger.info("cfg_urls: urls: %s", urls)
for list_path in cfg.get("input_lists") or []:
path = Path(list_path)
if not path.exists():
@@ -129,7 +130,6 @@ def _catalog_urls(self, spider_key: str, cfg: Dict) -> List[str]:
if not self.persistence:
return []
- logger.info("catalog_urls: spider_key: %s, cfg: %s", spider_key, json.dumps(cfg, indent=2, default=str))
metadata = self.persistence.catalog.get_metadata_by_filter(
"source_type", source_type="web", metadata_keys=["url", "spider_name"]
)
@@ -147,4 +147,4 @@ def _extract_urls_from_file(self, path: Path) -> List[str]:
if not stripped or stripped.startswith("#"):
continue
urls.append(stripped.split(",")[0].strip())
- return urls
\ No newline at end of file
+ return urls
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index 71f17030e..728a68f5a 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -5,6 +5,9 @@
def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
+ if not isinstance(response, TextResponse):
+ logger.debug("Skipping non-text response (no css): %s", response.url)
+ return
# Twiki-specific selectors
title = (
response.css("#topic-title::text").get()
@@ -21,5 +24,7 @@ def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
title=title,
content=body_text,
suffix="html",
+ source_type="web",
content_type=get_content_type(response),
+ encoding=response.encoding or "utf-8",
)
\ No newline at end of file
From be9b1258f3c4f1676aeada8a5b02fba78ceb927f Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 15:12:54 +0200
Subject: [PATCH 30/55] turn-on/test twiki.
---
examples/deployments/basic-scraping/config.yaml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 5f0f6d088..5d1b298b7 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -56,6 +56,7 @@ data_manager:
input_lists:
- examples/deployments/basic-scraping/miscellanea.list
twiki:
+ enabled: true
auth_provider_name: cern_sso # remove if crawling public pages only
urls:
- https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
@@ -77,4 +78,4 @@ data_manager:
git:
urls:
- https://github.com/dmwm/CRABServer
- - https://github.com/dmwm/CRABClient
\ No newline at end of file
+ - https://github.com/dmwm/CRABClient
From 42893f5646d1359877b8f41eb1334feb505d4958 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 16:32:42 +0200
Subject: [PATCH 31/55] fix bug: input_lists incorrectly copied/resolved from
 the container weblists dir.
---
src/cli/managers/config_manager.py | 19 +++++++++++++++----
.../collectors/scraper_manager.py | 2 +-
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/src/cli/managers/config_manager.py b/src/cli/managers/config_manager.py
index 7ed9f9405..ad8f0caac 100644
--- a/src/cli/managers/config_manager.py
+++ b/src/cli/managers/config_manager.py
@@ -13,6 +13,8 @@
STATIC_FIELDS = ['global', 'services']
+_WEB_TOP_LEVEL_STATIC_KEYS = ["enabled", "visible"]
+
class ConfigurationManager:
"""Manages archi configuration loading and validation"""
@@ -266,10 +268,19 @@ def _collect_input_lists(self) -> None:
for conf in self.configs:
data_manager = conf.get('data_manager', {})
sources_section = data_manager.get('sources', {}) or {}
- links_section = sources_section.get('links', {}) if isinstance(sources_section, dict) else {}
- lists = links_section.get('input_lists') or []
- if isinstance(lists, list):
- collected.extend(lists)
+ if not isinstance(sources_section, dict):
+ continue
+ web = sources_section.get("web", {}) or {}
+ if not isinstance(web, dict):
+ continue
+ for spider_key, sub in web.items():
+ if spider_key in _WEB_TOP_LEVEL_STATIC_KEYS:
+ continue
+ if not isinstance(sub, dict):
+ continue
+ wlists = sub.get("input_lists") or []
+ if isinstance(wlists, list):
+ collected.extend(wlists)
self.input_list = sorted(set(collected)) if collected else []
def get_enabled_sources(self) -> List[str]:
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index 355b4a7ba..69a0093a8 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -119,7 +119,7 @@ def _add_crawler(
def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
urls = list(cfg.get("urls") or [])
for list_path in cfg.get("input_lists") or []:
- path = Path(list_path)
+ path = Path("weblists") / list_path.lstrip("/")
if not path.exists():
logger.warning("Input list not found: %s", path)
continue
From e56fa7f0faa6c7a2e0d45eb379d301e8d685e56d Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 16:39:41 +0200
Subject: [PATCH 32/55] test only twiki; remove no-longer-needed backward-compatible
 workaround.
---
examples/deployments/basic-scraping/config.yaml | 4 ++--
src/data_manager/collectors/scrapers/parsers/twiki.py | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 5d1b298b7..6a4b63f36 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -39,11 +39,10 @@ services:
data_manager:
embedding_name: HuggingFaceEmbeddings
sources:
- links: # workaround for backward compatibility
- input_lists: [] # empty list to avoid error
web:
visible: true
link:
+ enabled: false
allow:
- ".*/blog/.*"
- ".*quotes.*"
@@ -76,6 +75,7 @@ data_manager:
max_pages: 500
delay: 60
git:
+ enabled: false
urls:
- https://github.com/dmwm/CRABServer
- https://github.com/dmwm/CRABClient
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index 728a68f5a..b59fb439b 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -1,5 +1,5 @@
from typing import Iterator
-from scrapy.http import Response
+from scrapy.http import Response, TextResponse
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.utils import get_content_type
From 34bf2e39a8c96bd642ce24ed60e3dc7d584729d7 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 16:46:45 +0200
Subject: [PATCH 33/55] sources.links no longer exists; the new sources.web has no
 required fields.
---
src/cli/source_registry.py | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/src/cli/source_registry.py b/src/cli/source_registry.py
index 65b4dad7c..342b48e49 100644
--- a/src/cli/source_registry.py
+++ b/src/cli/source_registry.py
@@ -23,11 +23,9 @@ def __init__(self) -> None:
def _register_defaults(self) -> None:
self.register(
SourceDefinition(
- name="links",
- description="Basic HTTP/HTTPS link scraping from input lists",
- required_config_fields=[
- "data_manager.sources.links.input_lists",
- ],
+ name="web",
+ description="Basic HTTP/HTTPS, Scrapy web sources, seeds from urls and/or input_list",
+ required_config_fields=[],
)
)
self.register(
From e0a8df00d5a893594b9f9805ad614092983bce48 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 16:54:24 +0200
Subject: [PATCH 34/55] git and sso as well no longer depend on sources.links.
---
src/cli/source_registry.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/cli/source_registry.py b/src/cli/source_registry.py
index 342b48e49..1ac02efdf 100644
--- a/src/cli/source_registry.py
+++ b/src/cli/source_registry.py
@@ -34,9 +34,9 @@ def _register_defaults(self) -> None:
description="SSO-backed web crawling",
required_secrets=["SSO_USERNAME", "SSO_PASSWORD"],
required_config_fields=[
- "data_manager.sources.links.selenium_scraper.selenium_class",
+ "data_manager.sources.web",
],
- depends_on=["links"],
+ depends_on=["web"],
)
)
self.register(
@@ -44,7 +44,7 @@ def _register_defaults(self) -> None:
name="git",
description="Git repository scraping for MkDocs-based documentation, Optional GIT_USERNAME/GIT_TOKEN for private repos.",
required_secrets=[], # was ["GIT_USERNAME", "GIT_TOKEN"]
- depends_on=["links"],
+ depends_on=[], # no longer depends on links or web; considered a standalone manager.
)
)
self.register(
From 7acda5d4ac230310576b502a683b0728810a8d64 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 17:21:06 +0200
Subject: [PATCH 35/55] fix twiki default being too strict for users, who always
 had to set enabled=True.
---
src/cli/templates/base-config.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index 7ed90f903..f7de9d62c 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -209,7 +209,7 @@ data_manager:
- {{ u }}
{%- endfor %}
twiki:
- enabled: {{ data_manager.sources.web.twiki.enabled | default(false, true) }}
+ enabled: {{ data_manager.sources.web.twiki.enabled | default(true, true) }}
auth_provider_name: {{ data_manager.sources.web.twiki.auth_provider_name | default("", true) }}
schedule: '{{ data_manager.sources.web.twiki.schedule | default("", true) }}'
max_depth: {{ data_manager.sources.web.twiki.max_depth | default(2, true) }}
From 4b7d4991f55fb86b8aa307f1bee4dc1d96e68091 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 19:08:28 +0200
Subject: [PATCH 36/55] less noisy, more clean & robust twiki generic parsed
response.
---
.../collectors/scrapers/parsers/twiki.py | 30 +++++++++++++--
.../collectors/scrapers/spiders/twiki.py | 2 +-
tests/unit/test_scrapers_resource_adapter.py | 38 +++++--------------
tests/unit/test_twiki_parser.py | 26 +++++++++++--
4 files changed, 58 insertions(+), 38 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index b59fb439b..7abfcd5f6 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -1,8 +1,30 @@
+import logging
from typing import Iterator
from scrapy.http import Response, TextResponse
+from src.utils.logging import get_logger
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.utils import get_content_type
+logger = get_logger(__name__)
+
+_TWIKI_BODY_SELECTORS = [
+ "body.patternViewPage #patternMainContents",
+ "#patternMainContents",
+ "body.patternViewPage #patternMain",
+ "#patternMain",
+ "#twikiMainContents",
+ ".patternViewBody",
+ ".twikiTopicText",
+ ".patternTopic",
+ ".patternContent",
+ ".patternMain", # class variant, rare
+]
+def _extract_twiki_body(response: Response) -> str:
+ for selector in _TWIKI_BODY_SELECTORS:
+ text = " ".join(response.css(f"{selector} *::text").getall()).strip()
+ if text:
+ return text
+ return ""
def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
if not isinstance(response, TextResponse):
@@ -15,10 +37,10 @@ def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
or response.css("title::text").get("").split("<")[0].strip()
)
# Main content div — Twiki wraps body in .patternMain or #twikiMainContents
- body_text = " ".join(
- response.css("#twikiMainContents *::text, .patternMain *::text").getall()
- ).strip()
-
+ body_text = _extract_twiki_body(response)
+ if not body_text:
+ logger.debug("No body text found in Twiki page: %s", response.url)
+ return
yield WebPageItem(
url=response.url,
title=title,
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index b3527ec7a..d437c9d7e 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -31,7 +31,7 @@ class TwikiSpider(LinkSpider):
r"/bin/genpdf", # PDF generation — not content
r"/bin/view/Main", # user profile pages, not content
# Navigation/structural pages
- r"LeftBarLeftBar",
+ r"LeftBar$", # just ignore all left bar pages
r"/bin/view/[^/]+/WebLeftBar", # sidebar navigation template
r"/bin/view/[^/]+/WebTopBar", # top navigation bar
r"/bin/view/[^/]+/WebChanges", # recent changes — floods with links
diff --git a/tests/unit/test_scrapers_resource_adapter.py b/tests/unit/test_scrapers_resource_adapter.py
index b1348dd42..95b4c79d8 100644
--- a/tests/unit/test_scrapers_resource_adapter.py
+++ b/tests/unit/test_scrapers_resource_adapter.py
@@ -1,49 +1,29 @@
import pytest
-from src.data_manager.collectors.scrapers.resource_adapter import to_scraped_resource
-from src.data_manager.collectors.scrapers.items import TWikiPageItem, PDFItem
+from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
+from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
# ---------------------------------------------------------------------------
-# WebPageItem, TWikiPageItem adapter
+# WebPageItem adapter
# ---------------------------------------------------------------------------
class TestWebPageItemAdapter:
- def _make_item(self, **overrides) -> TWikiPageItem:
+ def _make_item(self, **overrides) -> WebPageItem:
base = {
"url": "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
"content": "CRAB3ConfigurationFile",
"title": "CRAB3ConfigurationFile",
+ "suffix": "html",
+ "source_type": "web",
+ "content_type": "text/html",
+ "encoding": "utf-8",
}
- return TWikiPageItem({**base, **overrides})
+ return WebPageItem({**base, **overrides})
def test_returns_scraped_resource(self):
assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
- def test_default_source_type_is_web(self):
- assert to_scraped_resource(self._make_item()).source_type == "web"
-
-# ---------------------------------------------------------------------------
-# PDFItem adapter
-# ---------------------------------------------------------------------------
-
-class TestPDFAdapter:
- def _make_item(self, **overrides) -> PDFItem:
- base = {
- "url": "https://mit-teal.github.io/801/textbook/2ed_chapter01.pdf",
- "content": b"%PDF-1.4\n%mock pdf content\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF\n",
- "title": "mock pdf",
- "suffix": "pdf",
- "content_type": "application/pdf",
- }
- return PDFItem({**base, **overrides})
-
- def test_returns_scraped_resource(self):
- assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
-
- def test_default_source_type_is_web(self):
- assert to_scraped_resource(self._make_item()).source_type == "web"
-
# ---------------------------------------------------------------------------
# Unregistered item type — must fail loudly
# ---------------------------------------------------------------------------
diff --git a/tests/unit/test_twiki_parser.py b/tests/unit/test_twiki_parser.py
index 044899616..e810b2d07 100644
--- a/tests/unit/test_twiki_parser.py
+++ b/tests/unit/test_twiki_parser.py
@@ -1,19 +1,37 @@
# tests/unit/test_twiki_parser.py
from pathlib import Path
from scrapy.http import HtmlResponse, Request
+from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
from src.data_manager.collectors.scrapers.spiders.twiki import parse_twiki_page
FIXTURES = Path(__file__).parent / "fixtures"
-def fake_html_response(url: str, fixture_name: str) -> HtmlResponse:
+def fake_html_response(url: str, fixture_name: str, charset: str) -> HtmlResponse:
body = (FIXTURES / fixture_name).read_bytes()
- return HtmlResponse(url=url, body=body, encoding="utf-8", request=Request(url=url))
+ headers = {}
+ if charset:
+ headers[b"Content-Type"] = [f"text/html; charset={charset}".encode("ascii")]
+ # No `encoding=`: let Scrapy infer from headers + HTML meta (like a real download).
+ return HtmlResponse(
+ url=url,
+ status=200,
+ body=body,
+ headers=headers,
+ request=Request(url=url),
+ )
class TestParseTwikiPage:
- def test_prefers_topic_title(self):
+
+ def test_valid_response(self):
response = fake_html_response(
"https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
"twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html",
+ "iso-8859-1",
)
item = next(parse_twiki_page(response))
- assert item["title"] == "CRAB3ConfigurationFile"
+ assert item['title'] == "CRAB3ConfigurationFile"
+ assert item['suffix'] == "html"
+ assert item['source_type'] == "web"
+ assert item['content_type'] == "text/html; charset=iso-8859-1"
+ assert item['encoding'] == "cp1252"
+ assert item['content'] != "" # should be non-empty
\ No newline at end of file
From 25af09c8e6026673a5b609250cec0f02c5b53ee7 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 20:39:17 +0200
Subject: [PATCH 37/55] no links/PDFs are discarded anymore; set title; more robust;
 collect body as DOM, to be converted with MarkItDown later.
---
.../collectors/scrapers/parsers/twiki.py | 89 ++++++++++++++-----
tests/unit/test_twiki_parser.py | 27 ++++--
2 files changed, 88 insertions(+), 28 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index 7abfcd5f6..7e10dc352 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -1,13 +1,25 @@
-import logging
-from typing import Iterator
+"""
+TWiki / PatternSkin parser.
+
+1. **PDF** — same rule as ``parse_link_page``: raw ``response.body``, ``suffix="pdf"``.
+2. **HTML** — **outer HTML** of the main column (DOM subtree), not ``*::text``.
+
+Selectors are tried in order; first non-empty serialized node wins, then ``body``.
+"""
+from __future__ import annotations
+
+from typing import Iterator, List
+
from scrapy.http import Response, TextResponse
-from src.utils.logging import get_logger
+
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.utils import get_content_type
+from src.utils.logging import get_logger
+from urllib.parse import urlparse
logger = get_logger(__name__)
-_TWIKI_BODY_SELECTORS = [
+_TWIKI_DOM_SELECTORS: List[str] = [
"body.patternViewPage #patternMainContents",
"#patternMainContents",
"body.patternViewPage #patternMain",
@@ -17,36 +29,67 @@
".twikiTopicText",
".patternTopic",
".patternContent",
- ".patternMain", # class variant, rare
+ ".patternMain",
+ "body",
]
-def _extract_twiki_body(response: Response) -> str:
- for selector in _TWIKI_BODY_SELECTORS:
- text = " ".join(response.css(f"{selector} *::text").getall()).strip()
- if text:
- return text
+
+
+def _first_outer_html(response: Response, selectors: List[str]) -> str:
+ for selector in selectors:
+ nodes = response.css(selector)
+ if not nodes:
+ continue
+ html = nodes[0].get()
+ if html and html.strip():
+ return html.strip()
return ""
+
+def _twiki_title(response: TextResponse) -> str:
+ raw = (
+ response.css("#topic-title::text").get()
+ or response.css(".patternTitle::text").get()
+ or response.css("title::text").get()
+ or ""
+ )
+ if not isinstance(raw, str):
+ return ""
+ # CERN TWiki example: CRAB3ConfigurationFile < CMSPublic < TWiki
+ return raw.split("<")[0].strip()
+
+
def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
+ ct = get_content_type(response)
+
+ # ── PDF (aligned with parse_link_page) ─────────────────────────────────
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield WebPageItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ source_type="web",
+ title=urlparse(response.url).path.split("/")[-1].replace(".pdf", "").strip(),
+ content_type=ct,
+ )
+ return
+
+ # ── HTML DOM ────────────────────────────────────────────────────────────
if not isinstance(response, TextResponse):
logger.debug("Skipping non-text response (no css): %s", response.url)
return
- # Twiki-specific selectors
- title = (
- response.css("#topic-title::text").get()
- or response.css(".patternTitle::text").get()
- or response.css("title::text").get("").split("<")[0].strip()
- )
- # Main content div — Twiki wraps body in .patternMain or #twikiMainContents
- body_text = _extract_twiki_body(response)
- if not body_text:
- logger.debug("No body text found in Twiki page: %s", response.url)
+
+ title = _twiki_title(response)
+ body_html = _first_outer_html(response, _TWIKI_DOM_SELECTORS)
+ if not body_html:
+ logger.debug("No main-column HTML for Twiki page: %s", response.url)
return
+
yield WebPageItem(
url=response.url,
title=title,
- content=body_text,
+ content=body_html,
suffix="html",
source_type="web",
- content_type=get_content_type(response),
+ content_type=ct,
encoding=response.encoding or "utf-8",
- )
\ No newline at end of file
+ )
diff --git a/tests/unit/test_twiki_parser.py b/tests/unit/test_twiki_parser.py
index e810b2d07..9c9ab3e5f 100644
--- a/tests/unit/test_twiki_parser.py
+++ b/tests/unit/test_twiki_parser.py
@@ -1,8 +1,9 @@
# tests/unit/test_twiki_parser.py
from pathlib import Path
-from scrapy.http import HtmlResponse, Request
-from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
-from src.data_manager.collectors.scrapers.spiders.twiki import parse_twiki_page
+
+from scrapy.http import HtmlResponse, Request, Response
+
+from src.data_manager.collectors.scrapers.parsers.twiki import parse_twiki_page
FIXTURES = Path(__file__).parent / "fixtures"
@@ -22,7 +23,7 @@ def fake_html_response(url: str, fixture_name: str, charset: str) -> HtmlRespons
class TestParseTwikiPage:
- def test_valid_response(self):
+ def test_conventional_twiki_page(self):
response = fake_html_response(
"https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
"twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html",
@@ -34,4 +35,20 @@ def test_valid_response(self):
assert item['source_type'] == "web"
assert item['content_type'] == "text/html; charset=iso-8859-1"
assert item['encoding'] == "cp1252"
- assert item['content'] != "" # should be non-empty
\ No newline at end of file
+ # HTML fragment (outer tag + children), not flattened text — for MarkItDown etc.
+ assert "<" in item["content"] and ">" in item["content"]
+ assert "patternMainContents" in item["content"]
+ assert "href=" in item["content"]
+
+ def test_pdf_yields_bytes_like_link_parser(self):
+ url = "https://twiki.cern.ch/twiki/pub/CMSPublic/Topic/file.pdf"
+ response = Response(
+ url=url,
+ body=b"%PDF-1.4 minimal",
+ headers={b"Content-Type": [b"application/pdf"]},
+ request=Request(url=url),
+ )
+ item = next(parse_twiki_page(response))
+ assert item["suffix"] == "pdf"
+ assert item["content"] == b"%PDF-1.4 minimal"
+ assert item["title"] == "file"
\ No newline at end of file
From 3549679b5463714d5f04060b0014b5faca738def Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 21:26:37 +0200
Subject: [PATCH 38/55] ignore old doc and archive formats; more robust body
 extractor.
---
.../collectors/scrapers/parsers/link.py | 29 ++++++++++---------
.../collectors/scrapers/parsers/twiki.py | 14 ++-------
.../collectors/scrapers/spiders/link.py | 4 +--
src/data_manager/collectors/scrapers/utils.py | 13 +++++++--
4 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/parsers/link.py b/src/data_manager/collectors/scrapers/parsers/link.py
index c0b9b6e49..a4344b110 100644
--- a/src/data_manager/collectors/scrapers/parsers/link.py
+++ b/src/data_manager/collectors/scrapers/parsers/link.py
@@ -1,4 +1,4 @@
-from typing import Iterator
+from typing import Iterator, List
from scrapy.http import Response, TextResponse
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.utils import get_content_type
@@ -18,6 +18,17 @@
".entry-content",
"body",
]
+
+def _first_outer_html(response: Response, selectors: List[str]) -> str:
+ for selector in selectors:
+ nodes = response.css(selector)
+ if not nodes:
+ continue
+ html = nodes[0].get()
+ if html and html.strip():
+ return html.strip()
+ return ""
+
def parse_link_page(response: Response) -> Iterator[WebPageItem]:
"""
Generic page parser — works for any HTML page with no site-specific selectors.
@@ -37,7 +48,7 @@ def parse_link_page(response: Response) -> Iterator[WebPageItem]:
content=response.body,
suffix="pdf",
source_type="web",
- title="",
+ title=urlparse(response.url).path.split("/")[-1].replace(".pdf", "").strip(),
content_type=ct,
)
return
@@ -47,7 +58,7 @@ def parse_link_page(response: Response) -> Iterator[WebPageItem]:
or response.css("title::text").get()
or ""
).strip()
- body_text = _extract_main_text(response)
+ body_text = _first_outer_html(response, _CONTENT_SELECTORS)
encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
if not body_text:
return # empty page — don't yield a blank item
@@ -59,14 +70,4 @@ def parse_link_page(response: Response) -> Iterator[WebPageItem]:
title=title,
content_type=ct,
encoding=encoding,
- )
-def _extract_main_text(response: Response) -> str:
- """
- Try content selectors in priority order.
- Returns the first non-empty joined text, or empty string.
- """
- for selector in _CONTENT_SELECTORS:
- text = " ".join(response.css(f"{selector} *::text").getall()).strip()
- if text:
- return text
- return ""
\ No newline at end of file
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
index 7e10dc352..fa6b6745c 100644
--- a/src/data_manager/collectors/scrapers/parsers/twiki.py
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -8,11 +8,12 @@
"""
from __future__ import annotations
-from typing import Iterator, List
+from typing import Iterator
from scrapy.http import Response, TextResponse
from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.parsers.link import _first_outer_html
from src.data_manager.collectors.scrapers.utils import get_content_type
from src.utils.logging import get_logger
from urllib.parse import urlparse
@@ -34,17 +35,6 @@
]
-def _first_outer_html(response: Response, selectors: List[str]) -> str:
- for selector in selectors:
- nodes = response.css(selector)
- if not nodes:
- continue
- html = nodes[0].get()
- if html and html.strip():
- return html.strip()
- return ""
-
-
def _twiki_title(response: TextResponse) -> str:
raw = (
response.css("#topic-title::text").get()
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 8ddf79caf..61c6859d8 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -4,7 +4,7 @@
from scrapy.http import Response, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.link import Link
-from src.data_manager.collectors.scrapers.utils import _IMAGE_EXTS
+from src.data_manager.collectors.scrapers.utils import IMAGE_EXTENSIONS, IGNORED_DOCUMENT_EXTENSIONS
from src.data_manager.collectors.scrapers.items import WebPageItem
from src.data_manager.collectors.scrapers.parsers.link import parse_link_page
@@ -51,7 +51,7 @@ def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_page
allow=allow or [],
deny=(deny or []) + default_deny,
allow_domains=list(self._allowed_domains),
- deny_extensions=list(_IMAGE_EXTS),
+ deny_extensions=(IMAGE_EXTENSIONS + IGNORED_DOCUMENT_EXTENSIONS),
canonicalize=canonicalize,
process_value=process_value or default_process_value,
unique=True,
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
index a290ae30d..38b9bb8dc 100644
--- a/src/data_manager/collectors/scrapers/utils.py
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -1,8 +1,17 @@
from scrapy.http import Response
-_IMAGE_EXTS = frozenset({
+IMAGE_EXTENSIONS = [
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".ico", ".webp"
-})
+]
+
+# .pdf, docs , xlsx, pptx are first class supported by MarkItDown
+IGNORED_DOCUMENT_EXTENSIONS = [
+ ".doc",
+ ".xls",
+ ".ppt",
+ ".zip",
+ ".rar",
+]
def get_content_type(response: Response) -> str:
"""Decode the Content-Type header bytes to str."""
From 536b2dbe32cf7fba9d733ef676b2554cd823ffd9 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 21:37:26 +0200
Subject: [PATCH 39/55] fix: Scrapy will add the `.` for us!
---
src/data_manager/collectors/scrapers/utils.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
index 38b9bb8dc..003003d77 100644
--- a/src/data_manager/collectors/scrapers/utils.py
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -1,16 +1,16 @@
from scrapy.http import Response
IMAGE_EXTENSIONS = [
- ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".ico", ".webp"
+ "png", "jpg", "jpeg", "gif", "bmp", "svg", "ico", "webp"
]
-# .pdf, docs , xlsx, pptx are first class supported by MarkItDown
+# pdf, docs, xlsx, pptx are first class supported by MarkItDown
IGNORED_DOCUMENT_EXTENSIONS = [
- ".doc",
- ".xls",
- ".ppt",
- ".zip",
- ".rar",
+ "doc",
+ "xls",
+ "ppt",
+ "zip",
+ "rar",
]
def get_content_type(response: Response) -> str:
From c399241fcf0b7c84dd9a6592acedbd2af67ebedc Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 1 Apr 2026 22:53:59 +0200
Subject: [PATCH 40/55] informative logging about follow_links.
---
src/data_manager/collectors/scrapers/spiders/link.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index 61c6859d8..f237b722b 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -104,4 +104,6 @@ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
yield from parse_link_page(response)
def parse_follow_links(self, response: Response) -> Iterator[Link]:
- yield from self._le.extract_links(response)
\ No newline at end of file
+ links = self._le.extract_links(response)
+ self.logger.info("Extracted %d links from %s", len(links), response.url)
+ yield from links
\ No newline at end of file
From 2142948ec43da590a0168b1c6cfc012a0143f849 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 03:32:57 +0200
Subject: [PATCH 41/55] clean basic-scraping config example.
---
examples/deployments/basic-scraping/config.yaml | 10 +++-------
examples/deployments/basic-scraping/miscellanea.list | 1 -
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 6a4b63f36..85f83e617 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -40,22 +40,19 @@ data_manager:
embedding_name: HuggingFaceEmbeddings
sources:
web:
- visible: true
link:
- enabled: false
allow:
- ".*/blog/.*"
- ".*quotes.*"
- ".*/(page|author)/.*"
urls: # Meant for Demo scraping
- - https://quotes.toscrape.com/ # typically easy on robots.txt and rate limits
+ - https://ppc.mit.edu/news/
max_depth: 2
- max_pages: 200
- delay: 2
+ max_pages: 100
+ delay: 10
input_lists:
- examples/deployments/basic-scraping/miscellanea.list
twiki:
- enabled: true
auth_provider_name: cern_sso # remove if crawling public pages only
urls:
- https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
@@ -75,7 +72,6 @@ data_manager:
max_pages: 500
delay: 60
git:
- enabled: false
urls:
- https://github.com/dmwm/CRABServer
- https://github.com/dmwm/CRABClient
diff --git a/examples/deployments/basic-scraping/miscellanea.list b/examples/deployments/basic-scraping/miscellanea.list
index 14abe8055..65bfd0289 100644
--- a/examples/deployments/basic-scraping/miscellanea.list
+++ b/examples/deployments/basic-scraping/miscellanea.list
@@ -1,7 +1,6 @@
# PPC
https://ppc.mit.edu/blog/2016/05/08/hello-world/
https://ppc.mit.edu/
-https://ppc.mit.edu/news/
https://ppc.mit.edu/christoph-paus/
https://ppc.mit.edu/dmytro-kovalskyi/
https://ppc.mit.edu/gomez-ceballos/
From d1356d915cbb10c9f3851364caceadeed956ad43 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 03:37:06 +0200
Subject: [PATCH 42/55] [Discourse] support recursive/iterator-based scraper,
 with cern_sso
---
.../collectors/scrapers/adapters.py | 22 ++-
.../collectors/scrapers/auth/cern_sso.py | 12 +-
src/data_manager/collectors/scrapers/items.py | 11 ++
.../collectors/scrapers/pipelines.py | 1 -
.../collectors/scrapers/spiders/discourse.py | 166 ++++++++++++++++++
5 files changed, 207 insertions(+), 5 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/spiders/discourse.py
diff --git a/src/data_manager/collectors/scrapers/adapters.py b/src/data_manager/collectors/scrapers/adapters.py
index cb1c2e4c6..0bf3fdda9 100644
--- a/src/data_manager/collectors/scrapers/adapters.py
+++ b/src/data_manager/collectors/scrapers/adapters.py
@@ -28,7 +28,7 @@ def _html_page(item) -> ScrapedResource:
from functools import singledispatch
from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-from src.data_manager.collectors.scrapers.items import WebPageItem, IndicoPageItem
+from src.data_manager.collectors.scrapers.items import WebPageItem, IndicoPageItem, DiscourseTopicPageItem
@singledispatch
@@ -80,3 +80,23 @@ def _indico(item) -> ScrapedResource:
"category": item.get("category"),
},
)
+
+@to_scraped_resource.register(DiscourseTopicPageItem)
+def _discourse(item) -> ScrapedResource:
+ """
+ Discourse items carry topic-level metadata from the category JSON listing.
+ """
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "rss"),
+ source_type=item["source_type"],
+ metadata={
+ "content_type": item.get("content_type"),
+ "encoding": item.get("encoding"),
+ "title": item.get("title"),
+ "tags": item.get("tags"),
+ "has_accepted_answer": item.get("has_accepted_answer"),
+ "created_at": item.get("created_at"),
+ },
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/auth/cern_sso.py b/src/data_manager/collectors/scrapers/auth/cern_sso.py
index 79a3e67e0..9cf445f82 100644
--- a/src/data_manager/collectors/scrapers/auth/cern_sso.py
+++ b/src/data_manager/collectors/scrapers/auth/cern_sso.py
@@ -41,6 +41,7 @@
import re
from typing import Dict, List, Optional
+from urllib.parse import urlparse
from playwright.sync_api import (
Browser,
BrowserContext,
@@ -215,9 +216,14 @@ def _login_and_extract(self, url: str) -> Optional[Credentials]:
# Public page: loaded directly without SSO redirect — return whatever
# cookies the browser has (may be empty, that's fine for public pages).
if not looks_like_login_page(page.url):
- raw_cookies = self._context.cookies()
- logger.info("CERNSSOProvider: no SSO redirect for %s, returning browser cookies", url)
- return Credentials(cookies=raw_cookies)
+ # Try the site root — some sites like Discourse only redirect on the homepage
+ origin = f"{urlparse(url).scheme}://{urlparse(url).netloc}/"
+ page.goto(origin, wait_until="networkidle", timeout=30_000)
+ if not looks_like_login_page(page.url):
+ # Still no SSO redirect — return whatever cookies we have
+ raw_cookies = self._context.cookies()
+ logger.info("CERNSSOProvider: no SSO redirect for %s, returning browser cookies", url)
+ return Credentials(cookies=raw_cookies)
if not self._fill_login_form(page):
return None
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
index c084f9ebf..4e9affb71 100644
--- a/src/data_manager/collectors/scrapers/items.py
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -61,6 +61,17 @@ class WebPageItem(BasePageItem):
"""
pass
+class DiscourseTopicPageItem(BasePageItem):
+ """
+ Discourse topic item.
+ Carries topic-level metadata from the category JSON listing —
+ useful for naming, filtering, and status tracking in the adapter.
+ """
+ topic_id = scrapy.Field()
+ slug = scrapy.Field()
+ has_accepted_answer = scrapy.Field()
+ created_at = scrapy.Field()
+ tags = scrapy.Field()
class IndicoPageItem(BasePageItem):
"""
diff --git a/src/data_manager/collectors/scrapers/pipelines.py b/src/data_manager/collectors/scrapers/pipelines.py
index f8fa138eb..94ab0cf14 100644
--- a/src/data_manager/collectors/scrapers/pipelines.py
+++ b/src/data_manager/collectors/scrapers/pipelines.py
@@ -113,7 +113,6 @@ def process_item(self, item, spider: Spider):
"""
try:
resource = to_scraped_resource(item)
- resource.source_type = "web"
resource.metadata["spider_name"] = spider.name
except Exception as exc:
self._error_count += 1
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
new file mode 100644
index 000000000..ffb6f26e9
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -0,0 +1,166 @@
+"""
+Discourse spider — recursive JSON pagination, no link following.
+
+Seed: GET /c/{path}.json → first page of each category
+Recur: GET more_topics_url (from JSON) → next page (until exhausted)
+Fan-out: each topic → GET /t/{slug}/{id}.rss → yield WebPageItem
+"""
+from __future__ import annotations
+
+import re
+import json
+from typing import Any, Iterator, List, Optional
+from urllib.parse import urljoin
+
+from scrapy import Spider
+from scrapy.http import Request, Response, TextResponse
+
+from src.data_manager.collectors.scrapers.items import DiscourseTopicPageItem
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class DiscourseSpider(Spider):
+ name = "discourse"
+
+ _DEFAULT_BASE_URL = "https://cms-talk.web.cern.ch"
+ _DEFAULT_CATEGORY_PATHS: List[str] = [
+ "/c/offcomp/comptools/87",
+ ]
+
+ auth_provider_name = "cern_sso"
+
+ custom_settings = {
+ "ROBOTSTXT_OBEY": False,
+ "DOWNLOAD_DELAY": 60,
+ "RETRY_TIMES": 2,
+ "COOKIES_ENABLED": True,
+ }
+
+ def __init__(
+ self,
+ base_url: Optional[str] = None,
+ category_paths: Optional[List[str]] = None,
+ keywords: Optional[str] = None,
+ *args: Any,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+ self.base_url = (base_url or self._DEFAULT_BASE_URL).rstrip("/")
+ self.category_paths = category_paths or self._DEFAULT_CATEGORY_PATHS
+ self.keywords_re = re.compile(keywords) if keywords else None
+
+ # ── Seeds: one request per category (page 0) ────────────────────────
+ async def start(self):
+ for path in self.category_paths:
+ path = path.strip("/")
+ url = f"{self.base_url}/{path}.json"
+ yield Request(
+ url=url,
+ callback=self.parse_category,
+ errback=self.errback,
+ meta={"category_path": path},
+ )
+
+ # ── Category JSON → topic RSS requests + next page ──────────────────
+ def parse_category(self, response: Response) -> Iterator[Request]:
+ try:
+ data = json.loads(response.text)
+ except (json.JSONDecodeError, AttributeError) as exc:
+ logger.error("Failed to parse category JSON %s: %s", response.url, exc)
+ return
+
+ topic_list = data.get("topic_list", {})
+ topics = topic_list.get("topics", []) or []
+ category_path = response.meta.get("category_path", "?")
+ logger.info(
+ "Category %s returned %d topics (%s)",
+ category_path, len(topics), response.url,
+ )
+
+ for topic in topics:
+ slug = topic.get("slug", "")
+ topic_id = topic.get("id")
+ if not slug or not topic_id:
+ continue
+ rss_url = f"{self.base_url}/t/{slug}/{topic_id}.rss"
+ yield Request(
+ url=rss_url,
+ callback=self.parse_topic,
+ errback=self.errback,
+ meta={
+ "topic_id": topic_id,
+ "slug": slug,
+ "title": topic.get("title", f"{slug} ({topic_id})"),
+ "tags": topic.get("tags", []),
+ "has_accepted_answer": topic.get("has_accepted_answer", False),
+ "created_at": topic.get("created_at", ""),
+ },
+ )
+
+ # Recurse: follow more_topics_url if present
+ more_url = topic_list.get("more_topics_url")
+ if more_url:
+ next_url = urljoin(response.url, more_url)
+ if ".json" not in next_url:
+ # Insert .json before the query string:
+ # /c/.../87?page=1 → /c/.../87.json?page=1
+ if "?" in next_url:
+ path, qs = next_url.split("?", 1)
+ next_url = f"{path}.json?{qs}"
+ else:
+ next_url += ".json"
+ yield Request(
+ url=next_url,
+ callback=self.parse_category,
+ errback=self.errback,
+ meta={"category_path": category_path},
+ )
+ else:
+ logger.info("Category %s exhausted (no more_topics_url)", category_path)
+
+ def _content_matches_keywords(self, text: str) -> bool:
+ """No keywords pattern means accept everything."""
+ if self.keywords_re is None:
+ return True
+ return bool(self.keywords_re.search(text))
+
+ # ── Topic RSS → DiscourseTopicPageItem ───────────────────────────────
+ def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
+ if not isinstance(response, TextResponse):
+ logger.debug("Skipping non-text response: %s", response.url)
+ return
+
+ if not self._content_matches_keywords(response.text):
+ logger.debug("Skipping topic (no keyword match): %s", response.url)
+ return
+
+ slug = response.meta.get("slug", "")
+ topic_id = response.meta.get("topic_id", "")
+ title = response.meta.get("title", "")
+ tags = response.meta.get("tags", [])
+
+ yield DiscourseTopicPageItem(
+ url=response.url,
+ content=response.text,
+ suffix="rss",
+ source_type="web",
+ title=title,
+ content_type=response.headers.get("Content-Type", b"").decode(
+ "utf-8", errors="replace"
+ ),
+ encoding=response.encoding or "utf-8",
+ topic_id=topic_id,
+ slug=slug,
+ tags=tags,
+ has_accepted_answer=response.meta.get("has_accepted_answer", False),
+ created_at=response.meta.get("created_at", ""),
+ )
+
+ def errback(self, failure):
+ self.logger.error(
+ "Request failed: %s — %s",
+ failure.request.url,
+ repr(failure.value),
+ )
\ No newline at end of file
From 3f12867195f42b9c88b2a9a0a0760d389bcc03ae Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 04:10:08 +0200
Subject: [PATCH 43/55] [Discourse] refined interfaces and config example.
---
.../deployments/basic-scraping/config.yaml | 14 ++++++++++
src/cli/templates/base-config.yaml | 23 ++++++++++++---
.../collectors/scrapers/spiders/discourse.py | 28 +++++++++++++++----
3 files changed, 55 insertions(+), 10 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 85f83e617..7b19198f5 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -71,6 +71,20 @@ data_manager:
max_depth: 2
max_pages: 500
delay: 60
+ discourse:
+ auth_provider_name: cern_sso
+ base_url: https://cms-talk.web.cern.ch
+ delay: 10
+ max_pages: 1000
+ category_paths:
+ - /c/offcomp/comptools/87
+ keywords:
+ - "Stefano Belforte"
+ - "Katy Ellis"
+ - "Krittin Phornsiricharoenphant"
+ - "Vijay Chakravarty"
+ - "Dario Mapelli"
+ - "Thanayut Seethongchuen"
git:
urls:
- https://github.com/dmwm/CRABServer
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index f7de9d62c..d59ba21ec 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -218,12 +218,27 @@ data_manager:
allow: {{ data_manager.sources.web.twiki.allow | default([], true) | tojson }}
deny: {{ data_manager.sources.web.twiki.deny | default([], true) | tojson }}
input_lists:
- {%- for l in data_manager.sources.web.twiki.input_lists | default([], true) %}
- - {{ l }}
+ {%- for list in data_manager.sources.web.twiki.input_lists | default([], true) %}
+ - {{ list }}
{%- endfor %}
urls:
- {%- for u in data_manager.sources.web.twiki.urls | default([], true) %}
- - {{ u }}
+ {%- for url in data_manager.sources.web.twiki.urls | default([], true) %}
+ - {{ url }}
+ {%- endfor %}
+ discourse:
+ enabled: {{ data_manager.sources.web.discourse.enabled | default(true, true) }}
+ auth_provider_name: {{ data_manager.sources.web.discourse.auth_provider_name | default("cern_sso", true) }}
+ schedule: '{{ data_manager.sources.web.discourse.schedule | default("", true) }}'
+ max_pages: {{ data_manager.sources.web.discourse.max_pages | default(500, true) }}
+ delay: {{ data_manager.sources.web.discourse.delay | default(10, true) }}
+ base_url: {{ data_manager.sources.web.discourse.base_url | default("https://cms-talk.web.cern.ch", true) }}
+ keywords:
+ {%- for keyword in data_manager.sources.web.discourse.keywords | default([], true) %}
+ - {{ keyword }}
+ {%- endfor %}
+ category_paths:
+ {%- for category_path in data_manager.sources.web.discourse.category_paths | default([], true) %}
+ - {{ category_path }}
{%- endfor %}
git:
enabled: {{ data_manager.sources.git.enabled | default(true, true) }}
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index ffb6f26e9..8b0f6bd74 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -3,7 +3,7 @@
Seed: GET /c/{path}.json → first page of each category
Recur: GET more_topics_url (from JSON) → next page (until exhausted)
-Fan-out: each topic → GET /t/{slug}/{id}.rss → yield WebPageItem
+Fan-out: each topic → GET /t/{slug}/{id}.rss → yield DiscourseTopicPageItem
"""
from __future__ import annotations
@@ -33,23 +33,39 @@ class DiscourseSpider(Spider):
custom_settings = {
"ROBOTSTXT_OBEY": False,
- "DOWNLOAD_DELAY": 60,
+ "DOWNLOAD_DELAY": 10, # default polite delay (seconds)
"RETRY_TIMES": 2,
"COOKIES_ENABLED": True,
+ "CLOSESPIDER_PAGECOUNT": 500, # safety cap on total responses
+ "CLOSESPIDER_ITEMCOUNT": 0, # 0 = no item-count limit
}
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ delay = kwargs.get("delay")
+ max_pages = kwargs.get("max_pages")
+ if delay:
+ crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
+ if max_pages:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ return super().from_crawler(crawler, *args, **kwargs)
+
def __init__(
self,
base_url: Optional[str] = None,
category_paths: Optional[List[str]] = None,
- keywords: Optional[str] = None,
+ keywords: Optional[List[str]] = None,
+ delay: Optional[int] = None,
+ max_pages: Optional[int] = None,
*args: Any,
**kwargs: Any,
) -> None:
super().__init__(*args, **kwargs)
self.base_url = (base_url or self._DEFAULT_BASE_URL).rstrip("/")
self.category_paths = category_paths or self._DEFAULT_CATEGORY_PATHS
- self.keywords_re = re.compile(keywords) if keywords else None
+ self.keywords_re: List[re.Pattern] = [
+ re.compile(kw, re.IGNORECASE) for kw in (keywords or [])
+ ]
# ── Seeds: one request per category (page 0) ────────────────────────
async def start(self):
@@ -122,9 +138,9 @@ def parse_category(self, response: Response) -> Iterator[Request]:
def _content_matches_keywords(self, text: str) -> bool:
"""No keywords pattern means accept everything."""
- if self.keywords_re is None:
+ if not self.keywords_re:
return True
- return bool(self.keywords_re.search(text))
+ return any(pattern.search(text) for pattern in self.keywords_re)
# ── Topic RSS → DiscourseTopicPageItem ───────────────────────────────
def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
From 621a5a3b4122961714aded14e460b6b29cc1ea00 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 05:03:44 +0200
Subject: [PATCH 44/55] [Discourse] ScraperManager now supports iterator-based
 Spider, no depth limit
---
.../deployments/basic-scraping/config.yaml | 37 +------------------
.../collectors/scraper_manager.py | 4 ++
.../collectors/scrapers/spiders/discourse.py | 1 +
3 files changed, 6 insertions(+), 36 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 7b19198f5..bf40dbe8f 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -40,42 +40,11 @@ data_manager:
embedding_name: HuggingFaceEmbeddings
sources:
web:
- link:
- allow:
- - ".*/blog/.*"
- - ".*quotes.*"
- - ".*/(page|author)/.*"
- urls: # Meant for Demo scraping
- - https://ppc.mit.edu/news/
- max_depth: 2
- max_pages: 100
- delay: 10
- input_lists:
- - examples/deployments/basic-scraping/miscellanea.list
- twiki:
- auth_provider_name: cern_sso # remove if crawling public pages only
- urls:
- - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
- - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons
- allow:
- - ".*CRAB3.*"
- - ".*SWGuide.*"
- - ".*WorkBook.*"
- - ".*Crab.*"
- - ".*Crab3.*"
- - ".*HeavyIons.*"
- - ".*HICollisions.*"
- - ".*HIRel.*"
- deny:
- - ".*WorkBook.*"
- max_depth: 2
- max_pages: 500
- delay: 60
discourse:
auth_provider_name: cern_sso
base_url: https://cms-talk.web.cern.ch
delay: 10
- max_pages: 1000
+ max_pages: 200
category_paths:
- /c/offcomp/comptools/87
keywords:
@@ -85,7 +54,3 @@ data_manager:
- "Vijay Chakravarty"
- "Dario Mapelli"
- "Thanayut Seethongchuen"
- git:
- urls:
- - https://github.com/dmwm/CRABServer
- - https://github.com/dmwm/CRABClient
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index 69a0093a8..b84f94dbb 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -124,6 +124,10 @@ def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
logger.warning("Input list not found: %s", path)
continue
urls.extend(self._extract_urls_from_file(path))
+ # Discourse (and similar API/Iterative spiders) don't use start_urls;
+ # category_paths or base_url signals the spider is configured.
+ if not urls and (cfg.get("category_paths") or cfg.get("base_url")):
+ urls = ["__api_spider__"]
return urls
def _catalog_urls(self, spider_key: str, cfg: Dict) -> List[str]:
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index 8b0f6bd74..1eff11e82 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -38,6 +38,7 @@ class DiscourseSpider(Spider):
"COOKIES_ENABLED": True,
"CLOSESPIDER_PAGECOUNT": 500, # safety cap on total responses
"CLOSESPIDER_ITEMCOUNT": 0, # 0 = no item-count limit
+ "DEPTH_LIMIT": 0, # 0 = no limit; pagination is not link depth tracking
}
@classmethod
From c559664e156b827910b4811c2cc94692f4f51beb Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 05:06:55 +0200
Subject: [PATCH 45/55] [Discourse] bring back full example
---
.../deployments/basic-scraping/config.yaml | 33 ++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index bf40dbe8f..28d989d74 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -40,11 +40,38 @@ data_manager:
embedding_name: HuggingFaceEmbeddings
sources:
web:
+ link:
+ urls:
+ - https://ppc.mit.edu/news/
+ max_depth: 2
+ max_pages: 100
+ delay: 10
+ input_lists:
+ - examples/deployments/basic-scraping/miscellanea.list
+ twiki:
+ auth_provider_name: cern_sso # remove if crawling public pages only
+ urls:
+ - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
+ - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons
+ allow:
+ - ".*CRAB3.*"
+ - ".*SWGuide.*"
+ - ".*WorkBook.*"
+ - ".*Crab.*"
+ - ".*Crab3.*"
+ - ".*HeavyIons.*"
+ - ".*HICollisions.*"
+ - ".*HIRel.*"
+ deny:
+ - ".*WorkBook.*"
+ max_depth: 2
+ max_pages: 500
+ delay: 60
discourse:
auth_provider_name: cern_sso
base_url: https://cms-talk.web.cern.ch
delay: 10
- max_pages: 200
+ max_pages: 1000
category_paths:
- /c/offcomp/comptools/87
keywords:
@@ -54,3 +81,7 @@ data_manager:
- "Vijay Chakravarty"
- "Dario Mapelli"
- "Thanayut Seethongchuen"
+ git:
+ urls:
+ - https://github.com/dmwm/CRABServer
+ - https://github.com/dmwm/CRABClient
From 995f2423b39b924512673699d5c338155bfd704c Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 05:18:45 +0200
Subject: [PATCH 46/55] [Discourse] workaround: store RSS as HTML, the best-
 supported format of the vectorstore manager.
---
src/data_manager/collectors/scrapers/spiders/discourse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index 1eff11e82..daae39586 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -161,7 +161,7 @@ def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
yield DiscourseTopicPageItem(
url=response.url,
content=response.text,
- suffix="rss",
+ suffix="html", # Workaround: vectorstore manager was not supporting RSS feed yet.
source_type="web",
title=title,
content_type=response.headers.get("Content-Type", b"").decode(
From 7a1c6f5d2bdaa5cf61f07e1408524d1a10c0c551 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Thu, 2 Apr 2026 05:24:04 +0200
Subject: [PATCH 47/55] [Discourse] scraped resource URL should not include .rss
---
src/data_manager/collectors/scrapers/spiders/discourse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index daae39586..890205d78 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -159,7 +159,7 @@ def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
tags = response.meta.get("tags", [])
yield DiscourseTopicPageItem(
- url=response.url,
+ url=response.url.replace(".rss", ""),
content=response.text,
suffix="html", # Workaround: vectorstore manager was not supporting RSS feed yet.
source_type="web",
From c75e8e5d8f43b0fbbeddd541d42227577054c06f Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Fri, 3 Apr 2026 00:18:40 +0200
Subject: [PATCH 48/55] scrapers support built-in anonymization
---
.../deployments/basic-scraping/config.yaml | 13 +++++++++
src/cli/templates/base-config.yaml | 3 ++
.../scrapers/middlewares/__init__.py | 0
.../auth_downloader.py} | 0
.../collectors/scrapers/pipelines/__init__.py | 0
.../scrapers/pipelines/anonymization.py | 28 +++++++++++++++++++
.../persistence.py} | 0
.../collectors/scrapers/settings.py | 5 ++--
.../collectors/scrapers/spiders/discourse.py | 7 +++--
.../collectors/scrapers/spiders/link.py | 3 ++
src/data_manager/vectorstore/loader_utils.py | 2 +-
11 files changed, 56 insertions(+), 5 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/middlewares/__init__.py
rename src/data_manager/collectors/scrapers/{middlewares.py => middlewares/auth_downloader.py} (100%)
create mode 100644 src/data_manager/collectors/scrapers/pipelines/__init__.py
create mode 100644 src/data_manager/collectors/scrapers/pipelines/anonymization.py
rename src/data_manager/collectors/scrapers/{pipelines.py => pipelines/persistence.py} (100%)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 28d989d74..b1077c819 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -50,6 +50,7 @@ data_manager:
- examples/deployments/basic-scraping/miscellanea.list
twiki:
auth_provider_name: cern_sso # remove if crawling public pages only
+ anonymize_data: true
urls:
- https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
- https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons
@@ -66,12 +67,14 @@ data_manager:
- ".*WorkBook.*"
max_depth: 2
max_pages: 500
+ anonymize_data: true
delay: 60
discourse:
auth_provider_name: cern_sso
base_url: https://cms-talk.web.cern.ch
delay: 10
max_pages: 1000
+ anonymize_data: true
category_paths:
- /c/offcomp/comptools/87
keywords:
@@ -85,3 +88,13 @@ data_manager:
urls:
- https://github.com/dmwm/CRABServer
- https://github.com/dmwm/CRABClient
+ utils:
+ anonymizer:
+ nlp_model: en_core_web_sm
+ excluded_words:
+ - Krittin
+ - Vijay
+ - Dario
+ - Thanayut
+ - Stefano
+ - Katy
\ No newline at end of file
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index d59ba21ec..294aa90fb 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -200,6 +200,7 @@ data_manager:
delay: {{ data_manager.sources.web.link.delay | default(1, true) }}
allow: {{ data_manager.sources.web.link.allow | default([], true) | tojson }}
deny: {{ data_manager.sources.web.link.deny | default([], true) | tojson }}
+ anonymize_data: {{ data_manager.sources.web.link.anonymize_data | default(false, true) }}
input_lists:
{%- for l in data_manager.sources.web.link.input_lists | default([], true) %}
- {{ l }}
@@ -217,6 +218,7 @@ data_manager:
delay: {{ data_manager.sources.web.twiki.delay | default(60, true) }}
allow: {{ data_manager.sources.web.twiki.allow | default([], true) | tojson }}
deny: {{ data_manager.sources.web.twiki.deny | default([], true) | tojson }}
+ anonymize_data: {{ data_manager.sources.web.discourse.anonymize_data | default(false, true) }}
input_lists:
{%- for list in data_manager.sources.web.twiki.input_lists | default([], true) %}
- {{ list }}
@@ -231,6 +233,7 @@ data_manager:
schedule: '{{ data_manager.sources.web.discourse.schedule | default("", true) }}'
max_pages: {{ data_manager.sources.web.discourse.max_pages | default(500, true) }}
delay: {{ data_manager.sources.web.discourse.delay | default(10, true) }}
+ anonymize_data: {{ data_manager.sources.web.discourse.anonymize_data | default(false, true) }}
base_url: {{ data_manager.sources.web.discourse.base_url | default("https://cms-talk.web.cern.ch", true) }}
keywords:
{%- for keyword in data_manager.sources.web.discourse.keywords | default([], true) %}
diff --git a/src/data_manager/collectors/scrapers/middlewares/__init__.py b/src/data_manager/collectors/scrapers/middlewares/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/middlewares.py b/src/data_manager/collectors/scrapers/middlewares/auth_downloader.py
similarity index 100%
rename from src/data_manager/collectors/scrapers/middlewares.py
rename to src/data_manager/collectors/scrapers/middlewares/auth_downloader.py
diff --git a/src/data_manager/collectors/scrapers/pipelines/__init__.py b/src/data_manager/collectors/scrapers/pipelines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/pipelines/anonymization.py b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
new file mode 100644
index 000000000..1328b24e5
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
@@ -0,0 +1,28 @@
+from typing import TYPE_CHECKING
+
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.data_manager.collectors.scrapers.items import BasePageItem
+
+from scrapy import Spider
+
+
+class AnonymizationPipeline:
+ """Runs at priority 250, before PersistencePipeline (300)."""
+
+ def __init__(self, anonymizer: Anonymizer) -> None:
+ self._anonymizer = anonymizer
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ enabled = crawler.settings.getbool("ANONYMIZE_DATA", True)
+ if not enabled:
+ return cls(anonymizer=None)
+ return cls(anonymizer=Anonymizer()) # type: ignore
+
+ def process_item(self, item: BasePageItem, spider: Spider) -> BasePageItem:
+ if self._anonymizer is not None:
+ if isinstance(item.get("content"), str):
+ item["content"] = self._anonymizer.anonymize(item["content"])
+ if isinstance(item.get("title"), str):
+ item["title"] = self._anonymizer.anonymize(item["title"])
+ return item
diff --git a/src/data_manager/collectors/scrapers/pipelines.py b/src/data_manager/collectors/scrapers/pipelines/persistence.py
similarity index 100%
rename from src/data_manager/collectors/scrapers/pipelines.py
rename to src/data_manager/collectors/scrapers/pipelines/persistence.py
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index 1bd678547..c9b6224dd 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -80,7 +80,7 @@
# Middlewares, Pipelines and Extensions Priorities
# ---------------------------------------------------------------------------
DOWNLOADER_MIDDLEWARES = {
- "src.data_manager.collectors.scrapers.middlewares.AuthDownloaderMiddleware": 500,
+ "src.data_manager.collectors.scrapers.middlewares.auth_downloader.AuthDownloaderMiddleware": 500,
"scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
# RedirectMiddleware stays at its default 600 — no entry needed
}
@@ -93,7 +93,8 @@
}
ITEM_PIPELINES = {
- "src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300,
+ "src.data_manager.collectors.scrapers.pipelines.anonymization.AnonymizationPipeline": 250,
+ "src.data_manager.collectors.scrapers.pipelines.persistence.PersistencePipeline": 300,
}
EXTENSIONS = {
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index 890205d78..64f90cbd1 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -45,10 +45,13 @@ class DiscourseSpider(Spider):
def from_crawler(cls, crawler, *args, **kwargs):
delay = kwargs.get("delay")
max_pages = kwargs.get("max_pages")
+ anonymize_data = kwargs.get("anonymize_data")
if delay:
crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
if max_pages:
crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ if anonymize_data:
+ crawler.settings.set("ANONYMIZE_DATA", anonymize_data, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
def __init__(
@@ -159,9 +162,9 @@ def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
tags = response.meta.get("tags", [])
yield DiscourseTopicPageItem(
- url=response.url.replace(".rss", ""),
+ url=response.url.replace(".rss", ""), # This will redirect to normal browser view of the topic.
content=response.text,
- suffix="html", # Workaround: vectorstore manager was not supporting RSS feed yet.
+ suffix="rss",
source_type="web",
title=title,
content_type=response.headers.get("Content-Type", b"").decode(
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index f237b722b..e2cd0ae11 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -29,12 +29,15 @@ def from_crawler(cls, crawler, *args, **kwargs):
max_depth = kwargs.get("max_depth")
max_pages = kwargs.get("max_pages")
delay = kwargs.get("delay")
+ anonymize_data = kwargs.get("anonymize_data")
if max_depth:
crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
if max_pages:
crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
if delay:
crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
+ if anonymize_data:
+ crawler.settings.set("anonymize_data", anonymize_data, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, process_value: Callable[[str], str] = None, *args, **kwargs):
diff --git a/src/data_manager/vectorstore/loader_utils.py b/src/data_manager/vectorstore/loader_utils.py
index 622386439..6aea94536 100644
--- a/src/data_manager/vectorstore/loader_utils.py
+++ b/src/data_manager/vectorstore/loader_utils.py
@@ -28,7 +28,7 @@ def select_loader(file_path: str | Path):
return TextLoader(str(path))
if file_extension == ".py":
return PythonLoader(str(path))
- if file_extension in {".html", ".htm"}:
+ if file_extension in {".html", ".htm", ".rss", ".xml"}:
return BSHTMLLoader(str(path), bs_kwargs={"features": "html.parser"})
if file_extension == ".pdf":
return PyPDFLoader(str(path))
From 8aec27e489032a2f34d78f0b25208544453b92d2 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sat, 4 Apr 2026 23:02:45 +0200
Subject: [PATCH 49/55] [Anonymizer][Discourse, Twiki] cover HTML and RSS markup
as much as possible; should be refactored later to redact structured and
unstructured content separately.
---
.../deployments/basic-scraping/config.yaml | 10 +-
.../collectors/scraper_manager.py | 5 +-
.../scrapers/pipelines/anonymization.py | 39 ++++-
.../collectors/scrapers/spiders/discourse.py | 7 +-
.../collectors/scrapers/spiders/twiki.py | 10 +-
.../collectors/utils/anonymizer.py | 153 +++++++++++++++---
src/data_manager/data_manager.py | 4 +-
7 files changed, 181 insertions(+), 47 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index b1077c819..425003053 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -77,6 +77,7 @@ data_manager:
anonymize_data: true
category_paths:
- /c/offcomp/comptools/87
+ - /c/offcomp/ais/150
keywords:
- "Stefano Belforte"
- "Katy Ellis"
@@ -90,11 +91,4 @@ data_manager:
- https://github.com/dmwm/CRABClient
utils:
anonymizer:
- nlp_model: en_core_web_sm
- excluded_words:
- - Krittin
- - Vijay
- - Dario
- - Thanayut
- - Stefano
- - Katy
\ No newline at end of file
+ nlp_model: en_core_web_sm
\ No newline at end of file
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index b84f94dbb..eb36b7b28 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -9,6 +9,7 @@
from scrapy.spiderloader import SpiderLoader
from scrapy.settings import Settings
from scrapy import Spider
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
from src.data_manager.collectors.persistence import PersistenceService
from src.utils.config_access import get_global_config
from src.utils.logging import get_logger
@@ -33,10 +34,11 @@ class ScraperManager:
SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
"""
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None) -> None:
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None) -> None:
global_config = get_global_config()
self.data_path = Path(global_config["DATA_PATH"])
self.persistence = persistence
+ self.anonymizer = anonymizer
self.settings = Settings()
self.settings.setmodule(
"src.data_manager.collectors.scrapers.settings",
@@ -112,6 +114,7 @@ def _add_crawler(
# Inject persistence objects — live Python instances, must be priority="spider"
crawler.settings.set("PERSISTENCE_SERVICE", self.persistence, priority="spider")
crawler.settings.set("PERSISTENCE_OUTPUT_DIR", self.data_path / "websites", priority="spider")
+ crawler.settings.set("ANONYMIZER_SERVICE", self.anonymizer, priority="spider")
process.crawl(crawler, start_urls=urls, **cfg)
# ── URL sources & list parsing ──────────────────────────────────────────────────────
diff --git a/src/data_manager/collectors/scrapers/pipelines/anonymization.py b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
index 1328b24e5..592b74e80 100644
--- a/src/data_manager/collectors/scrapers/pipelines/anonymization.py
+++ b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
@@ -4,25 +4,51 @@
 from src.data_manager.collectors.scrapers.items import BasePageItem
 from scrapy import Spider
+from scrapy.exceptions import NotConfigured
+from src.utils.logging import get_logger
+logger = get_logger(__name__)
class AnonymizationPipeline:
"""Runs at priority 250, before PersistencePipeline (300)."""
+ _DEFAULT_ANONYMIZER_CONFIG = {
+ "utils": {
+ "anonymizer": {
+ "nlp_model": "en_core_web_sm",
+ "excluded_words": ["John", "Jane", "Doe"],
+ "greeting_patterns": [
+ r"^(hi|hello|hey|greetings|dear)\b",
+ r"^\w+,\s*",
+ ],
+ "signoff_patterns": [
+ r"\b(regards|sincerely|best regards|cheers|thank you)\b",
+ r"^\s*[-~]+\s*$",
+ ],
+ "email_pattern": r"[\w\.-]+@[\w\.-]+\.\w+",
+ "username_pattern": r"\[~[^\]]+\]",
+ }
+ }
+ }
+
def __init__(self, anonymizer: Anonymizer) -> None:
self._anonymizer = anonymizer
@classmethod
def from_crawler(cls, crawler):
enabled = crawler.settings.getbool("ANONYMIZE_DATA", True)
+ anonymizer = crawler.settings.get("ANONYMIZER_SERVICE")
if not enabled:
- return cls(anonymizer=None)
- return cls(anonymizer=Anonymizer()) # type: ignore
+ raise NotConfigured("Anonymization is disabled")
+ if anonymizer is None:
+ # when we use scrapy cmd, we don't have the anonymizer service provided
+ dm_config = cls._DEFAULT_ANONYMIZER_CONFIG
+ return cls(anonymizer=Anonymizer(dm_config))
+ return cls(anonymizer=anonymizer)
def process_item(self, item: BasePageItem, spider: Spider) -> BasePageItem:
- if self._anonymizer is not None:
- if isinstance(item.get("content"), str):
- item["content"] = self._anonymizer.anonymize(item["content"])
- if isinstance(item.get("title"), str):
- item["title"] = self._anonymizer.anonymize(item["title"])
+ if isinstance(item.get("content"), str):
+ item["content"] = self._anonymizer.anonymize_markup(item["content"])
+ if isinstance(item.get("title"), str):
+ item["title"] = self._anonymizer.anonymize(item["title"])
return item
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index 64f90cbd1..5a35ea2b5 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -26,7 +26,7 @@ class DiscourseSpider(Spider):
_DEFAULT_BASE_URL = "https://cms-talk.web.cern.ch"
_DEFAULT_CATEGORY_PATHS: List[str] = [
- "/c/offcomp/comptools/87",
+ "/c/offcomp/ais/150",
]
auth_provider_name = "cern_sso"
@@ -85,6 +85,11 @@ async def start(self):
# ── Category JSON → topic RSS requests + next page ──────────────────
def parse_category(self, response: Response) -> Iterator[Request]:
+ """
+ @url https://cms-talk.web.cern.ch/tags/c/offcomp/ais/150.json
+ @returns requests 100
+ @scrapes url title content
+ """
try:
data = json.loads(response.text)
except (json.JSONDecodeError, AttributeError) as exc:
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
index d437c9d7e..d43314ddb 100644
--- a/src/data_manager/collectors/scrapers/spiders/twiki.py
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -13,9 +13,11 @@ class TwikiSpider(LinkSpider):
"""
name = "twiki"
+
+ auth_provider_name = "cern_sso"
_DEFAULT_START_URLS = [
- "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide", # public page
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideCrab", # public page
]
_DEFAULT_DENY = [
@@ -51,7 +53,7 @@ class TwikiSpider(LinkSpider):
"RETRY_TIMES": 0, # Very Safe no retries
"DEPTH_LIMIT": 1, # Default max depth
"DOWNLOAD_DELAY": 60, # Default (download) delay
- "CLOSESPIDER_PAGECOUNT": 1, # Very Safe Default max pages
+ "CLOSESPIDER_PAGECOUNT": 2, # Very Safe Default max pages
"COOKIES_ENABLED": False, # disable CookiesMiddleware jar
}
@@ -65,10 +67,10 @@ def _normalize_url(url: str) -> str:
def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
"""
Twiki pages render their main content inside #patternMain or .twikiMain.
- @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
+ @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideCrab
@returns items 1 1
@scrapes url title
- @returns requests 1 100
+ @returns requests 1 105
"""
yield from super().parse(response)
diff --git a/src/data_manager/collectors/utils/anonymizer.py b/src/data_manager/collectors/utils/anonymizer.py
index 72ac00456..782709ccd 100644
--- a/src/data_manager/collectors/utils/anonymizer.py
+++ b/src/data_manager/collectors/utils/anonymizer.py
@@ -3,20 +3,71 @@
"""
import re
-from typing import List, Set
+from typing import List, Set, Dict, Any
import spacy
from src.utils.config_access import get_data_manager_config
+from html import unescape
+
+# Generic markup patterns
+_TAG_RE = re.compile(r"<[^>]+>")
+_CDATA_RE = re.compile(r"")
+_DC_CREATOR_RE = re.compile(
+ r'( )',
+ re.IGNORECASE,
+)
+_ATTR_TEXT_RE = re.compile(r'(?:title|alt|creator|author)=["\']([^"\']+)["\']', re.IGNORECASE)
+_CONTENT_TAG_RE = re.compile(
+ r'<(?:p|li|td|description|title|dc:creator)[^>]*>(.*?)(?:p|li|td|description|title|dc:creator)>',
+ re.DOTALL | re.IGNORECASE,
+)
+# Albert-Einstein → (removed)
+_DEFAULT_GENERIC_MARKUP_USER_LINK_RE = re.compile(
+ r']*href="[^"]*?/(?:Main|author|user|profile|members)/[^"]*"[^>]*>[^<]* ',
+ re.IGNORECASE,
+)
+# Generic author link, like Albert-Einstein
+# Stephenie Meyer
+# John Doe
+# Jane Smith
+# Bob
+_DEFAULT_GENERIC_MARKUP_AUTHOR_ELEMENT_RE = re.compile(
+ r'<[^>]*(?:itemprop=["\']author["\']|class=["\'][^"\']*\bauthor\b[^"\']*["\']|rel=["\']author["\'])[^>]*>[^<]*[^>]+>',
+ re.IGNORECASE,
+)
+# JohnDoe → (removed)
+_DEFAULT_MARKUP_TWIKI_USER_LINK_RE = re.compile(
+ r']*href="[^"]*?/twiki/bin/\w+/Main/\w+"[^>]*>\w+ ',
+ re.IGNORECASE,
+)
+# John
→ (removed)
+# John Doe
→ (removed)
+_DEFAULT_MARKUP_SIGNOFF_TAG_RE = re.compile(
+ r'\s*(?: )?\s*[A-Z][\w.]*(?:\s+[A-Z][\w.]*){0,2}\s*
',
+ re.IGNORECASE,
+)
+# ..atm \nJohn
→ ..atm
+# Thanks\John →
+# Yours sincerely,\nJ.D.Doe]]> → ]]>
+_DEFAULT_MARKUP_TRAILING_SIGNOFF_TAG_RE = re.compile(
+ r'(?:'
+ r' \s*\n?\s*'
+ r'|(?:Thanks|Cheers|Best|Regards|HTH|Yours\s+sincerely)\s*,?\s*[\n\s]*'
+ r')'
+ r'[A-Z][\w.]*(?:\s+[A-Z][\w.]*){0,2}'
+ r'\s*(?=||\]\]>)',
+ re.IGNORECASE,
+)
class Anonymizer:
- def __init__(self):
+ def __init__(self, dm_config: Dict[str, Any]=None):
"""
Initialize the Anonymizer.
"""
- dm_config = get_data_manager_config()
+ dm_config = dm_config or get_data_manager_config()
data_manager_utils = dm_config.get("utils", {}) if isinstance(dm_config, dict) else {}
anonymizer_config = data_manager_utils.get("anonymizer", {}) if isinstance(data_manager_utils, dict) else {}
@@ -45,39 +96,91 @@ def __init__(self):
self.SIGNOFF_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in signoff_patterns]
self.EMAIL_PATTERN = re.compile(email_pattern)
self.USERNAME_PATTERN = re.compile(username_pattern)
+
+ def _discover_names(self, text: str) -> set:
+ """NER to discover names in the text."""
+ doc = self.nlp(text)
+ return {
+ ent.text for ent in doc.ents
+ if ent.label_ == "PERSON" and ent.text not in self.EXCLUDED_WORDS
+ }
+
+ def _discover_names_markup(self, markup: str) -> set:
+ # Full document: names with surrounding context (catches CDATA)
+ full_text = self._extract_text(markup)
+ names = self._discover_names(full_text)
+ # Per-chunk: focused paragraphs (catches standalone names in )
+ for chunk in self._extract_text_chunks(markup):
+ names |= self._discover_names(chunk)
+ return names
def anonymize(self, text: str) -> str:
"""
Anonymize names, emails, usernames, greetings, and sign-offs from the text.
"""
- doc = self.nlp(text)
- names_to_replace = {
- ent.text for ent in doc.ents
- if ent.label_ == "PERSON" and ent.text not in self.EXCLUDED_WORDS
- }
+ names_to_replace = self._discover_names(text)
# Remove email addresses and usernames
text = self.EMAIL_PATTERN.sub("", text)
text = self.USERNAME_PATTERN.sub("", text)
- # Remove greetings and sign-offs
+ text = self._strip_greetings_signoffs(text)
+ return self._replace_names(text, names_to_replace)
+
+ def anonymize_markup(self, markup: str) -> str:
+ """
+ Anonymize names, emails, usernames, greetings, and sign-offs from the markup.
+ including html, rss, and other markup formats. (especially twiki and discourse markup)
+ """
+ names_to_replace = self._discover_names_markup(markup)
+ # Remove email addresses and usernames
+ markup = self.EMAIL_PATTERN.sub("", markup)
+ markup = self.USERNAME_PATTERN.sub("", markup)
+ markup = _DC_CREATOR_RE.sub(r'\1\2', markup)
+ markup = _DEFAULT_GENERIC_MARKUP_AUTHOR_ELEMENT_RE.sub("", markup)
+ markup = _DEFAULT_GENERIC_MARKUP_USER_LINK_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_SIGNOFF_TAG_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_TRAILING_SIGNOFF_TAG_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_TWIKI_USER_LINK_RE.sub("", markup)
+ markup = self._strip_greetings_signoffs(markup)
+ return self._replace_names(markup, names_to_replace)
+
+ def _strip_greetings_signoffs(self, text: str) -> str:
lines = text.splitlines()
- filtered_lines: List[str] = []
+ filtered = []
for line in lines:
- stripped_line = line.strip()
- if any(p.match(stripped_line) for p in self.GREETING_PATTERNS):
+ stripped = line.strip()
+ if any(p.match(stripped) for p in self.GREETING_PATTERNS):
continue
- if any(p.match(stripped_line) for p in self.SIGNOFF_PATTERNS):
+ if any(p.match(stripped) for p in self.SIGNOFF_PATTERNS):
continue
- filtered_lines.append(line)
- text = "\n".join(filtered_lines)
-
- # Remove names (case-insensitive)
- for name in sorted(names_to_replace, key=len, reverse=True):
- pattern = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE)
- text = pattern.sub("", text)
-
- # Remove extra whitespace
- text = "\n".join(line for line in text.splitlines() if line.strip())
-
- return text
+ filtered.append(line)
+ return "\n".join(filtered)
+
+ def _replace_names(self, text: str, names: set) -> str:
+ for name in sorted(names, key=len, reverse=True):
+ text = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE).sub("", text)
+ return "\n".join(line for line in text.splitlines() if line.strip())
+
+ def _extract_text(self, markup: str) -> str:
+ """Strip markup to plain text for NER. Format-agnostic."""
+ attrs = " ".join(_ATTR_TEXT_RE.findall(markup))
+ clean = _CDATA_RE.sub(" ", markup)
+ clean = _TAG_RE.sub(" ", clean)
+ clean = unescape(clean)
+ return re.sub(r"\s+", " ", f"{clean} {attrs}").strip()
+
+ def _extract_text_chunks(self, markup: str) -> list:
+ chunks = []
+ # Text content from tags
+ for match in _CONTENT_TAG_RE.finditer(markup):
+ inner = _CDATA_RE.sub(" ", match.group(1))
+ clean = _TAG_RE.sub(" ", inner)
+ clean = unescape(clean).strip()
+ if clean:
+ chunks.append(clean)
+ # Text from attributes
+ attr_text = " ".join(_ATTR_TEXT_RE.findall(markup))
+ if attr_text.strip():
+ chunks.append(attr_text.strip())
+ return chunks
\ No newline at end of file
diff --git a/src/data_manager/data_manager.py b/src/data_manager/data_manager.py
index 4c987eec2..da169b4fd 100644
--- a/src/data_manager/data_manager.py
+++ b/src/data_manager/data_manager.py
@@ -2,6 +2,7 @@
from typing import Callable, Optional
from src.data_manager.collectors.persistence import PersistenceService
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
from src.data_manager.collectors.scraper_manager import ScraperManager
from src.data_manager.collectors.tickets.ticket_manager import TicketManager
from src.data_manager.collectors.localfile_manager import LocalFileManager
@@ -36,9 +37,10 @@ def __init__(self, *, run_ingestion: bool = True, factory=None):
raise RuntimeError("Static config missing sources_config; run deployment initialization first.")
self.config["data_manager"]["sources"] = static_config.sources_config
+ self.anonymizer = Anonymizer()
self.localfile_manager = LocalFileManager(dm_config=self.config["data_manager"])
self.git_manager = GitManager(dm_config=self.config["data_manager"])
- self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"], persistence=self.persistence)
+ self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"], persistence=self.persistence, anonymizer=self.anonymizer)
self.ticket_manager = TicketManager(dm_config=self.config["data_manager"])
self.vector_manager = VectorStoreManager(
From 8aa441d644330a2a0970a7d9da97e7214b2dad23 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Sun, 5 Apr 2026 21:54:30 +0200
Subject: [PATCH 50/55] [Markitdown] support straight forward markitdown with
second pass anonymized.
---
.../deployments/basic-scraping/config.yaml | 3 ++
src/cli/templates/base-config.yaml | 3 ++
.../collectors/scraper_manager.py | 5 ++-
.../scrapers/pipelines/markitdown.py | 43 +++++++++++++++++++
.../collectors/scrapers/settings.py | 1 +
.../collectors/scrapers/spiders/discourse.py | 3 ++
.../collectors/scrapers/spiders/link.py | 5 ++-
.../collectors/utils/markitdown_convertor.py | 32 ++++++++++++++
8 files changed, 93 insertions(+), 2 deletions(-)
create mode 100644 src/data_manager/collectors/scrapers/pipelines/markitdown.py
create mode 100644 src/data_manager/collectors/utils/markitdown_convertor.py
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index 425003053..d355703b2 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -46,6 +46,7 @@ data_manager:
max_depth: 2
max_pages: 100
delay: 10
+ markitdown: true
input_lists:
- examples/deployments/basic-scraping/miscellanea.list
twiki:
@@ -68,6 +69,7 @@ data_manager:
max_depth: 2
max_pages: 500
anonymize_data: true
+ markitdown: true
delay: 60
discourse:
auth_provider_name: cern_sso
@@ -75,6 +77,7 @@ data_manager:
delay: 10
max_pages: 1000
anonymize_data: true
+ markitdown: true
category_paths:
- /c/offcomp/comptools/87
- /c/offcomp/ais/150
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index 294aa90fb..13b7fccf8 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -201,6 +201,7 @@ data_manager:
allow: {{ data_manager.sources.web.link.allow | default([], true) | tojson }}
deny: {{ data_manager.sources.web.link.deny | default([], true) | tojson }}
anonymize_data: {{ data_manager.sources.web.link.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.link.markitdown | default(false, true) }}
input_lists:
{%- for l in data_manager.sources.web.link.input_lists | default([], true) %}
- {{ l }}
@@ -219,6 +220,7 @@ data_manager:
allow: {{ data_manager.sources.web.twiki.allow | default([], true) | tojson }}
deny: {{ data_manager.sources.web.twiki.deny | default([], true) | tojson }}
             anonymize_data: {{ data_manager.sources.web.twiki.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.twiki.markitdown | default(false, true) }}
input_lists:
{%- for list in data_manager.sources.web.twiki.input_lists | default([], true) %}
- {{ list }}
@@ -234,6 +236,7 @@ data_manager:
max_pages: {{ data_manager.sources.web.discourse.max_pages | default(500, true) }}
delay: {{ data_manager.sources.web.discourse.delay | default(10, true) }}
anonymize_data: {{ data_manager.sources.web.discourse.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.discourse.markitdown | default(false, true) }}
base_url: {{ data_manager.sources.web.discourse.base_url | default("https://cms-talk.web.cern.ch", true) }}
keywords:
{%- for keyword in data_manager.sources.web.discourse.keywords | default([], true) %}
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index eb36b7b28..1af709b1e 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -13,6 +13,7 @@
from src.data_manager.collectors.persistence import PersistenceService
from src.utils.config_access import get_global_config
from src.utils.logging import get_logger
+from src.data_manager.collectors.utils.markitdown_manager import MarkitdownManager
logger = get_logger(__name__)
@@ -34,11 +35,12 @@ class ScraperManager:
SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
"""
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None) -> None:
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None, markitdown_manager: MarkitdownManager = None) -> None:
global_config = get_global_config()
self.data_path = Path(global_config["DATA_PATH"])
self.persistence = persistence
self.anonymizer = anonymizer
+ self.markitdown_manager = markitdown_manager
self.settings = Settings()
self.settings.setmodule(
"src.data_manager.collectors.scrapers.settings",
@@ -115,6 +117,7 @@ def _add_crawler(
crawler.settings.set("PERSISTENCE_SERVICE", self.persistence, priority="spider")
crawler.settings.set("PERSISTENCE_OUTPUT_DIR", self.data_path / "websites", priority="spider")
crawler.settings.set("ANONYMIZER_SERVICE", self.anonymizer, priority="spider")
+ crawler.settings.set("MARKITDOWN_SERVICE", self.markitdown_manager, priority="spider")
process.crawl(crawler, start_urls=urls, **cfg)
# ── URL sources & list parsing ──────────────────────────────────────────────────────
diff --git a/src/data_manager/collectors/scrapers/pipelines/markitdown.py b/src/data_manager/collectors/scrapers/pipelines/markitdown.py
new file mode 100644
index 000000000..f5c09237b
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines/markitdown.py
@@ -0,0 +1,43 @@
+from scrapy import Spider
+from src.utils.logging import get_logger
+from src.data_manager.collectors.utils.markitdown_convertor import MarkitdownConvertor
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.data_manager.collectors.scrapers.pipelines.anonymization import AnonymizationPipeline
+from src.data_manager.collectors.scrapers.items import BasePageItem
+from scrapy.exceptions import NotConfigured
+
+logger = get_logger(__name__)
+
+class MarkitdownPipeline:
+ """Runs at priority 250, before PersistencePipeline (300)."""
+
+ def __init__(self, markitdown: MarkitdownConvertor, anonymizer: Anonymizer, anonymize_data: bool):
+ self._markitdown = markitdown
+ self._anonymizer = anonymizer
+ self._anonymize_data = anonymize_data
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ enabled = crawler.settings.getbool("MARKITDOWN_ENABLED", True)
+ markitdown_convertor = crawler.settings.get("MARKITDOWN_SERVICE")
+ anonymizer = crawler.settings.get("ANONYMIZER_SERVICE")
+ anonymize_data = crawler.settings.getbool("ANONYMIZE_DATA", True)
+ if not enabled:
+ raise NotConfigured("Markitdown is disabled")
+ if markitdown_convertor is None:
+ # when we use scrapy cmd, we don't have the markitdown service provided
+ markitdown_convertor = MarkitdownConvertor()
+        if anonymizer is None and anonymize_data:
+            # when we use scrapy cmd, we don't have the anonymizer service provided;
+            # only build one if the second anonymization pass is actually enabled,
+            # otherwise AnonymizationPipeline.from_crawler would raise NotConfigured
+            # and wrongly disable this pipeline too.
+            anonymizer = AnonymizationPipeline.from_crawler(crawler)._anonymizer
+ return cls(markitdown=markitdown_convertor, anonymizer=anonymizer, anonymize_data=anonymize_data)
+
+ def process_item(self, item: BasePageItem, spider: Spider) -> BasePageItem:
+ if isinstance(item.get("content"), str):
+ logger.info(f"Converting content to markdown: {item['content']}")
+ item["content"] = self._markitdown.convert(item["content"], file_extension=item["suffix"])
+ if self._anonymize_data:
+ logger.info(f"Anonymizing content: {item['content']}")
+ item["content"] = self._anonymizer.anonymize(item["content"])
+ logger.info(f"Markitdown result ({'anonymized' if self._anonymize_data else 'not second pass anonymized'})): {item['content']}")
+ return item
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
index c9b6224dd..65aeddc01 100644
--- a/src/data_manager/collectors/scrapers/settings.py
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -94,6 +94,7 @@
ITEM_PIPELINES = {
"src.data_manager.collectors.scrapers.pipelines.anonymization.AnonymizationPipeline": 250,
+ "src.data_manager.collectors.scrapers.pipelines.markitdown.MarkitdownPipeline": 260,
"src.data_manager.collectors.scrapers.pipelines.persistence.PersistencePipeline": 300,
}
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
index 5a35ea2b5..1803128e4 100644
--- a/src/data_manager/collectors/scrapers/spiders/discourse.py
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -46,12 +46,15 @@ def from_crawler(cls, crawler, *args, **kwargs):
delay = kwargs.get("delay")
max_pages = kwargs.get("max_pages")
anonymize_data = kwargs.get("anonymize_data")
+ markitdown_enabled = kwargs.get("markitdown")
if delay:
crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
if max_pages:
crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
if anonymize_data:
crawler.settings.set("ANONYMIZE_DATA", anonymize_data, priority="spider")
+ if markitdown_enabled:
+ crawler.settings.set("MARKITDOWN_ENABLED", markitdown_enabled, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
def __init__(
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
index e2cd0ae11..ce9e82415 100644
--- a/src/data_manager/collectors/scrapers/spiders/link.py
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -29,6 +29,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
max_depth = kwargs.get("max_depth")
max_pages = kwargs.get("max_pages")
delay = kwargs.get("delay")
+ markitdown_enabled = kwargs.get("markitdown")
anonymize_data = kwargs.get("anonymize_data")
if max_depth:
crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
@@ -36,8 +37,10 @@ def from_crawler(cls, crawler, *args, **kwargs):
crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
if delay:
crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
+ if markitdown_enabled:
+ crawler.settings.set("MARKITDOWN_ENABLED", markitdown_enabled, priority="spider")
if anonymize_data:
- crawler.settings.set("anonymize_data", anonymize_data, priority="spider")
+ crawler.settings.set("ANONYMIZE_DATA", anonymize_data, priority="spider")
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, process_value: Callable[[str], str] = None, *args, **kwargs):
diff --git a/src/data_manager/collectors/utils/markitdown_convertor.py b/src/data_manager/collectors/utils/markitdown_convertor.py
new file mode 100644
index 000000000..560d0d43f
--- /dev/null
+++ b/src/data_manager/collectors/utils/markitdown_convertor.py
@@ -0,0 +1,32 @@
+import io
+from markitdown import MarkItDown
+from src.utils.logging import get_logger
+# from src.interfaces.llm.llm_client import LLMClient
+
+logger = get_logger(__name__)
+
+class MarkitdownConvertor:
+
+ def __init__(self):
+ self.markitdown = MarkItDown(
+ enable_plugins=True,
+ # llm_client=llm_client,
+ # llm_model=llm_model,
+ )
+
+ def convert(self, content: str, file_extension: str = ".html") -> str:
+ """
+ Convert the content to markdown using MarkItDown.
+ Args:
+ content: The content to convert.
+ file_extension: The file extension of the content.
+ Returns:
+ The converted content.
+ """
+ logger.info(f"Converting content to markdown: {content}")
+ result = self.markitdown.convert_stream(
+ io.BytesIO(content.encode("utf-8")),
+ file_extension=file_extension,
+ )
+ logger.info(f"Markitdown result: {result.text_content if hasattr(result, 'text_content') else str(result)}")
+ return result.text_content if hasattr(result, 'text_content') else str(result)
\ No newline at end of file
From 0656666885d7cf6dcffd3546be36961b04b7aca2 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 8 Apr 2026 17:46:43 +0200
Subject: [PATCH 51/55] add realistic, comprehensive test configurations.
---
.../deployments/basic-scraping/config.yaml | 21 ++++++++++---------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
index d355703b2..0c9cce3ef 100644
--- a/examples/deployments/basic-scraping/config.yaml
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -52,26 +52,27 @@ data_manager:
twiki:
auth_provider_name: cern_sso # remove if crawling public pages only
anonymize_data: true
- urls:
- - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide
- - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons
+ urls: # For now, if any TWiki seed URL is SSO-protected, list it first for efficiency and robustness.
+ - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons # SSO-protected TWiki page.
+ - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide # public TWiki seed URL.
allow:
- ".*CRAB3.*"
- ".*SWGuide.*"
- ".*WorkBook.*"
- ".*Crab.*"
- ".*Crab3.*"
- - ".*HeavyIons.*"
- - ".*HICollisions.*"
- - ".*HIRel.*"
+ # HeavyIons pages have already been fully crawled (plus some CRAB), so these patterns are disabled:
+ # - ".*HeavyIons.*"
+ # - ".*HICollisions.*"
+ # - ".*HIRel.*"
deny:
- ".*WorkBook.*"
max_depth: 2
- max_pages: 500
+ max_pages: 1000
anonymize_data: true
markitdown: true
delay: 60
- discourse:
+ discourse: # expect roughly 500-800+ anonymized markdown discussions from this configuration.
auth_provider_name: cern_sso
base_url: https://cms-talk.web.cern.ch
delay: 10
@@ -80,7 +81,7 @@ data_manager:
markitdown: true
category_paths:
- /c/offcomp/comptools/87
- - /c/offcomp/ais/150
+ # - /c/offcomp/ais/150
keywords:
- "Stefano Belforte"
- "Katy Ellis"
@@ -94,4 +95,4 @@ data_manager:
- https://github.com/dmwm/CRABClient
utils:
anonymizer:
- nlp_model: en_core_web_sm
\ No newline at end of file
+ nlp_model: en_core_web_sm
From c0bd84b0f7dc0174ea7fe0ae0d55f1dc5fcbcda8 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 8 Apr 2026 18:05:30 +0200
Subject: [PATCH 52/55] fix renaming that was missed in the referenced PR.
---
src/data_manager/collectors/scraper_manager.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
index 1af709b1e..ec58e537e 100644
--- a/src/data_manager/collectors/scraper_manager.py
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -13,7 +13,7 @@
from src.data_manager.collectors.persistence import PersistenceService
from src.utils.config_access import get_global_config
from src.utils.logging import get_logger
-from src.data_manager.collectors.utils.markitdown_manager import MarkitdownManager
+from src.data_manager.collectors.utils.markitdown_convertor import MarkitdownConvertor
logger = get_logger(__name__)
@@ -35,7 +35,7 @@ class ScraperManager:
SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
"""
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None, markitdown_manager: MarkitdownManager = None) -> None:
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None, markitdown_manager: MarkitdownConvertor= None) -> None:
global_config = get_global_config()
self.data_path = Path(global_config["DATA_PATH"])
self.persistence = persistence
From 456ab3d364992744c48247873c1f2711a3c9b641 Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 8 Apr 2026 18:19:16 +0200
Subject: [PATCH 53/55] add missing markitdown dependency.
---
pyproject.toml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index bc4465843..08ef32cef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,8 @@ dependencies = [
"pre-commit>=4",
"psycopg2-binary==2.9.10",
"Scrapy>=2.14.2",
- "playwright>=1.49.0,<2"
+ "playwright>=1.49.0,<2",
+ "markitdown>=0.1.5"
]
[project.scripts]
From ace75e18d64b7e8382f1c1eb875383c65d5a857f Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 8 Apr 2026 19:27:53 +0200
Subject: [PATCH 54/55] remove noise from local deployment data.
---
examples/deployments/basic-scraping/data/ingestion_status.json | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 examples/deployments/basic-scraping/data/ingestion_status.json
diff --git a/examples/deployments/basic-scraping/data/ingestion_status.json b/examples/deployments/basic-scraping/data/ingestion_status.json
deleted file mode 100644
index 9e26dfeeb..000000000
--- a/examples/deployments/basic-scraping/data/ingestion_status.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file
From 6b9dfd5e14964c6970c1fa43383756e323dce43c Mon Sep 17 00:00:00 2001
From: Krittin Phornsiricharoenphant
Date: Wed, 8 Apr 2026 19:56:25 +0200
Subject: [PATCH 55/55] move dependencies to the proper requirements file.
---
pyproject.toml | 5 +----
requirements/requirements-base.txt | 3 +++
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 08ef32cef..f5136f334 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,10 +26,7 @@ dependencies = [
"pandas==2.3.2",
"isort==6.0.1",
"pre-commit>=4",
- "psycopg2-binary==2.9.10",
- "Scrapy>=2.14.2",
- "playwright>=1.49.0,<2",
- "markitdown>=0.1.5"
+ "psycopg2-binary==2.9.10"
]
[project.scripts]
diff --git a/requirements/requirements-base.txt b/requirements/requirements-base.txt
index 602eed1dd..70fdcf852 100644
--- a/requirements/requirements-base.txt
+++ b/requirements/requirements-base.txt
@@ -86,3 +86,6 @@ aiohttp==3.9.5
nltk==3.9.1
sentence-transformers==5.1.2
rank_bm25==0.2.2
+Scrapy==2.14.2
+playwright==1.58.0
+markitdown==0.1.5