diff --git a/examples/deployments/basic-scraping/config.yaml b/examples/deployments/basic-scraping/config.yaml
new file mode 100644
index 000000000..0c9cce3ef
--- /dev/null
+++ b/examples/deployments/basic-scraping/config.yaml
@@ -0,0 +1,98 @@
+# Basic configuration file for an Archi deployment
+# with a chat app interface, agent, and
+# PostgreSQL with pgvector for document storage.
+# The LLM is used through an existing Ollama server.
+#
+# run with:
+# archi create --name my-archi-scraping --config examples/deployments/basic-scraping/config.yaml --services chatbot --hostmode
+
+# Deployment example for CERN data sources:
+# Twiki (with optional SSO) + public links + Git repos
+#
+# Required env vars for SSO:
+# SSO_USERNAME=xxx SSO_PASSWORD=yyy
+
+name: my_archi
+
+services:
+ data_manager:
+ port: 7872
+ chat_app:
+ agent_class: CMSCompOpsAgent
+ agents_dir: examples/agents
+ default_provider: local
+ default_model: qwen3:32b
+ providers:
+ local:
+ enabled: true
+ base_url: http://submit76.mit.edu:7870 # make sure this matches your ollama server URL!
+ mode: ollama
+ default_model: "qwen3:32b" # make sure this matches a model you have downloaded locally with ollama
+ models:
+ - "qwen3:32b"
+ trained_on: "My data"
+ port: 7868
+ external_port: 7868
+ vectorstore:
+ backend: postgres # PostgreSQL with pgvector (only supported backend)
+
+data_manager:
+ embedding_name: HuggingFaceEmbeddings
+ sources:
+ web:
+ link:
+ urls:
+ - https://ppc.mit.edu/news/
+ max_depth: 2
+ max_pages: 100
+ delay: 10
+ markitdown: true
+ input_lists:
+ - examples/deployments/basic-scraping/miscellanea.list
+ twiki:
+ auth_provider_name: cern_sso # remove if crawling public pages only
+ anonymize_data: true
+ urls: # as of now, if a twiki URL is SSO-protected, please put it first for efficiency and robustness.
+ - https://twiki.cern.ch/twiki/bin/view/CMS/HeavyIons # sso-protected twiki pages.
+ - https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuide # public twiki seed urls.
+ allow:
+ - ".*CRAB3.*"
+ - ".*SWGuide.*"
+ - ".*WorkBook.*"
+ - ".*Crab.*"
+ - ".*Crab3.*"
+ # Crawled all possible HeavyIons + a little bit of CRAB
+ # - ".*HeavyIons.*"
+ # - ".*HICollisions.*"
+ # - ".*HIRel.*"
+ deny:
+ - ".*WorkBook.*"
+ max_depth: 2
+ max_pages: 1000
+ # (duplicate anonymize_data key removed; it is already set above in this section)
+ markitdown: true
+ delay: 60
+ discourse: # we should get approximately 500-800 anonymized markdown discussions.
+ auth_provider_name: cern_sso
+ base_url: https://cms-talk.web.cern.ch
+ delay: 10
+ max_pages: 1000
+ anonymize_data: true
+ markitdown: true
+ category_paths:
+ - /c/offcomp/comptools/87
+ # - /c/offcomp/ais/150
+ keywords:
+ - "Stefano Belforte"
+ - "Katy Ellis"
+ - "Krittin Phornsiricharoenphant"
+ - "Vijay Chakravarty"
+ - "Dario Mapelli"
+ - "Thanayut Seethongchuen"
+ git:
+ urls:
+ - https://github.com/dmwm/CRABServer
+ - https://github.com/dmwm/CRABClient
+ utils:
+ anonymizer:
+ nlp_model: en_core_web_sm
diff --git a/examples/deployments/basic-scraping/miscellanea.list b/examples/deployments/basic-scraping/miscellanea.list
new file mode 100644
index 000000000..65bfd0289
--- /dev/null
+++ b/examples/deployments/basic-scraping/miscellanea.list
@@ -0,0 +1,49 @@
+# PPC
+https://ppc.mit.edu/blog/2016/05/08/hello-world/
+https://ppc.mit.edu/
+https://ppc.mit.edu/christoph-paus/
+https://ppc.mit.edu/dmytro-kovalskyi/
+https://ppc.mit.edu/gomez-ceballos/
+https://ppc.mit.edu/blog/2024/11/23/lhc-finishes-a-record-year/
+https://ppc.mit.edu/blog/2024/12/02/felicidades-cecilia/
+https://ppc.mit.edu/blog/2015/05/21/clipboard/
+https://ppc.mit.edu/blog/2025/01/12/published-first-diboson-paper-using-run-3-lhc-data/
+https://ppc.mit.edu/blog/2025/01/23/student-fcc-workshop-at-mit-v3-2025/
+https://ppc.mit.edu/blog/2025/01/23/new-chill-in-middleton/
+https://ppc.mit.edu/blog/2025/01/24/first-linux-server-installation-for-david-and-pietro/
+https://ppc.mit.edu/blog/2025/01/26/from-cern-to-mit-for-the-fcc-workshop/
+https://ppc.mit.edu/publications/
+https://ppc.mit.edu/blog/2025/02/08/detailed-schedule-for-the-european-strategy/
+https://ppc.mit.edu/blog/2025/02/14/first-cms-week-in-2025/
+https://ppc.mit.edu/blog/2025/02/18/exploring-the-higgs-boson-in-our-latest-result/
+https://ppc.mit.edu/blog/2025/02/04/news-from-the-chamonix-meeting/
+https://ppc.mit.edu/blog/2025/02/11/cms-data-archival-at-mit/
+https://ppc.mit.edu/blog/2025/03/28/cern-gets-support-from-canada/
+https://ppc.mit.edu/blog/2025/04/08/breakthrough-prize-in-physics-2025/
+https://ppc.mit.edu/blog/2025/04/04/the-fcc-at-cern-a-feasibly-circular-collider/
+https://ppc.mit.edu/blog/2025/04/08/cleo-reached-magic-issue-number-5000/
+https://ppc.mit.edu/blog/2025/04/14/maximizing-cms-competitive-advantage/
+https://ppc.mit.edu/blog/2025/04/25/sueps-at-aps-march-april-meeting/
+https://ppc.mit.edu/blog/2025/04/18/round-three/
+https://ppc.mit.edu/blog/2025/04/14/first-beams-with-a-splash-in-2025/
+https://ppc.mit.edu/blog/2025/05/27/fcc-weak-in-vienna-building-our-future/
+https://ppc.mit.edu/blog/2025/06/04/new-paper-on-arxiv-submit-a-physics-analysis-facility-at-mit/
+https://ppc.mit.edu/blog/2025/06/16/summer-cms-week-2025/
+https://ppc.mit.edu/blog/2025/05/05/cms-records-first-2025-high-energy-collisions/
+https://ppc.mit.edu/blog/2025/06/17/long-term-vision-for-particle-physics-from-the-national-academies/
+https://ppc.mit.edu/blog/2025/06/20/conclusion-of-junes-cern-council-session-has-major-consequences-for-cms/
+https://ppc.mit.edu/blog/2025/06/20/highest-pileup-recorded-at-cms-last-night/
+https://ppc.mit.edu/blog/2025/06/25/selfie-station-at-wilson-hall/
+https://ppc.mit.edu/mariarosaria-dalfonso/
+https://ppc.mit.edu/kenneth-long-2/
+https://ppc.mit.edu/blog/2025/06/27/open-symposium-on-the-european-strategy-for-particle-physics/
+https://ppc.mit.edu/blog/2025/07/03/bridging-physics-and-computing-throughput-computing-2025/
+https://ppc.mit.edu/pietro-lugato-2/
+https://ppc.mit.edu/luca-lavezzo/
+https://ppc.mit.edu/zhangqier-wang-2/
+https://ppc.mit.edu/blog/2025/07/14/welcome-our-first-ever-in-house-masters-student/
+# A2
+https://ppc.mit.edu/a2/
+# Personnel
+https://people.csail.mit.edu/kraska
+https://physics.mit.edu/faculty/christoph-paus
diff --git a/requirements/requirements-base.txt b/requirements/requirements-base.txt
index 602eed1dd..70fdcf852 100644
--- a/requirements/requirements-base.txt
+++ b/requirements/requirements-base.txt
@@ -86,3 +86,6 @@ aiohttp==3.9.5
nltk==3.9.1
sentence-transformers==5.1.2
rank_bm25==0.2.2
+Scrapy==2.14.2
+playwright==1.58.0
+markitdown==0.1.5
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 000000000..124bc2c4b
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,2 @@
+[settings]
+default = src.data_manager.collectors.scrapers.settings
diff --git a/src/bin/service_data_manager.py b/src/bin/service_data_manager.py
index 2a1eb6c1c..b1236ead1 100644
--- a/src/bin/service_data_manager.py
+++ b/src/bin/service_data_manager.py
@@ -74,9 +74,8 @@ def trigger_update() -> None:
schedule_map: Dict[str, Callable[[Optional[str]], None]] = {
"local_files": lambda last_run=None: data_manager.localfile_manager.schedule_collect_local_files(data_manager.persistence, last_run=last_run),
- "links": lambda last_run=None: data_manager.scraper_manager.schedule_collect_links(data_manager.persistence, last_run=last_run),
- "git": lambda last_run=None: data_manager.scraper_manager.schedule_collect_git(data_manager.persistence, last_run=last_run),
- "sso": lambda last_run=None: data_manager.scraper_manager.schedule_collect_sso(data_manager.persistence, last_run=last_run),
+ "web": lambda last_run=None: data_manager.scraper_manager.schedule_collect(last_run=last_run),
+ "git": lambda last_run=None: data_manager.git_manager.schedule_collect_git(data_manager.persistence, last_run=last_run),
"jira": lambda last_run=None: data_manager.ticket_manager.schedule_collect_jira(data_manager.persistence, last_run=last_run),
"redmine": lambda last_run=None: data_manager.ticket_manager.schedule_collect_redmine(data_manager.persistence, last_run=last_run),
}
diff --git a/src/cli/managers/config_manager.py b/src/cli/managers/config_manager.py
index 7ed9f9405..ad8f0caac 100644
--- a/src/cli/managers/config_manager.py
+++ b/src/cli/managers/config_manager.py
@@ -13,6 +13,8 @@
STATIC_FIELDS = ['global', 'services']
+_WEB_TOP_LEVEL_STATIC_KEYS = ["enabled", "visible"]
+
class ConfigurationManager:
"""Manages archi configuration loading and validation"""
@@ -266,10 +268,19 @@ def _collect_input_lists(self) -> None:
for conf in self.configs:
data_manager = conf.get('data_manager', {})
sources_section = data_manager.get('sources', {}) or {}
- links_section = sources_section.get('links', {}) if isinstance(sources_section, dict) else {}
- lists = links_section.get('input_lists') or []
- if isinstance(lists, list):
- collected.extend(lists)
+ if not isinstance(sources_section, dict):
+ continue
+ web = sources_section.get("web", {}) or {}
+ if not isinstance(web, dict):
+ continue
+ for spider_key, sub in web.items():
+ if spider_key in _WEB_TOP_LEVEL_STATIC_KEYS:
+ continue
+ if not isinstance(sub, dict):
+ continue
+ wlists = sub.get("input_lists") or []
+ if isinstance(wlists, list):
+ collected.extend(wlists)
self.input_list = sorted(set(collected)) if collected else []
def get_enabled_sources(self) -> List[str]:
diff --git a/src/cli/source_registry.py b/src/cli/source_registry.py
index 8ef05e5ea..1ac02efdf 100644
--- a/src/cli/source_registry.py
+++ b/src/cli/source_registry.py
@@ -23,11 +23,9 @@ def __init__(self) -> None:
def _register_defaults(self) -> None:
self.register(
SourceDefinition(
- name="links",
- description="Basic HTTP/HTTPS link scraping from input lists",
- required_config_fields=[
- "data_manager.sources.links.input_lists",
- ],
+ name="web",
+ description="Basic HTTP/HTTPS, Scrapy web sources, seeds from urls and/or input_list",
+ required_config_fields=[],
)
)
self.register(
@@ -36,17 +34,17 @@ def _register_defaults(self) -> None:
description="SSO-backed web crawling",
required_secrets=["SSO_USERNAME", "SSO_PASSWORD"],
required_config_fields=[
- "data_manager.sources.links.selenium_scraper.selenium_class",
+ "data_manager.sources.web",
],
- depends_on=["links"],
+ depends_on=["web"],
)
)
self.register(
SourceDefinition(
name="git",
- description="Git repository scraping for MkDocs-based documentation",
- required_secrets=["GIT_USERNAME", "GIT_TOKEN"],
- depends_on=["links"],
+ description="Git repository scraping for MkDocs-based documentation, Optional GIT_USERNAME/GIT_TOKEN for private repos.",
+ required_secrets=[], # was ["GIT_USERNAME", "GIT_TOKEN"]
+ depends_on=[], # no longer depends on links/web; treated as a standalone manager.
)
)
self.register(
diff --git a/src/cli/templates/base-config.yaml b/src/cli/templates/base-config.yaml
index ef6d21fee..13b7fccf8 100644
--- a/src/cli/templates/base-config.yaml
+++ b/src/cli/templates/base-config.yaml
@@ -188,40 +188,72 @@ data_manager:
{%- for path in paths %}
- {{ path }}
{%- endfor %}
- links:
- base_source_depth: {{ data_manager.sources.links.base_source_depth | default(1, true) }}
- max_pages: {{ data_manager.sources.links.max_pages | default(null, true) }}
- enabled: {{ data_manager.sources.links.enabled | default(true, true) }}
- visible: {{ data_manager.sources.links.visible | default(true, true) }}
- schedule: '{{ data_manager.sources.links.schedule | default("", true) }}'
- input_lists:
- {%- set link_lists = data_manager.sources.links.input_lists | default([], true) %}
- {%- for input_list in link_lists %}
- - {{ input_list }}
- {%- endfor %}
- html_scraper:
- reset_data: {{ data_manager.sources.links.html_scraper.reset_data | default(true, true) }}
- verify_urls: {{ data_manager.sources.links.html_scraper.verify_urls | default(false, true) }}
- enable_warnings: {{ data_manager.sources.links.html_scraper.enable_warnings | default(false, true) }}
- selenium_scraper:
- enabled: {{ data_manager.sources.links.selenium_scraper.selenium_scraper.enabled | default(false, True) }}
- visible: {{ data_manager.sources.links.selenium_scraper.selenium_scraper.visible | default(false, true) }}
- use_for_scraping: {{ data_manager.sources.links.selenium_scraper.use_for_scraping | default(false, true) }}
- selenium_class: {{ data_manager.sources.links.selenium_scraper.selenium_class | default('CERNSSOScraper', true) }}
- selenium_url: {{ data_manager.sources.links.selenium_scraper.selenium_url | default('null', true) }}
- selenium_class_map:
- CERNSSOScraper:
- class: {{ data_manager.sources.links.selenium_scraper.selenium_class_map.CERNSSOScraper.class | default('CERNSSOScraper', true) }}
- kwargs:
- headless: {{ data_manager.sources.links.selenium_scraper.selenium_class_map.CERNSSOScraper.kwargs.headless | default(true, true) }}
+ web:
+ enabled: {{ data_manager.sources.web.enabled | default(true, true) }}
+ visible: {{ data_manager.sources.web.visible | default(true, true) }}
+ link:
+ enabled: {{ data_manager.sources.web.link.enabled | default(true, true) }}
+ auth_provider_name: {{ data_manager.sources.web.link.auth_provider_name | default("", true) }}
+ schedule: '{{ data_manager.sources.web.link.schedule | default("", true) }}'
+ max_depth: {{ data_manager.sources.web.link.max_depth | default(3, true) }}
+ max_pages: {{ data_manager.sources.web.link.max_pages | default(null, true) }}
+ delay: {{ data_manager.sources.web.link.delay | default(1, true) }}
+ allow: {{ data_manager.sources.web.link.allow | default([], true) | tojson }}
+ deny: {{ data_manager.sources.web.link.deny | default([], true) | tojson }}
+ anonymize_data: {{ data_manager.sources.web.link.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.link.markitdown | default(false, true) }}
+ input_lists:
+ {%- for l in data_manager.sources.web.link.input_lists | default([], true) %}
+ - {{ l }}
+ {%- endfor %}
+ urls:
+ {%- for u in data_manager.sources.web.link.urls | default([], true) %}
+ - {{ u }}
+ {%- endfor %}
+ twiki:
+ enabled: {{ data_manager.sources.web.twiki.enabled | default(true, true) }}
+ auth_provider_name: {{ data_manager.sources.web.twiki.auth_provider_name | default("", true) }}
+ schedule: '{{ data_manager.sources.web.twiki.schedule | default("", true) }}'
+ max_depth: {{ data_manager.sources.web.twiki.max_depth | default(2, true) }}
+ max_pages: {{ data_manager.sources.web.twiki.max_pages | default(100, true) }}
+ delay: {{ data_manager.sources.web.twiki.delay | default(60, true) }}
+ allow: {{ data_manager.sources.web.twiki.allow | default([], true) | tojson }}
+ deny: {{ data_manager.sources.web.twiki.deny | default([], true) | tojson }}
+ anonymize_data: {{ data_manager.sources.web.twiki.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.twiki.markitdown | default(false, true) }}
+ input_lists:
+ {%- for list in data_manager.sources.web.twiki.input_lists | default([], true) %}
+ - {{ list }}
+ {%- endfor %}
+ urls:
+ {%- for url in data_manager.sources.web.twiki.urls | default([], true) %}
+ - {{ url }}
+ {%- endfor %}
+ discourse:
+ enabled: {{ data_manager.sources.web.discourse.enabled | default(true, true) }}
+ auth_provider_name: {{ data_manager.sources.web.discourse.auth_provider_name | default("cern_sso", true) }}
+ schedule: '{{ data_manager.sources.web.discourse.schedule | default("", true) }}'
+ max_pages: {{ data_manager.sources.web.discourse.max_pages | default(500, true) }}
+ delay: {{ data_manager.sources.web.discourse.delay | default(10, true) }}
+ anonymize_data: {{ data_manager.sources.web.discourse.anonymize_data | default(false, true) }}
+ markitdown: {{ data_manager.sources.web.discourse.markitdown | default(false, true) }}
+ base_url: {{ data_manager.sources.web.discourse.base_url | default("https://cms-talk.web.cern.ch", true) }}
+ keywords:
+ {%- for keyword in data_manager.sources.web.discourse.keywords | default([], true) %}
+ - {{ keyword }}
+ {%- endfor %}
+ category_paths:
+ {%- for category_path in data_manager.sources.web.discourse.category_paths | default([], true) %}
+ - {{ category_path }}
+ {%- endfor %}
git:
enabled: {{ data_manager.sources.git.enabled | default(true, true) }}
visible: {{ data_manager.sources.git.visible | default(true, true) }}
schedule: '{{ data_manager.sources.git.schedule | default("", true) }}'
- sso:
- enabled: {{ data_manager.sources.sso.enabled | default(true, true) }}
- visible: {{ data_manager.sources.sso.visible | default(true, true) }}
- schedule: '{{ data_manager.sources.sso.schedule | default("", true) }}'
+ urls:
+ {%- for u in data_manager.sources.git.urls | default([], true) %}
+ - {{ u }}
+ {%- endfor %}
jira:
enabled: {{ data_manager.sources.jira.enabled | default(true, true) }}
url: {{ data_manager.sources.jira.url | default('', true) }}
diff --git a/src/cli/templates/dockerfiles/Dockerfile-data-manager b/src/cli/templates/dockerfiles/Dockerfile-data-manager
index 176d1c158..7adb2c473 100644
--- a/src/cli/templates/dockerfiles/Dockerfile-data-manager
+++ b/src/cli/templates/dockerfiles/Dockerfile-data-manager
@@ -35,6 +35,10 @@ COPY pyproject.toml pyproject.toml
COPY weblists weblists
RUN pip install --upgrade pip && pip install .
+# Chromium for Python Playwright (CERN SSO in Scrapy auth middleware).
+RUN python -m playwright install-deps chromium \
+ && python -m playwright install chromium
+
RUN chmod g+rx /root; chmod -R g+w /root/archi/src/interfaces
ARG APP_VERSION=unknown
diff --git a/src/data_manager/collectors/git_manager.py b/src/data_manager/collectors/git_manager.py
new file mode 100644
index 000000000..0978d2697
--- /dev/null
+++ b/src/data_manager/collectors/git_manager.py
@@ -0,0 +1,318 @@
+# src/data_manager/collectors/git_manager.py
+from __future__ import annotations
+
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+
+from git import Repo
+
+from src.data_manager.collectors.git_resource import GitResource
+from src.data_manager.collectors.persistence import PersistenceService
+from src.utils.config_access import get_global_config
+from src.utils.env import read_secret
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+_DEFAULT_CODE_SUFFIXES = {
+ ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
+ ".c", ".cpp", ".h", ".hpp", ".sh", ".sql",
+ ".json", ".yaml", ".yml", ".toml", ".md", ".txt",
+}
+_DEFAULT_EXCLUDE_DIRS = {
+ ".git", "node_modules", ".venv", "venv", "__pycache__",
+ ".idea", ".vscode", "dist", "build",
+}
+
+
+class GitManager:
+ """
+ Collects git repositories (MkDocs docs + code files) into the shared data path.
+
+ Interface mirrors LocalFileManager — instantiate with dm_config, then call
+ collect_all_from_config(persistence) or collect(urls, persistence) directly.
+ """
+
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
+ global_config = get_global_config()
+ self.data_path = Path(global_config["DATA_PATH"])
+
+ sources_config = (dm_config or {}).get("sources", {}) or {}
+ self.config: Dict[str, Any] = (
+ dict(sources_config.get("git", {}))
+ if isinstance(sources_config, dict)
+ else {}
+ )
+
+ self.enabled = self.config.get("enabled", True)
+ self.git_dir = Path(self.data_path) / "raw_git_repos"
+ self.git_dir.mkdir(parents=True, exist_ok=True)
+
+ self.code_suffixes = {
+ s.lower()
+ for s in self.config.get("code_suffixes", _DEFAULT_CODE_SUFFIXES)
+ }
+ self.exclude_dirs = set(
+ self.config.get("exclude_dirs", _DEFAULT_EXCLUDE_DIRS)
+ )
+ self.max_file_size_bytes = int(
+ self.config.get("max_file_size_bytes", 1_000_000)
+ )
+
+ self.git_username = read_secret("GIT_USERNAME")
+ self.git_token = read_secret("GIT_TOKEN")
+ self._credentials_available = bool(self.git_username and self.git_token)
+ if not self._credentials_available:
+ logger.info("No git credentials supplied; will attempt public repo cloning.")
+
+ # ── Public interface (mirrors LocalFileManager) ───────────────────────────
+
+ def collect_all_from_config(self, persistence: PersistenceService) -> None:
+ if not self.enabled:
+ return
+ urls: List[str] = self.config.get("urls", [])
+ if not urls:
+ logger.info("No git URLs configured; skipping")
+ return
+ self.collect(urls, persistence)
+
+ def schedule_collect_git(
+ self, persistence: PersistenceService, last_run: Optional[str] = None
+ ) -> None:
+ """Re-harvest all repos known to the catalog (config + dynamically added)."""
+ metadata = persistence.catalog.get_metadata_by_filter(
+ "source_type", source_type="git", metadata_keys=["repo_url"]
+ )
+ urls = list({m[1]["repo_url"] for m in metadata if m[1].get("repo_url")})
+ if not urls:
+ return
+ self.collect(urls, persistence)
+
+ def collect(self, git_urls: List[str], persistence: PersistenceService) -> None:
+ """Collect a list of git URLs and persist each harvested file."""
+ if not git_urls:
+ logger.warning("No git URLs provided; skipping")
+ return
+
+ for url in git_urls:
+ try:
+ repo_info = self._prepare_repository(url)
+ except ValueError as exc:
+ logger.info("%s", exc)
+ continue
+ except Exception as exc:
+ logger.error("Failed to clone %s: %s", url, exc)
+ continue
+
+ try:
+ target_dir = self.data_path / "git" / repo_info["repo_name"]
+ for resource in self._harvest_repository(repo_info):
+ self._persist_one(resource, persistence, target_dir)
+ finally:
+ shutil.rmtree(repo_info["repo_path"], ignore_errors=True)
+
+ logger.info("Git collection complete")
+
+ # ── Internal harvest ──────────────────────────────────────────────────────
+
+ def _harvest_repository(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
+ yield from self._harvest_mkdocs(repo_info)
+ yield from self._harvest_code(repo_info)
+
+ def _harvest_mkdocs(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
+ repo_path: Path = repo_info["repo_path"]
+ docs_dir = repo_path / "docs"
+ if not docs_dir.exists():
+ logger.info("Skipping MkDocs harvest for %s; no docs/ dir", repo_path)
+ return
+
+ mkdocs_site_url: Optional[str] = repo_info["mkdocs_site_url"]
+ base_url: str = repo_info["web_base_url"]
+ ref: str = repo_info["ref"]
+ repo_name: str = repo_info["repo_name"]
+ repo_url: str = repo_info["repo_url"]
+
+ for md_path in docs_dir.rglob("*.md"):
+ if mkdocs_site_url:
+ url = mkdocs_site_url + md_path.relative_to(docs_dir).with_suffix("").as_posix()
+ else:
+ url = self._build_blob_url(base_url, ref, md_path.relative_to(repo_path))
+
+ text = md_path.read_text(encoding="utf-8", errors="ignore")
+ if not text.strip():
+ logger.info("Skipping empty doc: %s", md_path)
+ continue
+
+ yield GitResource(
+ repo_url=repo_url,
+ file_path=str(Path(repo_name) / md_path.relative_to(repo_path)),
+ content=text,
+ source_type="git",
+ branch=repo_info.get("branch", ""),
+ ref=ref,
+ title=md_path.stem.replace("_", " ").replace("-", " ").title(),
+ )
+
+ def _harvest_code(self, repo_info: Dict[str, Any]) -> Iterator[GitResource]:
+ repo_path: Path = repo_info["repo_path"]
+ base_url: str = repo_info["web_base_url"]
+ ref: str = repo_info["ref"]
+ repo_name: str = repo_info["repo_name"]
+ repo_url: str = repo_info["repo_url"]
+
+ for file_path in self._iter_code_files(repo_path):
+ rel = file_path.relative_to(repo_path)
+
+ # avoid overlap with _harvest_mkdocs
+ if rel.parts and rel.parts[0] == "docs" and file_path.suffix.lower() == ".md":
+ continue
+
+ if not self._is_allowed_suffix(file_path):
+ continue
+
+ try:
+ if file_path.stat().st_size > self.max_file_size_bytes:
+ logger.warning("Skipping %s — exceeds size limit", file_path)
+ continue
+ except OSError:
+ continue
+
+ if self._looks_binary(file_path):
+ continue
+
+ try:
+ text = file_path.read_text(encoding="utf-8", errors="ignore")
+ except Exception:
+ continue
+
+ if not text.strip():
+ continue
+
+ yield GitResource(
+ repo_url=repo_url,
+ file_path=str(Path(repo_name) / rel),
+ content=text,
+ source_type="git",
+ branch=repo_info.get("branch", ""),
+ ref=ref,
+ title=None,
+ )
+
+ # ── Repository preparation ────────────────────────────────────────────────
+
+ def _prepare_repository(self, url: str) -> Dict[str, Any]:
+ url_dict = self._parse_url(url)
+ repo_path = self._clone_repo(url_dict)
+ return {
+ "repo_path": repo_path,
+ "repo_name": url_dict["repo_name"],
+ "repo_url": url_dict["original_url"],
+ "branch": url_dict["branch"] or "",
+ "mkdocs_site_url": self._read_mkdocs_site_url(repo_path),
+ "ref": self._determine_ref(repo_path, url_dict["branch"]),
+ "web_base_url": self._compute_web_base_url(url_dict["original_url"]),
+ }
+
+ def _parse_url(self, url: str) -> Dict[str, Any]:
+ match = re.search(
+ r"(?:github|gitlab)\.[\w.]+\/[^\/]+\/([\w.-]+)(?:\.git|\/|$)",
+ url, re.IGNORECASE,
+ )
+ if not match:
+ raise ValueError(f"Git URL does not match expected format: {url}")
+ repo_name = match.group(1)
+
+ if self._credentials_available:
+ if "gitlab" in url:
+ clone_url = url.replace("gitlab", f"{self.git_username}:{self.git_token}@gitlab")
+ elif "github" in url:
+ clone_url = url.replace("github", f"{self.git_username}:{self.git_token}@github")
+ else:
+ clone_url = url
+ else:
+ clone_url = url
+
+ branch = None
+ parts = re.split(r"/(?:-/)?tree/", clone_url, maxsplit=1)
+ if len(parts) > 1:
+ branch = parts[1].strip("/") or None
+ clone_url = parts[0].rstrip("/")
+
+ return {"original_url": url, "clone_url": clone_url, "repo_name": repo_name, "branch": branch}
+
+ def _clone_repo(self, url_dict: Dict[str, Any]) -> Path:
+ repo_path = self.git_dir / url_dict["repo_name"]
+ logger.info("Cloning %s …", url_dict["repo_name"])
+ kwargs = {}
+ if url_dict["branch"]:
+ kwargs["branch"] = url_dict["branch"]
+ Repo.clone_from(url_dict["clone_url"], repo_path, **kwargs)
+ return repo_path
+
+ def _read_mkdocs_site_url(self, repo_path: Path) -> Optional[str]:
+ mkdocs_file = repo_path / "mkdocs.yml"
+ if not mkdocs_file.exists():
+ return None
+ try:
+ from mkdocs.utils.yaml import yaml_load
+ with mkdocs_file.open() as f:
+ data = yaml_load(f)
+ site_url = data.get("site_url")
+ if not site_url:
+ return None
+ return site_url if site_url.endswith("/") else site_url + "/"
+ except Exception:
+ return None
+
+ def _compute_web_base_url(self, url: str) -> str:
+ sanitized = re.sub(r"//[^@/]+@", "//", url)
+ sanitized = re.split(r"/(?:-/)?tree/", sanitized, maxsplit=1)[0]
+ return sanitized.rstrip("/").removesuffix(".git")
+
+ def _determine_ref(self, repo_path: Path, branch: Optional[str]) -> str:
+ if branch:
+ return branch
+ try:
+ return Repo(repo_path).active_branch.name
+ except Exception:
+ try:
+ return Repo(repo_path).head.commit.hexsha[:7]
+ except Exception:
+ return "main"
+
+ def _build_blob_url(self, base_url: str, ref: str, rel: Path) -> str:
+ base = base_url.rstrip("/")
+ if "gitlab" in base:
+ return f"{base}/-/blob/{ref}/{rel.as_posix()}"
+ return f"{base}/blob/{ref}/{rel.as_posix()}"
+
+ # ── Helpers ───────────────────────────────────────────────────────────────
+
+ def _iter_code_files(self, repo_path: Path) -> Iterator[Path]:
+ for root, dirs, files in os.walk(repo_path):
+ dirs[:] = [d for d in dirs if d not in self.exclude_dirs]
+ for name in files:
+ yield Path(root) / name
+
+ def _is_allowed_suffix(self, path: Path) -> bool:
+ return path.suffix.lower() in self.code_suffixes
+
+ def _looks_binary(self, path: Path) -> bool:
+ try:
+ return b"\0" in path.open("rb").read(8000)
+ except Exception:
+ return True
+
+ def _persist_one(
+ self,
+ resource: GitResource,
+ persistence: PersistenceService,
+ target_dir: Path,
+ ) -> None:
+ try:
+ persistence.persist_resource(resource, target_dir)
+ except Exception as exc:
+ logger.warning("Failed to persist %s: %s", resource.file_path, exc)
\ No newline at end of file
diff --git a/src/data_manager/collectors/git_resource.py b/src/data_manager/collectors/git_resource.py
new file mode 100644
index 000000000..678f0982c
--- /dev/null
+++ b/src/data_manager/collectors/git_resource.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Union
+
+from src.data_manager.collectors.resource_base import BaseResource
+from src.data_manager.collectors.utils.metadata import ResourceMetadata
+
+
+@dataclass
+class GitResource(BaseResource):
+ """Representation of a single file harvested from a git repository."""
+
+ repo_url: str # canonical remote URL, credentials stripped
+ file_path: str # path within repo, e.g. "docs/guide.md"
+ content: Union[str, bytes]
+ source_type: str = "git"
+ branch: str = ""
+ ref: str = "" # commit SHA or tag; used in blob URLs
+ title: Optional[str] = None
+
+ def get_hash(self) -> str:
+ """
+ Stable hash on (repo_url, file_path) so re-harvests overwrite in-place.
+
+ Intentionally excludes ref/branch: the same file at a new commit
+ is still the same resource — it should update the catalog entry,
+ not create an orphan.
+ """
+ digest = hashlib.md5()
+ digest.update(f"{self.repo_url}::{self.file_path}".encode("utf-8", errors="ignore"))
+ return digest.hexdigest()[:12]
+
+ def get_filename(self) -> str:
+ return Path(self.file_path).name
+
+ def get_file_path(self, target_dir: Path) -> Path:
+ """Preserve the repo directory tree under target_dir."""
+ return target_dir / self.file_path
+
+ def get_content(self) -> Union[str, bytes]:
+ return self.content
+
+ def get_metadata(self) -> ResourceMetadata:
+ extra: dict[str, str] = {
+ "source_type": self.source_type,
+ "repo_url": self.repo_url,
+ "file_path": self.file_path,
+ "suffix": Path(self.file_path).suffix.lstrip(".") or "",
+ "display_name": self.file_path,
+ }
+ if self.branch:
+ extra["branch"] = self.branch
+ if self.ref:
+ extra["ref"] = self.ref
+ if self.title:
+ extra["title"] = self.title
+
+ return ResourceMetadata(file_name=self.get_filename(), extra=extra)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scraper_manager.py b/src/data_manager/collectors/scraper_manager.py
new file mode 100644
index 000000000..ec58e537e
--- /dev/null
+++ b/src/data_manager/collectors/scraper_manager.py
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Callable
+
+from scrapy.crawler import CrawlerProcess, Crawler
+from scrapy.utils.project import get_project_settings
+from scrapy.spiderloader import SpiderLoader
+from scrapy.settings import Settings
+from scrapy import Spider
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.data_manager.collectors.persistence import PersistenceService
+from src.utils.config_access import get_global_config
+from src.utils.logging import get_logger
+from src.data_manager.collectors.utils.markitdown_convertor import MarkitdownConvertor
+
+logger = get_logger(__name__)
+
+def _make_spider_loader(settings: Settings) -> Callable[[str], type[Spider]]:
+ """Bind settings once, return a name → SpiderClass callable."""
+ return SpiderLoader.from_settings(settings).load
+
+def _spider_section_enabled(cfg: Dict[str, Any]) -> bool:
+ """Respect web..enabled; missing or null → enabled (on)."""
+ v = cfg.get("enabled", True)
+ return bool(v) if v is not None else True
+
+class ScraperManager:
+ """
+ Coordinates all web crawls as a single CrawlerProcess run.
+
+ One CrawlerProcess → one Twisted reactor → all spiders run concurrently.
+ Git collection is now GitManager's responsibility.
+ SSO authentication is handled by AuthDownloaderMiddleware + CERNSSOProvider.
+ """
+
+ def __init__(self, dm_config: Optional[Dict[str, Any]] = None, persistence: PersistenceService = None, anonymizer: Anonymizer = None, markitdown_manager: MarkitdownConvertor= None) -> None:
+ global_config = get_global_config()
+ self.data_path = Path(global_config["DATA_PATH"])
+ self.persistence = persistence
+ self.anonymizer = anonymizer
+ self.markitdown_manager = markitdown_manager
+ self.settings = Settings()
+ self.settings.setmodule(
+ "src.data_manager.collectors.scrapers.settings",
+ priority="project",
+ )
+
+ sources_config = (dm_config or {}).get("sources", {}) or {}
+
+ self.config = sources_config.get("web", {}) if isinstance(sources_config, dict) else {}
+ self.enabled = self.config.get("enabled", True)
+
+ # ── Public interface ──────────────────────────────────────────────────────
+
+ def collect_all_from_config(self) -> None:
+ logger.info("collect_all_from_config")
+ self._run(self._config_urls)
+
+ def schedule_collect(self, last_run: Optional[str] = None) -> None:
+ self._run(self._catalog_urls)
+
+ def collect(self, spider_key: str, urls: List[str]) -> None:
+ process = CrawlerProcess(self.settings)
+ logger.info("project_settings: %s", json.dumps(self.settings, indent=2, default=str))
+ try:
+ SpiderClass = _make_spider_loader(self.settings)(spider_key)
+ except KeyError:
+ logger.error("Unknown spider: %s", spider_key)
+ return
+ cfg = self.config.get(spider_key, {}) # use config settings if present, else defaults
+ if urls and _spider_section_enabled(cfg):
+ self._add_crawler(process, SpiderClass, urls, cfg)
+ # Fix Twisted/Scrapy try to installs OS signal handlers (SIGINT / SIGTERM) while the code is running in a worker thread
+ process.start(install_signal_handlers=False)
+
+ def _run(self, url_fn: Callable[[str, Dict], List[str]]) -> None:
+ if not self.enabled:
+ logger.info("Web scraping disabled; skipping")
+ return
+ process = CrawlerProcess(self.settings)
+ load_spider = _make_spider_loader(self.settings)
+ (self.data_path / "websites").mkdir(parents=True, exist_ok=True)
+
+ added = False
+ for spider_key, cfg in self.config.items():
+ if not isinstance(cfg, dict):
+ continue
+ try:
+ SpiderClass = load_spider(spider_key)
+ except KeyError:
+ continue
+ urls = url_fn(spider_key, cfg)
+ if urls and _spider_section_enabled(cfg):
+ self._add_crawler(process, SpiderClass, urls, cfg)
+ added = True
+ if added:
+ process.start(install_signal_handlers=False)
+
+ # ── CrawlerProcess wiring ─────────────────────────────────────────────────
+
+ def _add_crawler(
+ self,
+ process: CrawlerProcess,
+ spider_class: type[Spider],
+ urls: List[str],
+ cfg: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ """
+ Create a Crawler for spider_key, inject PersistencePipeline settings,
+ and register it with the process.
+ """
+ cfg = cfg or {}
+ crawler: Crawler = process.create_crawler(spider_class)
+ # Inject persistence objects — live Python instances, must be priority="spider"
+ crawler.settings.set("PERSISTENCE_SERVICE", self.persistence, priority="spider")
+ crawler.settings.set("PERSISTENCE_OUTPUT_DIR", self.data_path / "websites", priority="spider")
+ crawler.settings.set("ANONYMIZER_SERVICE", self.anonymizer, priority="spider")
+ crawler.settings.set("MARKITDOWN_SERVICE", self.markitdown_manager, priority="spider")
+ process.crawl(crawler, start_urls=urls, **cfg)
+
+ # ── URL sources & list parsing ──────────────────────────────────────────────────────
+
+ def _config_urls(self, spider_key: str, cfg: Dict) -> List[str]:
+ urls = list(cfg.get("urls") or [])
+ for list_path in cfg.get("input_lists") or []:
+ path = Path("weblists") / list_path.lstrip("/")
+ if not path.exists():
+ logger.warning("Input list not found: %s", path)
+ continue
+ urls.extend(self._extract_urls_from_file(path))
+ # Discourse (and similar API/Iterative spiders) don't use start_urls;
+ # category_paths or base_url signals the spider is configured.
+ if not urls and (cfg.get("category_paths") or cfg.get("base_url")):
+ urls = ["__api_spider__"]
+ return urls
+
+ def _catalog_urls(self, spider_key: str, cfg: Dict) -> List[str]:
+ if not self.persistence:
+ return []
+
+ metadata = self.persistence.catalog.get_metadata_by_filter(
+ "source_type", source_type="web", metadata_keys=["url", "spider_name"]
+ )
+ return [
+ m[1].get("url", "").strip()
+ for m in metadata
+ if m[1].get("spider_name", "link") == spider_key and m[1].get("url")
+ ]
+
+ def _extract_urls_from_file(self, path: Path) -> List[str]:
+ urls: List[str] = []
+ with path.open("r") as f:
+ for line in f:
+ stripped = line.strip()
+ if not stripped or stripped.startswith("#"):
+ continue
+ urls.append(stripped.split(",")[0].strip())
+ return urls
diff --git a/src/data_manager/collectors/scrapers/adapters.py b/src/data_manager/collectors/scrapers/adapters.py
new file mode 100644
index 000000000..0bf3fdda9
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/adapters.py
@@ -0,0 +1,102 @@
+"""
+Single-dispatch adapter: converts Scrapy Items into ScrapedResource.
+
+Design principles:
+- Items are dumb data bags. They know nothing about ScrapedResource.
+- This is the ONLY place that knows about both schemas.
+- New sources: add a @to_scraped_resource.register block here. Touch nothing else.
+- Do NOT reconstruct ResourceMetadata — ScrapedResource.get_metadata() already
+ derives display_name, url, suffix, source_type from raw fields. Pass raw values only.
+
+Constraint: ~50 LOC of logic.
+
+Adding a new source (e.g. TwikiPageItem):
+ @to_scraped_resource.register(TwikiPageItem)
+ def _twiki(item) -> ScrapedResource:
+ ...
+
+If two sources share identical mapping logic, stack decorators:
+ @to_scraped_resource.register(WebPageItem)
+ @to_scraped_resource.register(TwikiPageItem)
+ def _html_page(item) -> ScrapedResource:
+ ...
+ Note: do NOT use union type hints (WebPageItem | TwikiPageItem) —
+ singledispatch ignores annotations, it dispatches on runtime type only.
+"""
+from __future__ import annotations
+
+from functools import singledispatch
+
+from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
+from src.data_manager.collectors.scrapers.items import WebPageItem, IndicoPageItem, DiscourseTopicPageItem
+
+
+@singledispatch
+def to_scraped_resource(item) -> ScrapedResource:
+ """Raises for unregistered types — fail loudly, never silently skip."""
+ raise TypeError(
+ f"No adapter registered for item type {type(item).__name__!r}. "
+ "Add @to_scraped_resource.register(YourItemClass) in this module."
+ )
+
+
+@to_scraped_resource.register(WebPageItem)
+def _html_page(item) -> ScrapedResource:
+ """
+ Handles all HTML-family pages regardless of auth method.
+
+ PDFs scraped from the web also route here — the parser sets
+ suffix="pdf" and content=bytes in the item, so no branch needed.
+ The adapter passes suffix and source_type through without inspection.
+ """
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "html"),
+ source_type=item["source_type"],
+ metadata={
+ "content_type": item.get("content_type"),
+ "encoding": item.get("encoding"),
+ "title": item.get("title"),
+ },
+ )
+
+
+@to_scraped_resource.register(IndicoPageItem)
+def _indico(item) -> ScrapedResource:
+ """
+ Indico items carry event_id and category as extra metadata.
+ These are the only fields that justify a separate dispatch branch.
+ """
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "html"),
+ source_type=item["source_type"],
+ metadata={
+ "content_type": item.get("content_type"),
+ "title": item.get("title"),
+ "event_id": item.get("event_id"),
+ "category": item.get("category"),
+ },
+ )
+
+@to_scraped_resource.register(DiscourseTopicPageItem)
+def _discourse(item) -> ScrapedResource:
+ """
+ Discourse items carry topic-level metadata from the category JSON listing.
+ """
+ return ScrapedResource(
+ url=item["url"],
+ content=item["content"],
+ suffix=item.get("suffix", "rss"),
+ source_type=item["source_type"],
+ metadata={
+ "content_type": item.get("content_type"),
+ "encoding": item.get("encoding"),
+ "title": item.get("title"),
+ "tags": item.get("tags"),
+ "has_accepted_answer": item.get("has_accepted_answer"),
+ "created_at": item.get("created_at"),
+ },
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/integrations/__init__.py b/src/data_manager/collectors/scrapers/auth/__init__.py
similarity index 100%
rename from src/data_manager/collectors/scrapers/integrations/__init__.py
rename to src/data_manager/collectors/scrapers/auth/__init__.py
diff --git a/src/data_manager/collectors/scrapers/auth/base.py b/src/data_manager/collectors/scrapers/auth/base.py
new file mode 100644
index 000000000..f85830868
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/auth/base.py
@@ -0,0 +1,96 @@
+"""
+Base auth contract: Credentials value object + AuthProvider ABC.
+
+Scrapy SoC note
+---------------
+Providers are *credential factories only*. They know how to acquire,
+validate, and refresh credentials. They have zero knowledge of Scrapy
+Requests, Responses, spiders, or pipelines. The middleware decides *when*
+to call the provider; the provider decides *how* to produce valid credentials.
+
+Credential lifecycle (owned by AuthDownloaderMiddleware):
+ 1. acquire(url) — full login flow, called lazily on the first request
+ 2. inject — middleware stamps cookies/headers onto the Request
+ 3. is_valid() — middleware may pre-check before each request (optional)
+ 4. refresh(url) — called on 401/403 or detected login-redirect
+ 5. invalidate() — marks credentials stale; next request triggers refresh
+ 6. close() — release browser/driver resources on spider_closed signal
+"""
+from __future__ import annotations
+
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class Credentials:
+    """Value object carrying whatever the downloader needs to authenticate.
+
+    Either ``cookies`` (session-based SSO) or ``headers`` (bearer token) or both.
+    Immutable by convention, not enforcement: callers never rewrite
+    cookies/headers — they call provider.refresh() to obtain a new instance.
+    The one sanctioned mutation is the internal ``_valid`` flag, flipped
+    via invalidate().
+
+    ``acquired_at`` and ``ttl_seconds`` are optional hints. If the provider
+    knows the session lifetime (e.g. from a Set-Cookie Max-Age), it sets them
+    so the middleware can pre-emptively refresh before a request fails rather
+    than waiting for a 401.
+
+    ``_valid`` is an internal flag; use invalidate() / is_valid() rather than
+    touching it directly.
+    """
+
+    cookies: List[Dict] = field(default_factory=list)
+    headers: Dict[str, str] = field(default_factory=dict)
+    # time.monotonic() so TTL checks are immune to wall-clock adjustments
+    acquired_at: float = field(default_factory=time.monotonic)
+    ttl_seconds: Optional[float] = None  # None = unknown / infinite
+    _valid: bool = field(default=True, repr=False, init=False)
+
+    def is_empty(self) -> bool:
+        # True when there is nothing to inject into a request.
+        return not self.cookies and not self.headers
+
+    def is_valid(self) -> bool:
+        """Return False if explicitly invalidated or if TTL has elapsed."""
+        if not self._valid:
+            return False
+        if self.ttl_seconds is not None:
+            return (time.monotonic() - self.acquired_at) < self.ttl_seconds
+        return True
+
+    def invalidate(self) -> None:
+        """Mark these credentials as stale.
+
+        No locking needed: all callers run on the single Twisted reactor
+        thread.
+        """
+        self._valid = False
+
+
+class AuthProvider(ABC):
+    """Abstract base for all auth providers.
+
+    Providers are credential factories only (see module docstring): they
+    acquire, validate, and refresh credentials, and know nothing about
+    Scrapy Requests/Responses.
+
+    Instantiated once per crawl inside AuthDownloaderMiddleware.from_crawler()
+    so providers can be swapped for test fakes without touching any spider.
+
+    Concrete implementations must be importable via their dotted class path
+    registered in settings.SPIDER_AUTH_PROVIDERS.
+    """
+
+    @abstractmethod
+    def acquire(self, url: str) -> Optional[Credentials]:
+        """Full authentication flow. Returns Credentials or None on failure."""
+
+    def refresh(self, url: str) -> Optional[Credentials]:
+        """Re-authenticate. Default: delegates to acquire().
+
+        Override for providers that have a cheaper refresh path (e.g. a
+        /token/refresh endpoint that doesn't need a full browser login).
+        """
+        return self.acquire(url)
+
+    def is_session_expired(self, response) -> bool:
+        """Return True if *response* indicates session expiry.
+
+        Default is False: the middleware then relies solely on its explicit
+        HTTP failure_codes list. Override for providers whose SSO signals
+        expiry via a 302→200 poison-pill (CERN) or a JSON error body (APIs).
+        """
+        return False
+
+    def close(self) -> None:
+        """Release resources (browser context, HTTP session, etc.).
+
+        Default is a no-op; override when the provider holds resources.
+        """
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/auth/cern_sso.py b/src/data_manager/collectors/scrapers/auth/cern_sso.py
new file mode 100644
index 000000000..9cf445f82
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/auth/cern_sso.py
@@ -0,0 +1,278 @@
+"""
+CERN SSO auth provider — Playwright implementation.
+
+Why Playwright over Selenium (legacy SSOScraper used Selenium)
+--------------------------------------------------------------
+The legacy SSOScraper mixed browser lifecycle, cookie collection, crawling, and
+link extraction into one class. Now that auth is a pure credential factory
+(Boundary B from the spec), the browser only needs to log in and hand back
+cookies — Playwright's sync API is less boilerplate for this narrow use case:
+
+ - No geckodriver binary management (Playwright installs its own browsers)
+ - BrowserContext.cookies() returns the exact dict format Scrapy expects
+ - context.clear_cookies() + re-login is cheaper than quitting/restarting
+ a WebDriver session — critical for mid-crawl refresh without stalling the
+ Twisted reactor for a long time
+ - storage_state() lets us persist/restore auth state across restarts if we
+ ever want that (Phase 2 enhancement)
+
+Design
+------
+One ``Browser`` instance lives for the lifetime of the crawl (created lazily).
+Each acquire/refresh operates on a fresh ``BrowserContext`` so sessions never
+bleed between attempts. The old context is closed before opening a new one.
+
+Invalidation
+------------
+The middleware calls ``credentials.invalidate()`` and then ``provider.refresh()``
+when it detects a 401, 403, or a login-page redirect. ``refresh()`` here does:
+
+ 1. close the existing BrowserContext (clearing all cookies server-side too)
+ 2. open a new BrowserContext
+ 3. navigate to the target URL (which triggers the SSO redirect)
+ 4. fill in credentials and submit
+ 5. return a fresh Credentials object
+
+The Browser process itself is NOT restarted on refresh — only the context,
+which is a lightweight operation (~200ms vs ~2s for a full browser restart).
+"""
+from __future__ import annotations
+
+import re
+from typing import Dict, List, Optional
+
+from urllib.parse import urlparse
+from playwright.sync_api import (
+ Browser,
+ BrowserContext,
+ Page,
+ Playwright,
+ sync_playwright,
+)
+
+from src.utils.env import read_secret
+from src.utils.logging import get_logger
+from .base import AuthProvider, Credentials
+
+logger = get_logger(__name__)
+
+# Keycloak login form element IDs (CERN SSO uses standard Keycloak)
+_USERNAME_SELECTOR = "#username"
+_PASSWORD_SELECTOR = "#password"
+_SUBMIT_SELECTOR = "#kc-login"
+_LOGIN_TIMEOUT_MS = 20_000  # ms — Playwright timeouts are in milliseconds
+
+# URL patterns that indicate we landed on a login page instead of content.
+# Used by the middleware to detect the SSO poison-pill (302 → /login → 200 OK).
+# NOTE(review): the bare "/login" fragment matches anywhere in a URL, so a
+# content page whose path merely contains "/login" would be misclassified
+# as a login page — confirm this is acceptable for the crawled sites.
+LOGIN_URL_PATTERNS: List[str] = [
+    r"auth\.cern\.ch",
+    r"/login",
+    r"/sso/",
+    r"keycloak",
+]
+_LOGIN_RE = re.compile("|".join(LOGIN_URL_PATTERNS), re.IGNORECASE)
+
+
+def looks_like_login_page(url: str) -> bool:
+ """Return True if *url* matches known CERN SSO login page patterns.
+
+ Exported so the middleware can call it from process_response without
+ importing the whole provider.
+ """
+ return bool(_LOGIN_RE.search(url))
+
+
+class CERNSSOProvider(AuthProvider):
+    """Acquires CERN SSO session cookies via a headless Playwright browser.
+
+    Args:
+        username: CERN SSO username. Falls back to SSO_USERNAME secret.
+        password: CERN SSO password. Falls back to SSO_PASSWORD secret.
+        headless: Run browser headlessly (default True).
+        browser_type: 'chromium' | 'firefox' | 'webkit' (default 'chromium').
+            Chromium is faster for headless cookie extraction.
+        slow_mo_ms: Playwright slow-motion delay in ms. 0 in production,
+            useful for debugging (e.g. 500).
+
+    Raises:
+        ValueError: from __init__ when no username/password can be resolved.
+    """
+
+    def __init__(
+        self,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        headless: bool = True,
+        browser_type: str = "chromium",
+        slow_mo_ms: int = 0,
+    ) -> None:
+        self.username: str = username or read_secret("SSO_USERNAME") or ""
+        self.password: str = password or read_secret("SSO_PASSWORD") or ""
+        self.headless = headless
+        self.browser_type = browser_type
+        self.slow_mo_ms = slow_mo_ms
+
+        # Fail fast at construction time: a crawl that discovers missing
+        # credentials mid-run is far harder to diagnose.
+        if not self.username or not self.password:
+            raise ValueError(
+                "CERNSSOProvider requires SSO_USERNAME and SSO_PASSWORD. "
+                "Set them as secrets or pass them explicitly."
+            )
+
+        # Lazily initialised — browser starts only when acquire() is first called.
+        self._playwright: Optional[Playwright] = None
+        self._browser: Optional[Browser] = None
+        self._context: Optional[BrowserContext] = None
+
+        logger.info(
+            "CERNSSOProvider ready (browser=%s, headless=%s)",
+            browser_type,
+            headless,
+        )
+
+    # ------------------------------------------------------------------
+    # AuthProvider contract
+    # ------------------------------------------------------------------
+
+    def acquire(self, url: str) -> Optional[Credentials]:
+        """Full CERN SSO login flow. Returns cookies as Credentials."""
+        self._ensure_browser()
+        self._open_fresh_context()
+        return self._login_and_extract(url)
+
+    def refresh(self, url: str) -> Optional[Credentials]:
+        """Refresh by wiping the existing context and re-logging in.
+
+        Reuses the running Browser process — only the BrowserContext is
+        discarded, which is fast (~200 ms) and avoids stalling the Twisted
+        reactor for a full browser restart.
+        """
+        logger.info("CERNSSOProvider: refreshing session for %s", url)
+        self._close_context()  # wipe cookies server-side
+        self._open_fresh_context()  # blank slate
+        return self._login_and_extract(url)
+
+    def is_session_expired(self, response) -> bool:
+        # Poison-pill detection: a 200 whose final URL is a login page
+        # means the session cookies are no longer honoured.
+        return looks_like_login_page(response.url)
+
+    def close(self) -> None:
+        """Quit the browser process. Called by middleware on spider_closed."""
+        self._close_context()
+        if self._browser:
+            try:
+                self._browser.close()
+            except Exception as exc:
+                # Best-effort teardown: never let cleanup mask a crawl result.
+                logger.debug("CERNSSOProvider: browser.close() raised: %s", exc)
+            finally:
+                self._browser = None
+        if self._playwright:
+            try:
+                self._playwright.stop()
+            except Exception as exc:
+                logger.debug("CERNSSOProvider: playwright.stop() raised: %s", exc)
+            finally:
+                self._playwright = None
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _ensure_browser(self) -> None:
+        # Idempotent: starts Playwright and the browser at most once each.
+        if self._playwright is None:
+            self._playwright = sync_playwright().start()
+        if self._browser is None:
+            # browser_type selects the launcher attribute by name
+            # ('chromium' / 'firefox' / 'webkit').
+            launcher = getattr(self._playwright, self.browser_type)
+            self._browser = launcher.launch(
+                headless=self.headless,
+                slow_mo=self.slow_mo_ms,
+            )
+            logger.info(
+                "CERNSSOProvider: %s browser started (headless=%s)",
+                self.browser_type,
+                self.headless,
+            )
+
+    def _open_fresh_context(self) -> None:
+        """Close any existing context and open a blank new one."""
+        self._close_context()
+        assert self._browser is not None
+        self._context = self._browser.new_context(
+            # NOTE(review): this flag only relaxes TLS certificate checks
+            # (handy behind CERN-internal certs); it does NOT affect cookie
+            # policy as the previous comment claimed — confirm it is wanted.
+            ignore_https_errors=True,
+        )
+
+    def _close_context(self) -> None:
+        # Safe to call when no context is open.
+        if self._context:
+            try:
+                self._context.close()
+            except Exception as exc:
+                logger.debug("CERNSSOProvider: context.close() raised: %s", exc)
+            finally:
+                self._context = None
+
+    def _login_and_extract(self, url: str) -> Optional[Credentials]:
+        """Navigate to *url*, complete SSO login, return Credentials.
+
+        Returns None when the login form cannot be completed or any
+        navigation step fails; the page is always closed on exit.
+        """
+        assert self._context is not None
+        page: Page = self._context.new_page()
+        try:
+            page.goto(url, wait_until="networkidle", timeout=30_000)
+
+            # Public page: loaded directly without SSO redirect — return whatever
+            # cookies the browser has (may be empty, that's fine for public pages).
+            if not looks_like_login_page(page.url):
+                # Try the site root — some sites like Discourse only redirect on the homepage
+                origin = f"{urlparse(url).scheme}://{urlparse(url).netloc}/"
+                page.goto(origin, wait_until="networkidle", timeout=30_000)
+                if not looks_like_login_page(page.url):
+                    # Still no SSO redirect — return whatever cookies we have
+                    raw_cookies = self._context.cookies()
+                    logger.info("CERNSSOProvider: no SSO redirect for %s, returning browser cookies", url)
+                    return Credentials(cookies=raw_cookies)
+
+            # We are on the Keycloak form (either directly or via the
+            # origin redirect above) — submit credentials.
+            if not self._fill_login_form(page):
+                return None
+
+            # After submit, wait for navigation away from the login page.
+            page.wait_for_url(
+                lambda u: not looks_like_login_page(u),
+                timeout=_LOGIN_TIMEOUT_MS,
+            )
+
+            # Navigate back to the original URL so all domain cookies are set.
+            page.goto(url, wait_until="networkidle", timeout=30_000)
+
+            raw_cookies: List[Dict] = self._context.cookies()
+            logger.debug(
+                "CERNSSOProvider: acquired %d cookies for %s",
+                len(raw_cookies),
+                url,
+            )
+            return Credentials(cookies=raw_cookies)
+
+        except Exception as exc:
+            # Broad catch is deliberate: any Playwright failure (timeout,
+            # navigation error) must degrade to "no credentials", not crash
+            # the crawl.
+            logger.error(
+                "CERNSSOProvider: login flow failed for %s: %s",
+                url,
+                exc,
+                exc_info=True,
+            )
+            return None
+        finally:
+            try:
+                page.close()
+            except Exception:
+                pass
+
+    def _fill_login_form(self, page: Page) -> bool:
+        """Fill in and submit the Keycloak login form.
+
+        Returns True if the submit was reached without timeout.
+        """
+        try:
+            page.wait_for_selector(_USERNAME_SELECTOR, timeout=_LOGIN_TIMEOUT_MS)
+            page.fill(_USERNAME_SELECTOR, self.username)
+            page.fill(_PASSWORD_SELECTOR, self.password)
+            page.click(_SUBMIT_SELECTOR)
+            logger.info("CERNSSOProvider: login form submitted")
+            return True
+        except Exception as exc:
+            logger.error(
+                "CERNSSOProvider: could not find/fill login form: %s", exc
+            )
+            return False
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/integrations/git_scraper.py b/src/data_manager/collectors/scrapers/integrations/git_scraper.py
deleted file mode 100644
index 7d73fd37a..000000000
--- a/src/data_manager/collectors/scrapers/integrations/git_scraper.py
+++ /dev/null
@@ -1,353 +0,0 @@
-import os
-import re
-import shutil
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
-
-from git import Repo
-from mkdocs.utils.yaml import yaml_load
-
-from src.utils.config_access import get_global_config
-from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
- from src.data_manager.collectors.scrapers.scraper_manager import \
- ScraperManager
-
-global_config = get_global_config()
-
-class GitScraper:
- """Scraper integration that clones Git repositories and indexes MkDocs sites and code files."""
-
- def __init__(self, manager: "ScraperManager", git_config: Optional[Dict[str, Any]] = None) -> None:
- self.manager = manager
- self.config = git_config or {}
-
- # where we clone our repos to
- self.data_path = global_config["DATA_PATH"]
- self.git_dir = Path(self.data_path) / "raw_git_repos"
- self.git_dir.mkdir(parents=True, exist_ok=True)
-
- self.code_suffixes = {
- suffix.lower()
- for suffix in (
- self.config.get(
- "code_suffixes",
- [
- ".py",
- ".js",
- ".ts",
- ".tsx",
- ".jsx",
- ".java",
- ".go",
- ".rs",
- ".c",
- ".cpp",
- ".h",
- ".hpp",
- ".sh",
- ".sql",
- ".json",
- ".yaml",
- ".yml",
- ".toml",
- ".md",
- ".txt",
- ],
- )
- or []
- )
- }
- self.exclude_dirs = {
- dir_name
- for dir_name in (
- self.config.get(
- "exclude_dirs",
- [
- ".git",
- "node_modules",
- ".venv",
- "venv",
- "__pycache__",
- ".idea",
- ".vscode",
- "dist",
- "build",
- ],
- )
- or []
- )
- }
- self.max_file_size_bytes = int(self.config.get("max_file_size_bytes", 1_000_000))
-
- self.git_username = read_secret("GIT_USERNAME")
- self.git_token = read_secret("GIT_TOKEN")
- self._credentials_available = bool(self.git_username and self.git_token)
- if not self._credentials_available:
- logger.info("No git credentials supplied; will attempt public repo cloning.")
-
- def collect(self, git_urls: List[str]) -> List[ScrapedResource]:
- if not git_urls:
- logger.warning("No git URLs provided for scraping; skipping git scraper.")
- return []
-
- harvested: List[ScrapedResource] = []
-
- for url in git_urls:
- try:
- repo_info = self._prepare_repository(url)
- except ValueError as exc:
- logger.info(f"{exc}")
- continue
- except Exception as exc:
- logger.error(f"Failed to clone {url}: {exc}")
- continue
-
- try:
- harvested.extend(self._harvest_repository(repo_info))
- finally:
- shutil.rmtree(repo_info["repo_path"], ignore_errors=True)
-
- if harvested:
- logger.info("Git scraping was completed successfully")
-
- return harvested
-
- def _prepare_repository(self, url: str) -> Dict[str, Any]:
- url_dict = self._parse_url(url)
- repo_path = self._clone_repo(url_dict)
- mkdocs_site_url = self._read_mkdocs_site_url(repo_path)
- ref = self._determine_ref(repo_path, url_dict["branch"])
- web_base_url = self._compute_web_base_url(url_dict["original_url"])
-
- return {
- "repo_path": repo_path,
- "repo_name": url_dict["repo_name"],
- "mkdocs_site_url": mkdocs_site_url,
- "ref": ref,
- "web_base_url": web_base_url,
- }
-
- def _harvest_repository(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- resources: List[ScrapedResource] = []
- resources.extend(self._harvest_mkdocs(repo_info))
- resources.extend(self._harvest_code(repo_info))
- return resources
-
- def _harvest_mkdocs(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- repo_path = repo_info["repo_path"]
- mkdocs_site_url = repo_info["mkdocs_site_url"]
- base_url = repo_info["web_base_url"]
- ref = repo_info["ref"]
- docs_dir = repo_path / "docs"
- if not docs_dir.exists():
- logger.info(f"Skipping MkDocs harvesting for {repo_path}; missing docs directory")
- return []
-
- resources: List[ScrapedResource] = []
- parent_repo = repo_info["repo_name"]
- used_blob_links = False
- for markdown_path in docs_dir.rglob("*.md"):
- if mkdocs_site_url:
- current_url = mkdocs_site_url + markdown_path.relative_to(docs_dir).with_suffix("").as_posix()
- else:
- current_url = self._build_blob_url(base_url, ref, markdown_path.relative_to(repo_path))
- used_blob_links = True
- logger.info(f"Indexing Git doc: {current_url}")
- text_content = markdown_path.read_text(encoding="utf-8")
- relative_path = Path(parent_repo) / markdown_path.relative_to(repo_path)
- resource = ScrapedResource(
- url=current_url,
- content=text_content,
- suffix=markdown_path.suffix.lstrip(".") or "txt",
- source_type="git",
- metadata={
- "repo_path": str(markdown_path.relative_to(repo_path)),
- "title": markdown_path.stem.replace("_", " ").replace("-", " ").title(),
- "parent": parent_repo,
- },
- file_name=markdown_path.name,
- relative_path=str(relative_path),
- )
- if resource.content:
- resources.append(resource)
- else:
- logger.info(f"Resource {current_url} is empty. Skipping...")
-
- if used_blob_links and not mkdocs_site_url:
- logger.info(f"Used repository blob URLs for MkDocs content in {repo_path} (site_url missing)")
-
- return resources
-
- def _harvest_code(self, repo_info: Dict[str, Any]) -> List[ScrapedResource]:
- repo_path = repo_info["repo_path"]
- ref = repo_info["ref"]
- base_url = repo_info["web_base_url"]
- repo_name = repo_info["repo_name"]
-
- resources: List[ScrapedResource] = []
- for file_path in self._iter_code_files(repo_path):
- logger.debug(file_path)
- rel_path = file_path.relative_to(repo_path)
-
- # avoid overlap wtih _harvest_mkdocs
- if rel_path.parts and rel_path.parts[0] == "docs" and file_path.suffix.lower() == ".md":
- continue
-
- try:
- if file_path.stat().st_size > self.max_file_size_bytes:
- logger.warning(f"Skipping {file_path} due to file size")
- continue
- except OSError:
- continue
-
- if not self._is_allowed_suffix(file_path):
- logger.warning(f"Skipping {file_path} due to disallowed suffix")
- continue
-
- if self._looks_binary(file_path):
- logger.warning(f"Skipping {file_path} due to likely binary content")
- continue
-
- try:
- text_content = file_path.read_text(encoding="utf-8", errors="ignore")
- except Exception:
- continue
-
- if not text_content.strip():
- continue
-
- resource_url = self._build_blob_url(base_url, ref, rel_path)
- relative_path = Path(repo_name) / rel_path
- resource = ScrapedResource(
- url=resource_url,
- content=text_content,
- suffix=file_path.suffix.lstrip("."),
- source_type="git",
- metadata={
- "repo_path": str(rel_path),
- "parent": repo_name,
- "ref": ref,
- },
- file_name=file_path.name,
- relative_path=str(relative_path),
- )
- resources.append(resource)
-
- return resources
-
- def _parse_url(self, url: str) -> dict:
- branch_name = None
-
- regex_repo_name = r"(?:github|gitlab)\.[\w.]+\/[^\/]+\/([\w.-]+)(?:\.git|\/|$)"
- match = re.search(regex_repo_name, url, re.IGNORECASE)
- if not match:
- raise ValueError(f"The git url {url} does not match the expected format.")
-
- repo_name = match.group(1)
-
- # Only inject credentials if available (for private repos)
- if self._credentials_available:
- if "gitlab" in url:
- clone_from_url = url.replace("gitlab", f"{self.git_username}:{self.git_token}@gitlab")
- elif "github" in url:
- clone_from_url = url.replace("github", f"{self.git_username}:{self.git_token}@github")
- else:
- # For other hosts, try without credentials
- clone_from_url = url
- else:
- # No credentials - use URL as-is (for public repos)
- clone_from_url = url
-
- branch_split = re.split(r"/(?:-/)?tree/", clone_from_url, maxsplit=1)
- if len(branch_split) > 1:
- branch_name = branch_split[1].strip("/") or None
- clone_from_url = branch_split[0].rstrip("/")
-
- return {
- "original_url": url,
- "clone_url": clone_from_url,
- "repo_name": repo_name,
- "branch": branch_name,
- }
-
- def _clone_repo(self, url_dict: dict) -> Path:
- clone_url = url_dict["clone_url"]
- branch = url_dict["branch"]
- repo_name = url_dict["repo_name"]
-
- logger.info(f"Cloning repository {repo_name}...")
-
- repo_path = self.git_dir / repo_name
- if branch is None:
- Repo.clone_from(clone_url, repo_path)
- else:
- Repo.clone_from(clone_url, repo_path, branch=branch)
-
- return repo_path
-
- def _read_mkdocs_site_url(self, repo_path: Path) -> Optional[str]:
- mkdocs_file = repo_path / "mkdocs.yml"
- if not mkdocs_file.exists():
- return None
- try:
- with mkdocs_file.open("r") as file:
- data = yaml_load(file)
- site_url = data.get("site_url")
- if not site_url:
- return None
- return site_url if site_url.endswith("/") else site_url + "/"
- except Exception:
- logger.info(f"Could not read mkdocs.yml in {repo_path}")
- return None
-
- def _compute_web_base_url(self, original_url: str) -> str:
- sanitized = re.sub(r"//[^@/]+@", "//", original_url)
- sanitized = re.split(r"/(?:-/)?tree/", sanitized, maxsplit=1)[0]
- if sanitized.endswith(".git"):
- sanitized = sanitized[:-4]
- return sanitized.rstrip("/")
-
- def _determine_ref(self, repo_path: Path, requested_branch: Optional[str]) -> str:
- if requested_branch:
- return requested_branch
- repo: Optional[Repo] = None
- try:
- repo = Repo(repo_path)
- return repo.active_branch.name
- except Exception:
- try:
- repo = repo or Repo(repo_path)
- return repo.head.commit.hexsha[:7]
- except Exception:
- return "main"
-
- def _iter_code_files(self, repo_path: Path):
- for root, dirs, files in os.walk(repo_path):
- dirs[:] = [d for d in dirs if d not in self.exclude_dirs]
- for filename in files:
- file_path = Path(root) / filename
- yield file_path
-
- def _is_allowed_suffix(self, file_path: Path) -> bool:
- return file_path.suffix.lower() in self.code_suffixes
-
- def _looks_binary(self, file_path: Path) -> bool:
- try:
- with file_path.open("rb") as file:
- sample = file.read(8000)
- return b"\0" in sample
- except Exception:
- return True
-
- def _build_blob_url(self, base_url: str, ref: str, rel_path: Path) -> str:
- base = base_url.rstrip("/")
- rel = rel_path.as_posix()
- if "gitlab" in base:
- return f"{base}/-/blob/{ref}/{rel}"
- return f"{base}/blob/{ref}/{rel}"
diff --git a/src/data_manager/collectors/scrapers/integrations/sso_scraper.py b/src/data_manager/collectors/scrapers/integrations/sso_scraper.py
deleted file mode 100644
index d03877bfb..000000000
--- a/src/data_manager/collectors/scrapers/integrations/sso_scraper.py
+++ /dev/null
@@ -1,466 +0,0 @@
-import hashlib
-import importlib
-import json
-import os
-import re
-import time
-import urllib.parse
-from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple
-
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.common.exceptions import TimeoutException
-
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource, BrowserIntermediaryResult
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-class SSOScraper(ABC):
- """Generic base class for SSO-authenticated web scrapers."""
-
- def __init__(self, username=None, password=None, headless=True, site_type="generic", max_depth=2, selenium_url=None):
- """Initialize the SSO scraper with credentials and browser settings.
-
- Args:
- username (str, optional): SSO username. If None, will try to get from env vars.
- password (str, optional): SSO password. If None, will try to get from env vars.
- headless (bool): Whether to run the browser in headless mode.
- site_type (str): Type of site to scrape ('generic' or 'mkdocs')
- max_depth (int): Maximum number of levels to crawl per page.
- """
- self.username = username or self.get_username_from_env()
- self.password = password or self.get_password_from_env()
- self.headless = headless
- self.max_depth = max_depth
- self.site_type = site_type
- self.driver = None
- self.visited_urls = set()
- self.selenium_url = selenium_url
-
- if self.username:
- logger.info(f"Using username: {self.username}")
-
- def _is_image_url(self, url: str) -> bool:
- """Check if URL points to an image file."""
- image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.webp')
- parsed_url = urllib.parse.urlparse(url)
- path = parsed_url.path.lower()
- return any(path.endswith(ext) for ext in image_extensions)
-
- @abstractmethod
- def get_username_from_env(self):
- """Get username from environment variables. Override in subclasses."""
- pass
-
- @abstractmethod
- def get_password_from_env(self):
- """Get password from environment variables. Override in subclasses."""
- pass
-
- @abstractmethod
- def login(self):
- """Login to SSO with the provided credentials. Override in subclasses."""
- pass
-
- def setup_driver(self):
- """Configure and initialize the Firefox WebDriver."""
- firefox_options = FirefoxOptions()
- if self.headless:
- firefox_options.add_argument("--headless")
-
- # Additional options for better performance in containers
- firefox_options.add_argument("--no-sandbox")
- firefox_options.add_argument("--disable-dev-shm-usage")
- firefox_options.add_argument("--disable-gpu")
- firefox_options.add_argument("--window-size=1920,1080")
-
- # Create Firefox profile with preferences
- firefox_profile = webdriver.FirefoxProfile()
- firefox_profile.set_preference("dom.disable_open_during_load", False)
- firefox_profile.set_preference("browser.download.folderList", 2)
- firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
- firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
-
- # Initialize the driver with options
- if self.selenium_url:
- self.driver = webdriver.Remote(command_executor=self.selenium_url,options=firefox_options)
- else:
- self.driver = webdriver.Firefox(options=firefox_options)
- self.driver.set_page_load_timeout(30)
- logger.info(f"Starting Firefox browser in {'headless' if self.headless else 'visible'} mode...")
- return self.driver
-
- def navigate_to(self, url, wait_time=1):
- """Navigate to specified URL and wait for page to load."""
- if not self.driver:
- raise RuntimeError("WebDriver not initialized. Call setup_driver() first.")
-
- self.driver.get(url)
- time.sleep(wait_time) # Enable wait time for page loading
- logger.info(f"Navigated to {url}")
- logger.info(f"Page title: {self.driver.title}")
- return self.driver.title
-
- def get_links_with_same_hostname(self, base_url):
- """Extract all links from the current page that have the same hostname as base_url."""
- base_hostname = urllib.parse.urlparse(base_url).netloc
- links = []
-
- # Find all anchor tags
- if self.site_type == "mkdocs":
- # For MkDocs, prioritize navigation links
- anchors = self.driver.find_elements(By.CSS_SELECTOR, ".md-nav__link, .md-content a")
- else:
- anchors = self.driver.find_elements(By.TAG_NAME, "a")
-
- for anchor in anchors:
- try:
- href = anchor.get_attribute("href")
- if href and href.strip():
- parsed_url = urllib.parse.urlparse(href)
- # Check if the link has the same hostname and is not a fragment
- if parsed_url.netloc == base_hostname and parsed_url.scheme in ('http', 'https'):
- # Normalize the URL to prevent duplicates
- normalized_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
- if parsed_url.query:
- normalized_url += f"?{parsed_url.query}"
-
- # this works for CMS twiki but should be generalized
- normalized_url = normalized_url.split("?")[0]
- if 'bin/rdiff' in normalized_url or 'bin/edit' in normalized_url or 'bin/oops' in normalized_url or 'bin/attach' in normalized_url or 'bin/genpdf' in normalized_url or '/WebIndex' in normalized_url:
- continue
-
- if not self._clear_url(normalized_url):
- continue
-
- # Skip image files
- if self._is_image_url(normalized_url):
- logger.debug(f"Skipping image URL: {normalized_url}")
- continue
-
- links.append(normalized_url)
-
- except Exception as e:
- logger.error(f"Error extracting link: {e}")
-
- return list(set(links)) # Remove duplicates
-
- def extract_page_data(self, current_url):
- """Return the raw HTML payload for the current page."""
- if not self.driver:
- raise RuntimeError("WebDriver not initialized. Call setup_driver() first.")
-
- title = self.driver.title or ""
- content = self.driver.page_source or ""
-
- return {
- "url": current_url,
- "title": title,
- "content": content,
- "suffix": "html",
- }
-
- def crawl(self, start_url):
- """Crawl pages starting from the given URL, storing title and content of each page.
-
- Args:
- start_url (str): The URL to start crawling from
-
- Returns:
- List[Dict]: A list of dictionaries describing each visited page.
- """
- max_depth = self.max_depth
- depth = 0
-
- if not self.driver:
- self.setup_driver()
-
- # Reset crawling state
- self.visited_urls = set()
- self.page_data = []
- to_visit = [start_url]
- level_links = []
-
- # First authenticate through the start URL
- self.authenticate_and_navigate(start_url)
-
- base_hostname = urllib.parse.urlparse(start_url).netloc
- logger.info(f"Base hostname for crawling: {base_hostname}")
- logger.info(f"Site type: {self.site_type}")
-
- # History record
- pages_visited = 0
- self.visited_urls = set()
-
- while to_visit and depth < max_depth:
- current_url = to_visit.pop(0)
-
- # Skip if we've already visited this URL
- if current_url in self.visited_urls:
- continue
-
- # Skip image files
- if self._is_image_url(current_url):
- logger.debug(f"Skipping image URL: {current_url}")
- self.visited_urls.add(current_url)
- continue
-
- logger.info(f"Crawling page {depth + 1}/{max_depth}: {current_url}")
-
- try:
- # Navigate to the page
- self.navigate_to(current_url, wait_time=2)
-
- # Mark as visited
- self.visited_urls.add(current_url)
- pages_visited += 1
-
- # Extract and store page data
- page_data = self.extract_page_data(current_url)
- self.page_data.append(page_data)
- logger.info(f"Extracted data from {current_url} ({len(page_data['content'])} chars)")
-
- # Get links to follow
- new_links = self.get_links_with_same_hostname(current_url)
- logger.info(f"Found {len(new_links)} links on the page (nv: {pages_visited})")
-
- # Add new links to visit
- for link in new_links:
- if link not in self.visited_urls and link not in to_visit and link not in level_links:
- logger.info(f"Found new link: {link} (nv: {pages_visited})")
- level_links.append(link)
-
- # Scan next level if to_visit is empty
- if not to_visit:
- to_visit.extend(level_links)
- level_links = []
- depth += 1
-
- except Exception as e:
- logger.info(f"Error crawling {current_url}: {e}", exc_info=True)
- self.visited_urls.add(current_url) # Mark as visited to avoid retrying
-
- logger.info(f"Crawling complete. Visited {pages_visited} pages.")
- return list(self.page_data)
-
- def _clear_url(self, url: str) -> bool:
- """Basic filtering for duplicate or fragment-only URLs."""
- if not url:
- return False
-
- # Ignore pure fragments or JavaScript links
- if url.startswith("javascript:"):
- return False
-
- return True
-
- def close(self):
- """Close the browser and clean up resources."""
- if self.driver:
- logger.info("Closing browser...")
- self.driver.quit()
- self.driver = None
-
- def authenticate_and_navigate(self, url):
- """Complete authentication flow and navigate to target URL."""
-
- if not self.driver:
- self.setup_driver()
-
- try:
- # First navigate to trigger SSO
- self.driver.get(url)
-
- # Login
- if self.login():
- # Navigate back to target page
- title = self.navigate_to(url)
- return title
- else:
- return None
- except Exception as e:
- logger.warning(f"Error during authentication: {e}", exc_info=True)
- return None
-
- def authenticate(self, url):
- """Complete authentication flow and navigate to target URL."""
- try:
- if not self.driver:
- self.setup_driver()
-
- # First navigate to trigger SSO
- self.driver.get(url)
-
- # Login
- if self.login():
- # Navigate back to target page
- return self.driver.get_cookies()
- else:
- return None
- except Exception as e:
- logger.warning(f"Error during authentication: {e}", exc_info=True)
- return None
-
- def __enter__(self):
- """Context manager entry point."""
- self.setup_driver()
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- """Context manager exit point."""
- self.close()
-
-
-class CERNSSOScraper(SSOScraper):
- """A scraper to handle CERN SSO authentication and page navigation."""
-
- def get_username_from_env(self):
- """Get CERN SSO username from environment variables."""
- return read_secret("SSO_USERNAME")
-
- def get_password_from_env(self):
- """Get CERN SSO password from environment variables."""
- return read_secret("SSO_PASSWORD")
-
- def login(self):
- """Login to CERN SSO with the provided credentials."""
- if not self.username or not self.password:
- raise ValueError("Missing credentials for CERN SSO")
-
- try:
- wait = WebDriverWait(self.driver, 20)
-
- # Wait for login form to appear
- username_input = wait.until(
- EC.presence_of_element_located((By.ID, "username"))
- )
- username_input.send_keys(self.username)
- # time.sleep(1) # Optional sleep to ensure the input is registered
-
- password_input = wait.until(EC.presence_of_element_located((By.ID, "password")))
- password_input.send_keys(self.password)
- # time.sleep(1) # Optional sleep to ensure the input is registered
-
- sign_in = wait.until(EC.presence_of_element_located((By.ID, "kc-login")))
- sign_in.click()
-
- logger.info("Login credentials submitted")
- return True
- except TimeoutException as e:
- logger.error(f"Could not find username or password fields in due time: {e}", exc_info=True)
- except Exception as e:
- logger.error(f"Error during login: {e}",exc_info=True)
- return False
-
-
-class SSOCollector:
- """Collects resources behind SSO-protected URLs using configured scrapers."""
-
- def __init__(self, selenium_config: Dict[str, Dict]) -> None:
- self._config = selenium_config or {}
- self._enabled = self._config.get("enabled", False)
- self._class_name = self._config.get("selenium_class", "")
- self._class_map = self._config.get("selenium_class_map", {})
-
- def collect(self, url: str) -> List[ScrapedResource]:
- if not self._enabled:
- logger.error("SSO is disabled or not configured")
- return []
-
- scraper_class, scraper_kwargs = self._resolve_scraper()
- if scraper_class is None:
- return []
-
- try:
- with scraper_class(**scraper_kwargs) as scraper:
- payload = scraper.crawl(url)
- resources = self._extract_resources(scraper, payload)
- if not resources:
- logger.warning(f"No content extracted from SSO crawl for {url}")
- return resources
- except Exception as exc: # pragma: no cover - defensive catch
- logger.error(f"SSO scraping failed for {url}: {exc}")
- return []
-
- def _resolve_scraper(self):
- entry = self._class_map.get(self._class_name)
- if not entry:
- logger.error(f"SSO class {self._class_name} not configured")
- return None, {}
-
- scraper_class = entry.get("class")
- if isinstance(scraper_class, str):
- module_name = entry.get(
- "module",
- "src.data_manager.collectors.scrapers.integrations.sso_scraper",
- )
- module = importlib.import_module(module_name)
- scraper_class = getattr(module, scraper_class)
-
- scraper_kwargs = entry.get("kwargs", {})
- return scraper_class, scraper_kwargs
-
- def _extract_resources(self, scraper, payload) -> List[ScrapedResource]:
- resources: List[ScrapedResource] = []
-
- page_data = getattr(scraper, "page_data", None)
- if isinstance(page_data, list):
- for page in page_data:
- if not isinstance(page, dict):
- continue
- page_url = page.get("url")
- content = page.get("content")
- if not page_url or content is None:
- continue
-
- resources.append(
- ScrapedResource(
- url=page_url,
- content=content,
- suffix=page.get("suffix", "html"),
- source_type="sso",
- metadata={
- "title": page.get("title"),
- },
- )
- )
-
- elif isinstance(payload, list):
- for item in payload:
- if not isinstance(item, dict):
- continue
- page_url = item.get("url")
- content = item.get("content")
- if not page_url or content is None:
- continue
- resources.append(
- ScrapedResource(
- url=page_url,
- content=content,
- suffix=item.get("suffix", "html"),
- source_type="sso",
- metadata={
- "visible": str(self._visible).lower(),
- },
- )
- )
-
- elif isinstance(payload, dict):
- for page_url in payload.values():
- logger.warning(
- f"SSO scraper returned mapping without page content; skipping {page_url}"
- )
-
- elif payload is not None:
- logger.warning(
- f"Unsupported SSO payload type {type(payload).__name__}"
- )
-
- return resources
diff --git a/src/data_manager/collectors/scrapers/items.py b/src/data_manager/collectors/scrapers/items.py
new file mode 100644
index 000000000..4e9affb71
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/items.py
@@ -0,0 +1,84 @@
+"""
+Scrapy intuition — Items as the data contract (FR-7a):
+
+ Items sit between Parser and Adapter.
+ Their field schema must be driven by what the Adapter needs
+ to construct a ScrapedResource — not by what's convenient
+ to inspect during development.
+
+ Wrong mental model: "what fields help me debug?"
+ Right mental model: "what fields does ScrapedResource.__init__ need?"
+
+ ScrapedResource fields (from scraped_resource.py):
+ url — required
+ content — required (str or bytes)
+ suffix — required
+ source_type — required ("web", "sso", "git")
+ metadata — dict, optional (title, content_type, encoding, etc.)
+ file_name — optional
+ relative_path — optional
+
+ So items carry exactly those fields.
+ Debug fields (body_preview, body_length) belong in logger calls,
+ not in the item schema — otherwise the adapter becomes a translation
+ layer for data that should never have been structured in the first place.
+
+SOLID note — Open/Closed:
+ Add new Item subclasses for new source types.
+ Do not add source-specific fields to the base class.
+ The adapter is the extension point, not the Item.
+"""
+
+import scrapy
+
+
+class BasePageItem(scrapy.Item):
+    """
+    Common fields shared across all scraped source types.
+    Maps directly to ScrapedResource constructor arguments.
+    """
+    url = scrapy.Field()
+    content = scrapy.Field()  # Full text or bytes — NOT a preview
+    suffix = scrapy.Field()  # "html", "pdf", "md" etc.
+    # NOTE(review): the module docstring says ScrapedResource.source_type is one
+    # of ("web", "sso", "git"), but the values listed below differ — confirm the
+    # canonical set against scraped_resource.py before relying on either list.
+    source_type = scrapy.Field()  # "web" | "twiki" | "indico" | "discourse"
+
+    # Metadata fields — become ScrapedResource.metadata dict
+    title = scrapy.Field()
+    content_type = scrapy.Field()  # HTTP Content-Type header value
+    encoding = scrapy.Field()  # HTTP response encoding
+
+    # Optional — used by git/SSO scrapers for filesystem layout
+    file_name = scrapy.Field()
+    relative_path = scrapy.Field()
+
+
+class WebPageItem(BasePageItem):
+    """
+    Generic page item — works for SSO-protected pages and ordinary public
+    web pages alike. No extra fields needed beyond BasePageItem.
+    Subclassing is the extension point (OCP): Twiki quirks
+    belong in parse_twiki_page(), not in a bloated base class.
+    """
+    pass
+
+class DiscourseTopicPageItem(BasePageItem):
+    """
+    Discourse topic item.
+    Carries topic-level metadata from the category JSON listing —
+    useful for naming, filtering, and status tracking in the adapter.
+    """
+    topic_id = scrapy.Field()  # numeric topic identifier from the listing
+    slug = scrapy.Field()  # URL slug of the topic
+    has_accepted_answer = scrapy.Field()  # accepted-answer flag from the listing JSON
+    created_at = scrapy.Field()  # creation timestamp as provided by the API
+    tags = scrapy.Field()  # tag names attached to the topic
+
+class IndicoPageItem(BasePageItem):
+    """
+    Indico-specific item.
+    Indico API responses carry an event_id and category — useful
+    for metadata routing in the adapter without polluting the base.
+    """
+    event_id = scrapy.Field()  # Indico event identifier
+    category = scrapy.Field()  # Indico category the event belongs to
+
diff --git a/src/data_manager/collectors/scrapers/middlewares/__init__.py b/src/data_manager/collectors/scrapers/middlewares/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/middlewares/auth_downloader.py b/src/data_manager/collectors/scrapers/middlewares/auth_downloader.py
new file mode 100644
index 000000000..db35989e0
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/middlewares/auth_downloader.py
@@ -0,0 +1,372 @@
+"""
+AuthDownloaderMiddleware — the single place where auth intersects Scrapy's
+request/response lifecycle.
+
+Everything else (spiders, parsers, pipelines) is auth-blind.
+
+Middleware ordering (FR-3a — must be documented here per spec)
+--------------------------------------------------------------
+Request path (outbound):
+ 500 AuthDownloaderMiddleware ← injects cookies/tokens FIRST
+ 550 RetryMiddleware ← retries after credentials are attached
+ 600 RedirectMiddleware ← follows 302s last
+
+Response path (inbound — reversed order):
+ 600 RedirectMiddleware ← resolves 302, re-queues new URL
+ 550 RetryMiddleware ← handles transport errors
+ 500 AuthDownloaderMiddleware ← sees the FINAL response (200 / 401 / 403)
+ or catches the SSO poison-pill 200
+
+Why auth before retry?
+ If RetryMiddleware ran before auth on the *request* path, retried requests
+ would carry no credentials and immediately receive another 401. The retry
+ counter exhausts before auth can refresh. Placing auth at 500 ensures
+ every outbound request carries valid credentials before retry even fires.
+
+Why we do NOT handle 302 directly
+ Scrapy's RedirectMiddleware (600) follows 302s before our middleware sees
+ the response — we receive the final destination status. *However*, CERN
+ SSO signals session expiry with a silent 302 → /login → 200 OK chain.
+ The final 200 looks healthy but contains a login page. We detect this
+ in process_response via ``_is_login_redirect(response)``.
+
+Required settings (settings.py)
+--------------------------------
+ DOWNLOADER_MIDDLEWARES = {
+ "src.data_manager.collectors.scrapers.middlewares.AuthDownloaderMiddleware": 500,
+ "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
+ # RedirectMiddleware stays at its default 600
+ }
+
+ SPIDER_AUTH_PROVIDERS = {
+ "cern_sso": {
+ "class": "src.data_manager.collectors.scrapers.auth.cern_sso.CERNSSOProvider",
+ "kwargs": {"headless": True},
+ },
+ "indico": {
+ "class": "src.data_manager.collectors.scrapers.auth.indico_bearer.IndicoBearerAuthProvider",
+ "kwargs": {},
+ },
+ }
+
+ AUTH_FAILURE_CODES = [401, 403] # optional; this is the default
+
+Spider contract
+---------------
+A spider opts into auth by declaring:
+
+ auth_provider_name = "cern_sso" # matches a key in SPIDER_AUTH_PROVIDERS
+
+Spiders without this attribute are public and completely bypass this middleware.
+"""
+from __future__ import annotations
+
+import importlib
+from typing import Dict, Optional, TYPE_CHECKING
+
+from scrapy import signals
+from scrapy.exceptions import IgnoreRequest
+from scrapy.http import Request, Response
+from twisted.internet.threads import deferToThread
+
+from src.utils.logging import get_logger
+from src.data_manager.collectors.scrapers.auth.base import AuthProvider, Credentials
+
+if TYPE_CHECKING:
+ from scrapy import Spider
+ from scrapy.crawler import Crawler
+
+logger = get_logger(__name__)
+
+# Meta key that marks a request as a post-refresh retry.
+# Prevents infinite refresh loops: if a retried request also fails auth,
+# the middleware closes the spider instead of refreshing again.
+# (Lives in request.meta, so it survives request.copy()/replace().)
+_AUTH_RETRY_META_KEY = "_auth_retry"
+
+
+class AuthDownloaderMiddleware:
+    """Injects auth credentials and handles mid-crawl session expiry.
+
+    Auth-provider-agnostic: resolves which provider to use from
+    ``spider.auth_provider_name`` + ``settings.SPIDER_AUTH_PROVIDERS``.
+    """
+
+    def __init__(
+        self,
+        auth_providers_config: Dict,
+        auth_failure_codes: list,
+    ) -> None:
+        """
+        Args:
+            auth_providers_config: mapping of provider name to an entry with
+                "class" (dotted path) and optional "kwargs" — the
+                SPIDER_AUTH_PROVIDERS setting.
+            auth_failure_codes: HTTP status codes treated as auth failures.
+        """
+        self._config = auth_providers_config
+        self._failure_codes = set(auth_failure_codes)
+        # Keyed by provider name. Populated lazily on first use.
+        self._providers: Dict[str, AuthProvider] = {}
+        self._credentials: Dict[str, Optional[Credentials]] = {}
+
+    # ------------------------------------------------------------------
+    # Scrapy classmethod + signal wiring
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def from_crawler(cls, crawler: "Crawler") -> "AuthDownloaderMiddleware":
+        """Build the middleware from crawler settings and hook spider_closed."""
+        mw = cls(
+            auth_providers_config=crawler.settings.getdict(
+                "SPIDER_AUTH_PROVIDERS", {}
+            ),
+            auth_failure_codes=crawler.settings.getlist(
+                "AUTH_FAILURE_CODES", [401, 403]
+            ),
+        )
+        crawler.signals.connect(mw._on_spider_closed, signal=signals.spider_closed)
+        return mw
+
+    def _on_spider_closed(self, spider: "Spider", reason: str) -> None:
+        """Close every instantiated provider (``reason`` comes with the signal)."""
+        for name, provider in self._providers.items():
+            try:
+                provider.close()
+                logger.debug("AuthMiddleware: closed provider %r", name)
+            except Exception as exc:
+                logger.warning(
+                    "AuthMiddleware: error closing provider %r: %s", name, exc
+                )
+
+    # ------------------------------------------------------------------
+    # process_request — inject credentials before the request is sent
+    # ------------------------------------------------------------------
+
+    def process_request(self, request: Request, spider: "Spider") -> None:
+        """Inject credentials. No-op for spiders without auth_provider_name.
+
+        NOTE(review): despite the ``-> None`` annotation, the cold-start path
+        returns a Deferred (from deferToThread); Scrapy holds the request
+        until it fires. Confirm the target Scrapy version supports Deferred
+        returns from process_request.
+        """
+        provider_name = getattr(spider, "auth_provider_name", None)
+        if not provider_name:
+            return
+
+        # Cache hit — no thread needed, inject inline
+        cached = self._credentials.get(provider_name)
+        if cached is not None and cached.is_valid():
+            _inject(request, cached)
+            return None
+
+        # Cold start or stale — acquire blocks (Playwright), run in thread pool
+        return deferToThread(self._blocking_acquire_and_inject, request, provider_name, spider)
+
+
+    def _blocking_acquire_and_inject(self, request: Request, provider_name: str, spider: "Spider") -> None:
+        """Runs in a thread — safe for sync Playwright."""
+        creds = self._get_valid_credentials(provider_name, request.url, spider)
+        if creds is None:
+            logger.error(
+                "AuthMiddleware: could not acquire credentials for %r — "
+                "closing spider.", provider_name
+            )
+            self._close_spider(spider, "auth_acquisition_failed")
+            # Raising errbacks the Deferred; Scrapy treats IgnoreRequest as
+            # "drop this request" rather than an error.
+            raise IgnoreRequest("Auth acquisition failed -- no credentials found")
+
+        _inject(request, creds)
+
+    # ------------------------------------------------------------------
+    # process_response — detect auth failure, refresh once, then close
+    # ------------------------------------------------------------------
+
+    def process_response(
+        self, request: Request, response: Response, spider: "Spider"
+    ) -> Response | Request:
+        """Detect 401/403 and SSO login-redirect poison pill."""
+        provider_name = getattr(spider, "auth_provider_name", None)
+        if not provider_name:
+            return response
+
+        provider = self._resolve_provider(provider_name)
+        # NOTE(review): if the provider cannot be resolved (misconfiguration),
+        # failure detection is skipped and the response passes through
+        # unchecked — _resolve_provider has already logged the config error.
+        failure_reason = self._detect_auth_failure(response, provider) if provider else None
+        if failure_reason is None:
+            return response  # healthy response — pass through
+
+        if request.meta.get(_AUTH_RETRY_META_KEY):
+            # Already refreshed once. A second failure means the session is
+            # broken beyond repair; do not retry again.
+            logger.error(
+                "AuthMiddleware: auth failure persists after refresh "
+                "(%s, url=%s). Closing spider.",
+                failure_reason,
+                request.url,
+            )
+            self._close_spider(spider, "auth_expired")
+            return response
+
+        logger.warning(
+            "AuthMiddleware: %s detected for %s — refreshing credentials.",
+            failure_reason,
+            request.url,
+        )
+
+        # Invalidate the cached credentials so _get_valid_credentials knows
+        # they're stale before the next process_request call.
+        cached = self._credentials.get(provider_name)
+        if cached:
+            cached.invalidate()
+
+        # refresh also runs Playwright — thread pool
+        return deferToThread(self._blocking_refresh_and_retry, request, provider_name, spider, failure_reason)
+
+    def _blocking_refresh_and_retry(self, request: Request, provider_name: str, spider: "Spider", failure_reason: str) -> Request:
+        """Runs in a thread — safe for sync Playwright."""
+        fresh = self._do_refresh(provider_name, request.url, spider)
+        if fresh is None:
+            self._close_spider(spider, "auth_expired")
+            raise IgnoreRequest("auth refresh failed")
+        retry = request.copy()
+        retry.meta[_AUTH_RETRY_META_KEY] = True
+        # dont_filter: the dupefilter has already seen this URL once.
+        retry = retry.replace(dont_filter=True)
+        _inject(retry, fresh)
+        return retry
+
+    # ------------------------------------------------------------------
+    # process_exception — log transport errors; let RetryMiddleware handle
+    # ------------------------------------------------------------------
+
+    def process_exception(
+        self, request: Request, exception: Exception, spider: "Spider"
+    ) -> None:
+        """Log transport errors for authenticated spiders; never consume them."""
+        provider_name = getattr(spider, "auth_provider_name", None)
+        if provider_name:
+            logger.warning(
+                "AuthMiddleware: transport error [provider=%r] %s — %s",
+                provider_name,
+                request.url,
+                exception,
+            )
+        # Return None → other middlewares (RetryMiddleware) handle it.
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _detect_auth_failure(self, response, provider: AuthProvider):
+        """Return a failure label or None if the response looks healthy.
+
+        Checks two failure modes:
+        1. Explicit HTTP auth codes (401, 403).
+        2. SSO poison-pill: a 200 OK whose final URL is a login page.
+           CERN SSO sometimes redirects expired sessions to /login and returns
+           a 200 with the login form HTML. This is invisible to RetryMiddleware
+           because the status code is 200 — only URL inspection reveals the trap.
+        """
+        if response.status in self._failure_codes:
+            return f"HTTP {response.status}"
+        if provider.is_session_expired(response):
+            return "session-expired (provider-detected)"
+        return None
+
+
+    def _get_valid_credentials(
+        self, provider_name: str, url: str, spider: "Spider"
+    ) -> Optional[Credentials]:
+        """Return cached credentials if still valid, or acquire fresh ones."""
+        cached = self._credentials.get(provider_name)
+        if cached is not None and cached.is_valid():
+            return cached
+
+        # Cache miss or explicitly invalidated / TTL expired — acquire fresh.
+        logger.info(
+            "AuthMiddleware: acquiring credentials via %r for %s",
+            provider_name, url,
+        )
+        provider = self._resolve_provider(provider_name)
+        if provider is None:
+            return None
+
+        fresh = provider.acquire(url)
+        self._credentials[provider_name] = fresh
+        return fresh
+
+    def _do_refresh(
+        self, provider_name: str, url: str, spider: "Spider"
+    ) -> Optional[Credentials]:
+        """Delegate refresh to the provider and update the cache."""
+        provider = self._resolve_provider(provider_name)
+        if provider is None:
+            return None
+        fresh = provider.refresh(url)
+        self._credentials[provider_name] = fresh
+        return fresh
+
+    def _resolve_provider(self, name: str) -> Optional[AuthProvider]:
+        """Return a cached provider, instantiating it on first call."""
+        if name in self._providers:
+            return self._providers[name]
+
+        entry = self._config.get(name)
+        if not entry:
+            logger.error(
+                "AuthMiddleware: no SPIDER_AUTH_PROVIDERS entry for %r. "
+                "Check settings.py.", name
+            )
+            return None
+
+        class_path: str = entry["class"]
+        kwargs: dict = entry.get("kwargs", {})
+        try:
+            module_path, class_name = class_path.rsplit(".", 1)
+            module = importlib.import_module(module_path)
+            provider_cls = getattr(module, class_name)
+            provider: AuthProvider = provider_cls(**kwargs)
+        except Exception as exc:
+            logger.error(
+                "AuthMiddleware: could not instantiate %r: %s",
+                class_path, exc, exc_info=True,
+            )
+            return None
+
+        self._providers[name] = provider
+        return provider
+
+    @staticmethod
+    def _close_spider(spider: "Spider", reason: str) -> None:
+        """Ask the engine to stop the spider; never raise from cleanup."""
+        logger.error("AuthMiddleware: closing spider (reason=%r)", reason)
+        try:
+            spider.crawler.engine.close_spider(spider, reason)
+        except Exception as exc:
+            logger.error("AuthMiddleware: engine.close_spider failed: %s", exc)
+
+
+# ---------------------------------------------------------------------------
+# Standalone helper — lives outside the class so it's testable without
+# constructing the full middleware.
+# ---------------------------------------------------------------------------
+
+def _inject(request: Request, credentials: Credentials) -> None:
+ """Stamp cookies and/or auth headers onto a Scrapy Request in-place.
+
+ Scrapy's Request.cookies accepts a list[dict] (same format Playwright's
+ context.cookies() returns) or a plain dict. We always normalise to
+ list[dict] and merge rather than replace, so existing cookies (e.g. from
+ a previous inject or a spider-level cookies= argument) are preserved.
+
+ Headers are set directly on request.headers which is mutable.
+
+ Note: Request.cookies is read-only after construction; we use
+ request.replace(cookies=...) to produce a new Request object, then
+ update the reference via the caller. But since Scrapy passes Request
+ objects by reference and the middleware hooks return None (pass-through)
+ or a new Request, we instead mutate headers (mutable) and re-build the
+ cookie jar using the internal _cookies attribute that Scrapy exposes.
+ This is the idiomatic approach used by Scrapy's own cookie middleware.
+ """
+ if credentials.cookies:
+ cookie_header = "; ".join(
+ f"{c['name']}={c['value']}" for c in credentials.cookies
+ )
+ request.headers["Cookie"] = cookie_header
+ request.meta["dont_merge_cookies"] = True
+ # # Merge new cookies over existing ones (last write wins per name).
+ # existing: list = list(request.cookies) if isinstance(request.cookies, list) else [
+ # {"name": k, "value": v} for k, v in (request.cookies or {}).items()
+ # ]
+ # merged: Dict[str, dict] = {c["name"]: c for c in existing}
+ # for cookie in credentials.cookies:
+ # merged[cookie["name"]] = cookie
+ # Replace is safe here — process_request returns None so Scrapy uses
+ # the same object; we mutate via internal attribute.
+ # request._cookies = list(merged.values())
+
+ if credentials.headers:
+ for key, value in credentials.headers.items():
+ request.headers[key] = value
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/parsers/link.py b/src/data_manager/collectors/scrapers/parsers/link.py
new file mode 100644
index 000000000..a4344b110
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/parsers/link.py
@@ -0,0 +1,73 @@
+from typing import Iterator, List
+from urllib.parse import urlparse
+
+from scrapy.http import Response, TextResponse
+
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.utils import get_content_type
+# Tried in order — first non-empty match wins.
+# Covers: HTML5 semantic, ARIA landmark, common CMS patterns, final fallback.
+_CONTENT_SELECTORS = [
+    "main",               # HTML5 semantic main region
+    "article",            # HTML5 standalone article
+    '[role="main"]',      # ARIA landmark
+    "#content",
+    "#main",
+    "#main-content",
+    ".main-content",  # MIT.edu Drupal wrapper
+    ".region-content",  # Drupal generic region
+    ".content",
+    ".post-content",      # common blog/CMS wrappers
+    ".entry-content",
+    "body",               # final fallback — whole document body
+]
+
+def _first_outer_html(response: Response, selectors: List[str]) -> str:
+ for selector in selectors:
+ nodes = response.css(selector)
+ if not nodes:
+ continue
+ html = nodes[0].get()
+ if html and html.strip():
+ return html.strip()
+ return ""
+
+def parse_link_page(response: Response) -> Iterator[WebPageItem]:
+ """
+ Generic page parser — works for any HTML page with no site-specific selectors.
+ Strategy:
+ - PDFs: return raw bytes, suffix="pdf".
+ - HTML: extract visible text from the first matching content container,
+ falling back through _CONTENT_SELECTORS to .
+ Full raw HTML is never stored — only visible text reaches the item.
+ Suitable as the default parse_item for LinkSpider subclasses that have
+ no meaningful site-specific structure to exploit.
+ """
+ ct = get_content_type(response)
+ # ── PDF ──────────────────────────────────────────────────────────────────
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield WebPageItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ source_type="web",
+ title=urlparse(response.url).path.split("/")[-1].replace(".pdf", "").strip(),
+ content_type=ct,
+ )
+ return
+ # ── HTML ─────────────────────────────────────────────────────────────────
+ title = (
+ response.css("h1::text").get()
+ or response.css("title::text").get()
+ or ""
+ ).strip()
+ body_text = _first_outer_html(response, _CONTENT_SELECTORS)
+ encoding = response.encoding if isinstance(response, TextResponse) else "utf-8"
+ if not body_text:
+ return # empty page — don't yield a blank item
+ yield WebPageItem(
+ url=response.url,
+ content=body_text,
+ suffix="html",
+ source_type="web",
+ title=title,
+ content_type=ct,
+ encoding=encoding,
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/parsers/twiki.py b/src/data_manager/collectors/scrapers/parsers/twiki.py
new file mode 100644
index 000000000..fa6b6745c
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/parsers/twiki.py
@@ -0,0 +1,85 @@
+"""
+TWiki / PatternSkin parser.
+
+1. **PDF** — same rule as ``parse_link_page``: raw ``response.body``, ``suffix="pdf"``.
+2. **HTML** — **outer HTML** of the main column (DOM subtree), not ``*::text``.
+
+Selectors are tried in order; first non-empty serialized node wins, then ``body``.
+"""
+from __future__ import annotations
+
+from typing import Iterator
+
+from scrapy.http import Response, TextResponse
+
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.parsers.link import _first_outer_html
+from src.data_manager.collectors.scrapers.utils import get_content_type
+from src.utils.logging import get_logger
+from urllib.parse import urlparse
+
+logger = get_logger(__name__)
+
+_TWIKI_DOM_SELECTORS: List[str] = [
+ "body.patternViewPage #patternMainContents",
+ "#patternMainContents",
+ "body.patternViewPage #patternMain",
+ "#patternMain",
+ "#twikiMainContents",
+ ".patternViewBody",
+ ".twikiTopicText",
+ ".patternTopic",
+ ".patternContent",
+ ".patternMain",
+ "body",
+]
+
+
+def _twiki_title(response: TextResponse) -> str:
+ raw = (
+ response.css("#topic-title::text").get()
+ or response.css(".patternTitle::text").get()
+ or response.css("title::text").get()
+ or ""
+ )
+ if not isinstance(raw, str):
+ return ""
+ # CERN TWiki example: CRAB3ConfigurationFile < CMSPublic < TWiki
+ return raw.split("<")[0].strip()
+
+
+def parse_twiki_page(response: Response) -> Iterator[WebPageItem]:
+ ct = get_content_type(response)
+
+ # ── PDF (aligned with parse_link_page) ─────────────────────────────────
+ if response.url.lower().endswith(".pdf") or "application/pdf" in ct:
+ yield WebPageItem(
+ url=response.url,
+ content=response.body,
+ suffix="pdf",
+ source_type="web",
+ title=urlparse(response.url).path.split("/")[-1].replace(".pdf", "").strip(),
+ content_type=ct,
+ )
+ return
+
+ # ── HTML DOM ────────────────────────────────────────────────────────────
+ if not isinstance(response, TextResponse):
+ logger.debug("Skipping non-text response (no css): %s", response.url)
+ return
+
+ title = _twiki_title(response)
+ body_html = _first_outer_html(response, _TWIKI_DOM_SELECTORS)
+ if not body_html:
+ logger.debug("No main-column HTML for Twiki page: %s", response.url)
+ return
+
+ yield WebPageItem(
+ url=response.url,
+ title=title,
+ content=body_html,
+ suffix="html",
+ source_type="web",
+ content_type=ct,
+ encoding=response.encoding or "utf-8",
+ )
diff --git a/src/data_manager/collectors/scrapers/pipelines/__init__.py b/src/data_manager/collectors/scrapers/pipelines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/pipelines/anonymization.py b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
new file mode 100644
index 000000000..592b74e80
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines/anonymization.py
@@ -0,0 +1,53 @@
+from typing import TYPE_CHECKING
+
+from scrapy import Spider
+from scrapy.exceptions import NotConfigured
+
+from src.data_manager.collectors.scrapers.items import BasePageItem
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+class AnonymizationPipeline:
+ """Runs at priority 250, before PersistencePipeline (300)."""
+
+ _DEFAULT_ANONYMIZER_CONFIG = {
+ "utils": {
+ "anonymizer": {
+ "nlp_model": "en_core_web_sm",
+ "excluded_words": ["John", "Jane", "Doe"],
+ "greeting_patterns": [
+ r"^(hi|hello|hey|greetings|dear)\b",
+ r"^\w+,\s*",
+ ],
+ "signoff_patterns": [
+ r"\b(regards|sincerely|best regards|cheers|thank you)\b",
+ r"^\s*[-~]+\s*$",
+ ],
+ "email_pattern": r"[\w\.-]+@[\w\.-]+\.\w+",
+ "username_pattern": r"\[~[^\]]+\]",
+ }
+ }
+ }
+
+ def __init__(self, anonymizer: Anonymizer) -> None:
+ self._anonymizer = anonymizer
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ enabled = crawler.settings.getbool("ANONYMIZE_DATA", True)
+ anonymizer = crawler.settings.get("ANONYMIZER_SERVICE")
+ if not enabled:
+ raise NotConfigured("Anonymization is disabled")
+ if anonymizer is None:
+ # when we use scrapy cmd, we don't have the anonymizer service provided
+ dm_config = cls._DEFAULT_ANONYMIZER_CONFIG
+ return cls(anonymizer=Anonymizer(dm_config))
+ return cls(anonymizer=anonymizer)
+
+ def process_item(self, item: BasePageItem, spider: Spider) -> BasePageItem:
+ if isinstance(item.get("content"), str):
+ item["content"] = self._anonymizer.anonymize_markup(item["content"])
+ if isinstance(item.get("title"), str):
+ item["title"] = self._anonymizer.anonymize(item["title"])
+ return item
diff --git a/src/data_manager/collectors/scrapers/pipelines/markitdown.py b/src/data_manager/collectors/scrapers/pipelines/markitdown.py
new file mode 100644
index 000000000..f5c09237b
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines/markitdown.py
@@ -0,0 +1,43 @@
+from scrapy import Spider
+from src.utils.logging import get_logger
+from src.data_manager.collectors.utils.markitdown_convertor import MarkitdownConvertor
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.data_manager.collectors.scrapers.pipelines.anonymization import AnonymizationPipeline
+from src.data_manager.collectors.scrapers.items import BasePageItem
+from scrapy.exceptions import NotConfigured
+
+logger = get_logger(__name__)
+
+class MarkitdownPipeline:
+ """Runs at priority 250, before PersistencePipeline (300)."""
+
+ def __init__(self, markitdown: MarkitdownConvertor, anonymizer: Anonymizer, anonymize_data: bool):
+ self._markitdown = markitdown
+ self._anonymizer = anonymizer
+ self._anonymize_data = anonymize_data
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ enabled = crawler.settings.getbool("MARKITDOWN_ENABLED", True)
+ markitdown_convertor = crawler.settings.get("MARKITDOWN_SERVICE")
+ anonymizer = crawler.settings.get("ANONYMIZER_SERVICE")
+ anonymize_data = crawler.settings.getbool("ANONYMIZE_DATA", True)
+ if not enabled:
+ raise NotConfigured("Markitdown is disabled")
+ if markitdown_convertor is None:
+ # when we use scrapy cmd, we don't have the markitdown service provided
+ markitdown_convertor = MarkitdownConvertor()
+ if anonymizer is None:
+ # when we use scrapy cmd, we don't have the anonymizer service provided
+ anonymizer = AnonymizationPipeline.from_crawler(crawler)._anonymizer
+ return cls(markitdown=markitdown_convertor, anonymizer=anonymizer, anonymize_data=anonymize_data)
+
+ def process_item(self, item: BasePageItem, spider: Spider) -> BasePageItem:
+ if isinstance(item.get("content"), str):
+ logger.info(f"Converting content to markdown: {item['content']}")
+ item["content"] = self._markitdown.convert(item["content"], file_extension=item["suffix"])
+ if self._anonymize_data:
+ logger.info(f"Anonymizing content: {item['content']}")
+ item["content"] = self._anonymizer.anonymize(item["content"])
+ logger.info(f"Markitdown result ({'anonymized' if self._anonymize_data else 'not second pass anonymized'})): {item['content']}")
+ return item
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/pipelines/persistence.py b/src/data_manager/collectors/scrapers/pipelines/persistence.py
new file mode 100644
index 000000000..94ab0cf14
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/pipelines/persistence.py
@@ -0,0 +1,143 @@
+"""
+Persistence pipeline: converts Scrapy Items → ScrapedResource → PersistenceService.
+
+Design notes
+------------
+* Follows Scrapy's canonical ``from_crawler`` injection pattern.
+ The ``PersistenceService`` instance and output directory are set on
+ ``crawler.settings`` *programmatically* by ``ScraperManager`` before the
+ crawl starts — they are live Python objects, not serialised config values,
+ so they must never appear in settings.py or YAML.
+
+* SRP boundary: this pipeline does *two* things (adapt + persist). That is
+ intentional and acceptable because the two operations are trivially coupled
+ here (no branching logic in either). If adapter logic grows, extract it to
+ ``adapters/resource_adapter.py`` and import here.
+
+* The pipeline never raises — on error it logs, increments an error counter,
+  and skips persistence for the affected item (the item is still returned to
+  downstream pipelines), so a single bad page does not kill the crawl
+  (OR-5 / FR-7b).
+
+Settings keys consumed
+----------------------
+PERSISTENCE_SERVICE : PersistenceService instance (required)
+PERSISTENCE_OUTPUT_DIR: pathlib.Path — where files are written (required)
+"""
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from scrapy import Spider
+from scrapy.exceptions import NotConfigured
+
+from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
+
+if TYPE_CHECKING:
+ from scrapy import Crawler
+ from src.data_manager.collectors.persistence import PersistenceService
+
+logger = logging.getLogger(__name__)
+
+# Crawler-settings keys consumed by PersistencePipeline (values are injected
+# programmatically by ScraperManager — see module docstring).
+SETTING_SERVICE = "PERSISTENCE_SERVICE"
+SETTING_OUTPUT_DIR = "PERSISTENCE_OUTPUT_DIR"
+
+class PersistencePipeline:
+ """
+ Scrapy item pipeline that persists scraped items via ``PersistenceService``.
+
+ Activation (in ScraperManager, before CrawlerProcess/Runner starts)::
+
+ crawler.settings.set(
+ "PERSISTENCE_SERVICE", persistence_service_instance, priority="spider"
+ )
+ crawler.settings.set(
+ "PERSISTENCE_OUTPUT_DIR", Path("/root/data/websites"), priority="spider"
+ )
+ crawler.settings.set(
+ "ITEM_PIPELINES",
+ {"src.data_manager.collectors.scrapers.pipelines.PersistencePipeline": 300},
+ priority="spider",
+ )
+ """
+
+ def __init__(self, persistence: "PersistenceService", output_dir: Path) -> None:
+ self._persistence = persistence
+ self._output_dir = output_dir
+ self._success_count = 0
+ self._error_count = 0
+
+ # ------------------------------------------------------------------
+ # Scrapy lifecycle
+ # ------------------------------------------------------------------
+
+ @classmethod
+ def from_crawler(cls, crawler: "Crawler") -> "PersistencePipeline":
+ """Canonical Scrapy injection point — pulls service from settings."""
+ persistence = crawler.settings.get(SETTING_SERVICE)
+ output_dir = crawler.settings.get(SETTING_OUTPUT_DIR)
+
+ if persistence is None:
+ raise NotConfigured(
+ f"PersistencePipeline requires '{SETTING_SERVICE}' in crawler settings. "
+ "Set it programmatically in ScraperManager before starting the crawl."
+ )
+ if output_dir is None:
+ raise NotConfigured(
+ f"PersistencePipeline requires '{SETTING_OUTPUT_DIR}' in crawler settings."
+ )
+
+ instance = cls(persistence=persistence, output_dir=Path(output_dir))
+ return instance
+
+ def open_spider(self, spider: Spider) -> None:
+ self._output_dir.mkdir(parents=True, exist_ok=True)
+ logger.info(
+ "PersistencePipeline opened | output_dir=%s", self._output_dir
+ )
+
+ def close_spider(self, spider: Spider) -> None:
+ # Summary logged via spider_closed signal too, but belt-and-suspenders here.
+ logger.info(
+ "PersistencePipeline | spider=%s persisted=%d errors=%d",
+ spider.name,
+ self._success_count,
+ self._error_count,
+ )
+
+ def process_item(self, item, spider: Spider):
+ """
+ Convert item → ScrapedResource → persist.
+
+ Never raises; errors are logged and the item is dropped.
+ Returning the item allows other downstream pipelines to receive it.
+ """
+ try:
+ resource = to_scraped_resource(item)
+ resource.metadata["spider_name"] = spider.name
+ except Exception as exc:
+ self._error_count += 1
+ logger.warning(
+ "Adapter failed for item from %s: %s | item=%r",
+ spider.name,
+ exc,
+ dict(item),
+ exc_info=False, # keep log concise; set True for debug
+ )
+ return item # drop from persistence but don't crash
+
+ try:
+ file_path = self._persistence.persist_resource(resource, self._output_dir)
+ self._success_count += 1
+ logger.debug(
+ "Persisted %s → %s", resource.get_hash(), file_path
+ )
+ except Exception as exc:
+ self._error_count += 1
+ logger.error(
+ "PersistenceService.persist_resource failed for %s: %s",
+ getattr(resource, "url", "unknown"),
+ exc,
+ exc_info=True,
+ )
+
+ return item
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/scraped_resource.py b/src/data_manager/collectors/scrapers/scraped_resource.py
index 357eaaf41..080e4cbb7 100644
--- a/src/data_manager/collectors/scrapers/scraped_resource.py
+++ b/src/data_manager/collectors/scrapers/scraped_resource.py
@@ -74,14 +74,3 @@ def _safe_relative_path(self) -> Optional[Path]:
if rel_path.is_absolute() or ".." in rel_path.parts:
return None
return rel_path
-
-@dataclass
-class BrowserIntermediaryResult:
- """
- this class is meant to provide a layer of abstraction for browser based scrapers (i.e selenium)
- it will format everything into a single class so that more complicated scraping results which may hit
- multiple tabs or pages at once can be handled in a uniform way by the LinkScraper class.
- """
-
- artifacts: List[Dict] # list of scraper results for each page produced by a seelnium navigation
- links: List[str] # links reached
diff --git a/src/data_manager/collectors/scrapers/scraper.py b/src/data_manager/collectors/scrapers/scraper.py
deleted file mode 100644
index 7fe1ef0e3..000000000
--- a/src/data_manager/collectors/scrapers/scraper.py
+++ /dev/null
@@ -1,314 +0,0 @@
-import requests
-import re
-
-from typing import Dict, Iterator, List, Optional
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urljoin, urldefrag
-
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-class LinkScraper:
- """
- Single scraper for all our link needs that handles Selenium and requests.
- This class explicitly handles requests, but if selenium scraping is enabled for a link
- everything is passed through to the driver including how the page data is collected and
- how the next level of links are found. This class DOESNT own the selenium driver, that is
- owned by the scraper manager class.
- """
-
- def __init__(self, verify_urls: bool = True, enable_warnings: bool = True) -> None:
- self.verify_urls = verify_urls
- self.enable_warnings = enable_warnings
- # seen_urls tracks anything queued/visited; visited_urls tracks pages actually crawled.
- self.visited_urls = set()
- self.seen_urls = set()
-
- def _is_image_url(self, url: str) -> bool:
- """Check if URL points to an image file."""
- image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.webp')
- parsed_url = urlparse(url)
- path = parsed_url.path.lower()
- return any(path.endswith(ext) for ext in image_extensions)
-
- def reap(self, response, current_url: str, selenium_scrape: bool = False, authenticator = None):
- """
- probably the most complicated method here and most volatile in terms of maybe later needing a rewrite
-
- this method is here to deal with any result that it gets back. for a selenium resource it expects results as a
- BrowserIntermediaryResult, otherwhise it will handle it as a normal http response. it handles getting the next set
- of links and updating the page data gathered
-
- Args:
- response (BrowserIntermediaryResult | requests.response): whatever has been collected for the current_url by the scraper
- selenium_scrape (bool): whether or not selenium was used to scrape this content
- authenticator (SSOAuthenticator | None): client being used to crawl websites or just for auth
-
- Return (tuple[list[str], list[ScrapedResource]]): next links to crawl and resources collected
- """
-
- # mark as visited
- self._mark_visited(current_url)
-
- source_type = "web" if (authenticator is None) else "sso"
-
- resources = []
-
- if selenium_scrape: # deals with a selenium response (should work for both non authenitcated and authenticated sites in principle)
- assert(authenticator is not None) ## this shouldnt be tripped
-
- # For selenium scraping, we expect a simple dict from extract_page_data
- # containing url, title, content, suffix
- content = response.get("content", "")
- title = response.get("title", "")
- suffix = response.get("suffix", "html")
-
- resource = ScrapedResource(
- url=current_url,
- content=content,
- suffix=suffix,
- source_type=source_type,
- metadata={
- "title": title,
- "content_type": "rendered_html",
- "renderer": "selenium",
- },
- )
- res = authenticator.get_links_with_same_hostname(current_url)
- resources.append(resource)
-
- else: # deals with http response
- content_type = response.headers.get("Content-type")
-
- if current_url.lower().endswith(".pdf"):
- resource = ScrapedResource(
- url=current_url,
- content=response.content,
- suffix="pdf",
- source_type=source_type,
- metadata={"content_type": content_type},
- )
- else:
- resource = ScrapedResource(
- url=current_url,
- content=response.text,
- suffix="html",
- source_type=source_type,
- metadata={
- "content_type": content_type,
- "encoding": response.encoding,
- },
- )
- res = self.get_links_with_same_hostname(current_url, resource)
- resources.append(resource)
-
- return res, resources # either collected via http or via authenticators method
-
-
- def crawl(
- self,
- start_url: str,
- browserclient = None,
- max_depth: int = 1,
- selenium_scrape: bool = False,
- max_pages: Optional[int] = None,
- ):
- """
- crawl pages from a given starting url up to a given depth either using basic http or a provided browser client
-
- Args :
- start_url (str): Url to start crawling from
- authenticator (SSOAuthenticator): class used for handling authenticatoin for web resources
- max_depth (int): max depth of links to descend from the start url
- selenium_scrape (bool): tracks whether or not the page should be scraped through selenium or not
- max_pages (int | None): cap on total pages to visit before stopping
-
- Returns: List[ScrapedResource]
-
- """
- # Consume the iterator so page_data is populated for callers of crawl().
- for _ in self.crawl_iter(
- start_url,
- browserclient=browserclient,
- max_depth=max_depth,
- selenium_scrape=selenium_scrape,
- max_pages=max_pages,
- collect_page_data=True,
- ):
- pass
- return list(self.page_data)
-
- def crawl_iter(
- self,
- start_url: str,
- browserclient = None,
- max_depth: int = 1,
- selenium_scrape: bool = False,
- max_pages: Optional[int] = None,
- collect_page_data: bool = False,
- ) -> Iterator[ScrapedResource]:
- """
- crawl pages from a given starting url up to a given depth either using basic http or a provided browser client
-
- Args :
- start_url (str): Url to start crawling from
- authenticator (SSOAuthenticator): class used for handling authenticatoin for web resources
- max_depth (int): max depth of links to descend from the start url
- selenium_scrape (bool): tracks whether or not the page should be scraped through selenium or not
- max_pages (int | None): cap on total pages to visit before stopping
- collect_page_data (bool): whether to store resources on the scraper instance
-
- Returns: Iterator[ScrapedResource]
-
- """
-
- if not self.enable_warnings:
- import urllib3
- urllib3.disable_warnings()
-
- depth = 0
- self.visited_urls = set()
- self.seen_urls = set()
- self.page_data = []
- normalized_start_url = self._normalize_url(start_url)
- if not normalized_start_url:
- logger.error(f"Failed to crawl: {start_url}, could not normalize URL")
- return
- to_visit = [normalized_start_url]
- self.seen_urls.add(normalized_start_url)
- level_links = []
- pages_visited = 0
-
- base_hostname = urlparse(normalized_start_url).netloc
- logger.info(f"Base hostname for crawling: {base_hostname}")
-
- # session either stays none or becomes a requests.Session object if not selenium scraping
- session = None
-
- if selenium_scrape: # scrape page with pure selenium
- if browserclient is None:
- logger.error(f"Failed to crawl: {start_url}, auth is needed but no browser clilent was passed through")
- return []
- browserclient.authenticate_and_navigate(normalized_start_url)
-
- elif not selenium_scrape and browserclient is not None: # use browser client for auth but scrape with http request
- session = requests.Session()
- cookies = browserclient.authenticate(normalized_start_url)
- if cookies is not None:
- for cookie_args in cookies:
- cookie = requests.cookies.create_cookie(name=cookie_args['name'],
- value=cookie_args['value'],
- domain=cookie_args.get('domain'),
- path=cookie_args.get('path', '/'),
- expires=cookie_args.get('expires'),
- secure=cookie_args.get('secure', False))
- session.cookies.set_cookie(cookie)
-
- else: # pure html no browser client needed
- session = requests.Session()
-
- while to_visit and depth < max_depth:
- if max_pages is not None and pages_visited >= max_pages:
- logger.info(f"Reached max_pages={max_pages}; stopping crawl early.")
- break
- current_url = to_visit.pop(0)
-
- # Skip if we've already visited this URL
- if current_url in self.visited_urls:
- continue
-
- # Skip image files
- if self._is_image_url(current_url):
- logger.debug(f"Skipping image URL: {current_url}")
- self._mark_visited(current_url)
- continue
-
- logger.info(f"Crawling depth {depth + 1}/{max_depth}: {current_url}")
-
- try:
-
- # grab the page content
- if not selenium_scrape:
- assert (session is not None) # REMOVELATER
- response = session.get(current_url, verify = self.verify_urls)
- response.raise_for_status()
- else:
- assert (browserclient is not None) # REMOVELATER
- browserclient.navigate_to(current_url, wait_time = 2)
- response = browserclient.extract_page_data(current_url) # see the BrowserIntermediaryResult class to see what comes back here
-
-
- # Mark as visited and store content
- pages_visited += 1
- new_links, resources = self.reap(response, current_url, selenium_scrape, browserclient)
- for resource in resources:
- if collect_page_data:
- self.page_data.append(resource)
- yield resource
-
- for link in new_links:
- normalized_link = self._normalize_url(link)
- if not normalized_link:
- continue
- if normalized_link in self.seen_urls:
- continue
- logger.info(f"Found new link: {normalized_link} (nv: {pages_visited})")
- self.seen_urls.add(normalized_link)
- level_links.append(normalized_link)
-
- except Exception as e:
- logger.info(f"Error crawling {current_url}: {e}")
- self._mark_visited(current_url) # Mark as visited to avoid retrying
-
- if not to_visit:
- to_visit.extend(level_links)
- level_links = []
- depth += 1
-
- logger.info(f"Crawling complete. Visited {pages_visited} pages.")
- return
-
- def _normalize_url(self, url: str) -> Optional[str]:
- if not url:
- return None
-
- normalized, _ = urldefrag(url)
- parsed = urlparse(normalized)
- if not parsed.scheme:
- return normalized
- return parsed._replace(
- scheme=parsed.scheme.lower(),
- netloc=parsed.netloc.lower(),
- ).geturl()
-
- def _mark_visited(self, url: str) -> None:
- normalized = self._normalize_url(url)
- if not normalized:
- return
- self.visited_urls.add(normalized)
- self.seen_urls.add(normalized)
-
- def get_links_with_same_hostname(self, url: str, page_data: ScrapedResource):
- """Return all links on the page that share the same hostname as `url`. For now does not support PDFs"""
-
- base_url = self._normalize_url(url) or url
- base_hostname = urlparse(base_url).netloc
- links = set()
- a_tags = []
-
- if (page_data.suffix == "html"):
- soup = BeautifulSoup(page_data.content, "html.parser")
- a_tags = soup.find_all("a", href=True)
-
- # how many links found on the first level
- for tag in a_tags:
- full = urljoin(base_url, tag["href"])
- normalized = self._normalize_url(full)
- if not normalized:
- continue
- if urlparse(normalized).netloc == base_hostname:
- links.add(normalized)
- return list(links)
diff --git a/src/data_manager/collectors/scrapers/scraper_manager.py b/src/data_manager/collectors/scrapers/scraper_manager.py
deleted file mode 100644
index 1904f7f11..000000000
--- a/src/data_manager/collectors/scrapers/scraper_manager.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import os
-import importlib
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
-
-from src.data_manager.collectors.persistence import PersistenceService
-from src.data_manager.collectors.scrapers.scraped_resource import \
- ScrapedResource
-from src.data_manager.collectors.scrapers.scraper import LinkScraper
-from src.utils.config_access import get_global_config
-from src.utils.env import read_secret
-from src.utils.logging import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
- from src.data_manager.collectors.scrapers.integrations.git_scraper import \
- GitScraper
-
-
-class ScraperManager:
- """Coordinates scraper integrations and centralises persistence logic."""
-
- def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:
- global_config = get_global_config()
-
- sources_config = (dm_config or {}).get("sources", {}) or {}
- links_config = sources_config.get("links", {}) if isinstance(sources_config, dict) else {}
- selenium_config = links_config.get("selenium_scraper", {}) if isinstance(sources_config, dict) else {}
-
- git_config = sources_config.get("git", {}) if isinstance(sources_config, dict) else {}
- sso_config = sources_config.get("sso", {}) if isinstance(sources_config, dict) else {}
- self.base_depth = links_config.get('base_source_depth', 5)
- logger.debug(f"Using base depth of {self.base_depth} for weblist URLs")
-
- scraper_config = {}
- if isinstance(links_config, dict):
- scraper_config = links_config.get("html_scraper", {}) or {}
- self.config = scraper_config
- raw_max_pages = links_config.get("max_pages")
- self.max_pages = None
- if raw_max_pages not in (None, ""):
- try:
- self.max_pages = int(raw_max_pages)
- except (TypeError, ValueError):
- logger.warning(f"Invalid max_pages value {raw_max_pages}; ignoring.")
-
- self.links_enabled = True
- self.git_enabled = git_config.get("enabled", False) if isinstance(git_config, dict) else True
- self.git_config = git_config if isinstance(git_config, dict) else {}
- self.selenium_config = selenium_config or {}
- self.selenium_enabled = self.selenium_config.get("enabled", False)
- self.scrape_with_selenium = self.selenium_config.get("use_for_scraping", False)
-
- self.sso_enabled = bool(sso_config.get("enabled", False))
-
- self.data_path = Path(global_config["DATA_PATH"])
- self.input_lists = links_config.get("input_lists", [])
- self.git_dir = self.data_path / "git"
-
- self.data_path.mkdir(parents=True, exist_ok=True)
-
- self.web_scraper = LinkScraper(
- verify_urls=self.config.get("verify_urls", False), # Default to False for broader compatibility
- enable_warnings=self.config.get("enable_warnings", False),
- )
- self._git_scraper: Optional["GitScraper"] = None
-
- def collect_all_from_config(
- self, persistence: PersistenceService
- ) -> None:
- """Run the configured scrapers and persist their output."""
- link_urls, git_urls, sso_urls = self._collect_urls_from_lists_by_type(self.input_lists)
-
- if git_urls:
- self.git_enabled = True
- if sso_urls:
- self.sso_enabled = True
- self._ensure_sso_defaults()
-
- self.collect_links(persistence, link_urls=link_urls)
- self.collect_sso(persistence, sso_urls=sso_urls)
- self.collect_git(persistence, git_urls=git_urls)
-
- logger.info("Web scraping was completed successfully")
-
- def collect_links(
- self,
- persistence: PersistenceService,
- link_urls: List[str] = [],
- max_depth: Optional[int] = None,
- ) -> int:
- """Collect only standard link sources. Returns count of resources scraped."""
- if not self.links_enabled:
- logger.info("Links disabled, skipping link scraping")
- return 0
- if not link_urls:
- return 0
- websites_dir = persistence.data_path / "websites"
- if not os.path.exists(websites_dir):
- os.makedirs(websites_dir, exist_ok=True)
- return self._collect_links_from_urls(link_urls, persistence, websites_dir, max_depth=max_depth)
-
- def collect_git(
- self,
- persistence: PersistenceService,
- git_urls: Optional[List[str]] = None,
- ) -> None:
- """Collect only git sources."""
- if not self.git_enabled:
- logger.info("Git disabled, skipping git scraping")
- return
- if not git_urls:
- return
- git_dir = persistence.data_path / "git"
- if not os.path.exists(git_dir):
- os.makedirs(git_dir, exist_ok=True)
- self._collect_git_resources(git_urls, persistence, git_dir)
-
- def collect_sso(
- self,
- persistence: PersistenceService,
- sso_urls: Optional[List[str]] = None,
- ) -> None:
- """Collect only SSO sources."""
- if not self.sso_enabled:
- logger.info("SSO disabled, skipping SSO scraping")
- return
- self._ensure_sso_defaults()
- if not sso_urls:
- return
- sso_dir = persistence.data_path / "sso"
- if not os.path.exists(sso_dir):
- os.makedirs(sso_dir, exist_ok=True)
- self._collect_sso_from_urls(sso_urls, persistence, sso_dir)
-
- def schedule_collect_links(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- """
- Scheduled collection of link sources.
- For now, this behaves the same as a full collection, overriding last_run depending on the persistence layer.
- """
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="web", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "").strip() for m in metadata]
- catalog_urls = [u for u in catalog_urls if u]
- logger.info("Scheduled links collection found %d URL(s) in catalog", len(catalog_urls))
- self.collect_links(persistence, link_urls=catalog_urls)
-
- def schedule_collect_git(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="git", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "") for m in metadata]
- self.collect_git(persistence, git_urls=catalog_urls)
-
- def schedule_collect_sso(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
- metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="sso", metadata_keys=["url"])
- catalog_urls = [m[1].get("url", "") for m in metadata]
- self.collect_sso(persistence, sso_urls=catalog_urls)
-
- def _collect_links_from_urls(
- self,
- urls: List[str],
- persistence: PersistenceService,
- output_dir: Path,
- max_depth: Optional[int] = None,
- ) -> int:
- """Collect links from URLs and return total count of resources scraped."""
- # Initialize authenticator if selenium is enabled
- authenticator = None
- if self.selenium_enabled:
- authenticator_class, kwargs = self._resolve_scraper()
- if authenticator_class is not None:
- authenticator = authenticator_class(**kwargs)
-
- total_count = 0
- try:
- for url in urls:
- # For standard link collection, don't use selenium for scraping
- # (SSO urls are handled separately via collect_sso)
- count = self._handle_standard_url(
- url,
- persistence,
- output_dir,
- max_depth=max_depth if max_depth is not None else self.base_depth,
- client=None,
- use_client_for_scraping=False
- )
- total_count += count
- finally:
- if authenticator is not None:
- authenticator.close() # Close the authenticator properly and free the resources
- return total_count
-
- def _collect_sso_from_urls(
- self,
- urls: List[str],
- persistence: PersistenceService,
- output_dir: Path,
- ) -> None:
- """Collect SSO-protected URLs using selenium for authentication."""
- if not self.selenium_enabled:
- logger.error("SSO scraping requires data_manager.sources.links.selenium_scraper.enabled")
- return
- if not read_secret("SSO_USERNAME") or not read_secret("SSO_PASSWORD"):
- logger.error("SSO scraping requires SSO_USERNAME and SSO_PASSWORD secrets")
- return
- authenticator = None
- if self.selenium_enabled:
- authenticator_class, kwargs = self._resolve_scraper()
- if authenticator_class is not None:
- authenticator = authenticator_class(**kwargs)
-
- if authenticator is None:
- logger.error("SSO collection requires a valid selenium scraper configuration")
- return
-
- try:
- for url in urls:
- # For SSO URLs, use selenium client for authentication
- # scrape_with_selenium determines if we use selenium for scraping too
- self._handle_standard_url(
- url,
- persistence,
- output_dir,
- max_depth=self.base_depth,
- client=authenticator,
- use_client_for_scraping=self.scrape_with_selenium
- )
- finally:
- if authenticator is not None:
- authenticator.close()
-
- def _ensure_sso_defaults(self) -> None:
- if not self.selenium_config:
- self.selenium_config = {}
-
- if not self.selenium_enabled:
- self.selenium_config["enabled"] = True
- self.selenium_enabled = True
-
- if not self.selenium_config.get("selenium_class"):
- self.selenium_config["selenium_class"] = "CERNSSOScraper"
-
- class_map = self.selenium_config.setdefault("selenium_class_map", {})
- if "CERNSSOScraper" not in class_map:
- class_map["CERNSSOScraper"] = {
- "class": "CERNSSOScraper",
- "kwargs": {
- "headless": True,
- "max_depth": 2,
- },
- }
-
- def _collect_urls_from_lists(self, input_lists) -> List[str]:
- """Collect URLs from the configured weblists."""
- # Handle case where input_lists might be None
- urls: List[str] = []
- if not input_lists:
- return urls
- for list_name in input_lists:
- list_path = Path("weblists") / Path(list_name).name
- if not list_path.exists():
- logger.warning(f"Input list {list_path} not found.")
- continue
-
- urls.extend(self._extract_urls_from_file(list_path))
-
- return urls
-
- def _collect_urls_from_lists_by_type(self, input_lists: List[str]) -> tuple[List[str], List[str], List[str]]:
- """All types of URLs are in the same input lists, separate them via prefixes"""
- link_urls: List[str] = []
- git_urls: List[str] = []
- sso_urls: List[str] = []
- for raw_url in self._collect_urls_from_lists(input_lists):
- if raw_url.startswith("git-"):
- git_urls.append(raw_url.split("git-", 1)[1])
- continue
- if raw_url.startswith("sso-"):
- sso_urls.append(raw_url.split("sso-", 1)[1])
- continue
- link_urls.append(raw_url)
- return link_urls, git_urls, sso_urls
- def _resolve_scraper(self):
- class_name = self.selenium_config.get("selenium_class")
- class_map = self.selenium_config.get("selenium_class_map", {})
- selenium_url = self.selenium_config.get("selenium_url",None)
-
- entry = class_map.get(class_name)
-
- if not entry:
- logger.error(f"Selenium class {class_name} is not defined in the configuration")
- return None, {}
-
- scraper_class = entry.get("class")
- if isinstance(scraper_class, str):
- module_name = entry.get(
- "module",
- "src.data_manager.collectors.scrapers.integrations.sso_scraper",
- )
- module = importlib.import_module(module_name)
- scraper_class = getattr(module, scraper_class)
- scraper_kwargs = entry.get("kwargs", {})
- scraper_kwargs["selenium_url"] = selenium_url
- return scraper_class, scraper_kwargs
-
-
- def _handle_standard_url(
- self,
- url: str,
- persistence: PersistenceService,
- output_dir: Path,
- max_depth: int,
- client=None,
- use_client_for_scraping: bool = False,
- ) -> int:
- """Scrape a URL and persist resources. Returns count of resources scraped."""
- count = 0
- try:
- for resource in self.web_scraper.crawl_iter(
- url,
- browserclient=client,
- max_depth=max_depth,
- selenium_scrape=use_client_for_scraping,
- max_pages=self.max_pages,
- ):
- persistence.persist_resource(
- resource, output_dir
- )
- count += 1
- logger.info(f"Scraped {count} resources from {url}")
- except Exception as exc:
- logger.error(f"Failed to scrape {url}: {exc}", exc_info=exc)
- return count
-
- def _extract_urls_from_file(self, path: Path) -> List[str]:
- """Extract URLs from file, ignoring depth specifications for now."""
- urls: List[str] = []
- with path.open("r") as file:
- for line in file:
- stripped = line.strip()
- if not stripped or stripped.startswith("#"):
- continue
- # Extract just the URL part, ignoring depth specification if present
- url_depth = stripped.split(",")
- url = url_depth[0].strip()
- urls.append(url)
- return urls
-
- def _collect_git_resources(
- self,
- git_urls: List[str],
- persistence: PersistenceService,
- git_dir: Path,
- ) -> List[ScrapedResource]:
- git_scraper = self._get_git_scraper()
- resources = git_scraper.collect(git_urls)
- for resource in resources:
- persistence.persist_resource(resource, git_dir)
- return resources
-
- def _get_git_scraper(self) -> "GitScraper":
- if self._git_scraper is None:
- from src.data_manager.collectors.scrapers.integrations.git_scraper import \
- GitScraper
-
- self._git_scraper = GitScraper(manager=self, git_config=self.git_config)
- return self._git_scraper
diff --git a/src/data_manager/collectors/scrapers/settings.py b/src/data_manager/collectors/scrapers/settings.py
new file mode 100644
index 000000000..65aeddc01
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/settings.py
@@ -0,0 +1,103 @@
+BOT_NAME = "archi_scrapers"
+
+SPIDER_MODULES = ["src.data_manager.collectors.scrapers.spiders"]
+
+NEWSPIDER_MODULE = "src.data_manager.collectors.scrapers.spiders"
+
+# Browser-like UA to avoid bot-blocking (e.g. Twiki ConnectionLost issue)
+USER_AGENT = (
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/120.0.0.0 Safari/537.36 "
+ "archi_scrapers/1.0 (+https://github.com/archi-physics/archi)"
+)
+
+# Default RETRY_TIMES is 2. We bump to 3 for transient failures.
+# ConnectionLost is in RETRY_HTTP_CODES by default as a non-HTTP failure;
+# Scrapy retries it automatically via RetryMiddleware.
+RETRY_ENABLED = True
+RETRY_TIMES = 3 # max retries per request (transport + server errors only)
+RETRY_HTTP_CODES = [
+ 500, # Internal Server Error — transient server fault
+ 502, # Bad Gateway — upstream not reachable
+ 503, # Service Unavailable — server overloaded
+ 504, # Gateway Timeout
+ 408, # Request Timeout — network-level timeout
+ # 429 (Too Many Requests) omitted: AutoThrottle should prevent hitting it;
+]
+
+# Conservative floor delay for all sources.
+# AutoThrottle will increase this dynamically but never go below it.
+# Indico's robots.txt mandates Crawl-delay: 10 — Indico spiders must override
+# this to 10 via custom_settings = {"DOWNLOAD_DELAY": 10}.
+DOWNLOAD_DELAY = 2 # seconds
+# Per-request timeout — prevents indefinite hangs
+DOWNLOAD_TIMEOUT = 30 # seconds
+
+# Keep a single concurrent request per domain.
+# AutoThrottle adjusts throughput dynamically; starting at 1 is safe.
+CONCURRENT_REQUESTS = 1
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
+
+# Robots.txt: obey by default.
+# override this per-spider: custom_settings = {"ROBOTSTXT_OBEY": False}
+# Never disable globally — it would affect all spiders.
+ROBOTSTXT_OBEY = True
+
+# AutoThrottle
+# Enabled as a second politeness layer on top of DOWNLOAD_DELAY.
+# AutoThrottle treats DOWNLOAD_DELAY as a minimum — it will never go lower.
+# Target concurrency of 1.0 keeps us single-threaded per domain by default.
+AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_START_DELAY = DOWNLOAD_DELAY # initial delay before AT calibrates
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_MAX_DELAY = 60 # cap: never wait more than 60s
+# Log every AutoThrottle adjustment — useful during development, can be
+# set False in production if log volume is too high.
+AUTOTHROTTLE_DEBUG = False
+
+# ------------------------------------------------------------------ #
+# Depth limiting — safety cap; spiders can narrow via custom_settings.
+# ------------------------------------------------------------------ #
+DEPTH_LIMIT = 2 # hard cap so a misconfigured crawl can't run forever
+
+# ---------------------------------------------------------------------------
+# Safety: fail loudly on spider import errors
+# ---------------------------------------------------------------------------
+SPIDER_LOADER_WARN_ONLY = False
+
+# Maximum error count before the spider is closed automatically.
+# 25 gives enough room to diagnose intermittent failures without letting
+# a completely broken crawl run for hours.
+CLOSESPIDER_ERRORCOUNT = 25
+
+LOG_LEVEL = "INFO"
+
+# The class used to detect and filter duplicate requests
+DUPEFILTER_CLASS = "scrapy.dupefilters.RFPDupeFilter"
+
+# ---------------------------------------------------------------------------
+# Middlewares, Pipelines and Extensions Priorities
+# ---------------------------------------------------------------------------
+DOWNLOADER_MIDDLEWARES = {
+ "src.data_manager.collectors.scrapers.middlewares.auth_downloader.AuthDownloaderMiddleware": 500,
+ "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
+ # RedirectMiddleware stays at its default 600 — no entry needed
+}
+
+SPIDER_AUTH_PROVIDERS = {
+ "cern_sso": {
+ "class": "src.data_manager.collectors.scrapers.auth.cern_sso.CERNSSOProvider",
+ "kwargs": {"headless": True},
+ },
+}
+
+ITEM_PIPELINES = {
+ "src.data_manager.collectors.scrapers.pipelines.anonymization.AnonymizationPipeline": 250,
+ "src.data_manager.collectors.scrapers.pipelines.markitdown.MarkitdownPipeline": 260,
+ "src.data_manager.collectors.scrapers.pipelines.persistence.PersistencePipeline": 300,
+}
+
+EXTENSIONS = {
+ "scrapy.extensions.closespider.CloseSpider": 500,
+}
diff --git a/src/data_manager/collectors/scrapers/spiders/__init__.py b/src/data_manager/collectors/scrapers/spiders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/data_manager/collectors/scrapers/spiders/discourse.py b/src/data_manager/collectors/scrapers/spiders/discourse.py
new file mode 100644
index 000000000..1803128e4
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/discourse.py
@@ -0,0 +1,194 @@
+"""
+Discourse spider — recursive JSON pagination, no link following.
+
+Seed: GET /c/{path}.json → first page of each category
+Recur: GET more_topics_url (from JSON) → next page (until exhausted)
+Fan-out: each topic → GET /t/{slug}/{id}.rss → yield DiscourseTopicPageItem
+"""
+from __future__ import annotations
+
+import re
+import json
+from typing import Any, Iterator, List, Optional
+from urllib.parse import urljoin
+
+from scrapy import Spider
+from scrapy.http import Request, Response, TextResponse
+
+from src.data_manager.collectors.scrapers.items import DiscourseTopicPageItem
+from src.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class DiscourseSpider(Spider):
+ name = "discourse"
+
+ _DEFAULT_BASE_URL = "https://cms-talk.web.cern.ch"
+ _DEFAULT_CATEGORY_PATHS: List[str] = [
+ "/c/offcomp/ais/150",
+ ]
+
+ auth_provider_name = "cern_sso"
+
+ custom_settings = {
+ "ROBOTSTXT_OBEY": False,
+ "DOWNLOAD_DELAY": 10, # default polite delay (seconds)
+ "RETRY_TIMES": 2,
+ "COOKIES_ENABLED": True,
+ "CLOSESPIDER_PAGECOUNT": 500, # safety cap on total responses
+ "CLOSESPIDER_ITEMCOUNT": 0, # 0 = no item-count limit
+ "DEPTH_LIMIT": 0, # 0 = no limit; pagination is not link depth tracking
+ }
+
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ delay = kwargs.get("delay")
+ max_pages = kwargs.get("max_pages")
+ anonymize_data = kwargs.get("anonymize_data")
+ markitdown_enabled = kwargs.get("markitdown")
+ if delay:
+ crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
+ if max_pages:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ if anonymize_data:
+ crawler.settings.set("ANONYMIZE_DATA", anonymize_data, priority="spider")
+ if markitdown_enabled:
+ crawler.settings.set("MARKITDOWN_ENABLED", markitdown_enabled, priority="spider")
+ return super().from_crawler(crawler, *args, **kwargs)
+
+ def __init__(
+ self,
+ base_url: Optional[str] = None,
+ category_paths: Optional[List[str]] = None,
+ keywords: Optional[List[str]] = None,
+ delay: Optional[int] = None,
+ max_pages: Optional[int] = None,
+ *args: Any,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(*args, **kwargs)
+ self.base_url = (base_url or self._DEFAULT_BASE_URL).rstrip("/")
+ self.category_paths = category_paths or self._DEFAULT_CATEGORY_PATHS
+ self.keywords_re: List[re.Pattern] = [
+ re.compile(kw, re.IGNORECASE) for kw in (keywords or [])
+ ]
+
+ # ── Seeds: one request per category (page 0) ────────────────────────
+ async def start(self):
+ for path in self.category_paths:
+ path = path.strip("/")
+ url = f"{self.base_url}/{path}.json"
+ yield Request(
+ url=url,
+ callback=self.parse_category,
+ errback=self.errback,
+ meta={"category_path": path},
+ )
+
+ # ── Category JSON → topic RSS requests + next page ──────────────────
+ def parse_category(self, response: Response) -> Iterator[Request]:
+ """
+ @url https://cms-talk.web.cern.ch/tags/c/offcomp/ais/150.json
+ @returns requests 100
+ @scrapes url title content
+ """
+ try:
+ data = json.loads(response.text)
+ except (json.JSONDecodeError, AttributeError) as exc:
+ logger.error("Failed to parse category JSON %s: %s", response.url, exc)
+ return
+
+ topic_list = data.get("topic_list", {})
+ topics = topic_list.get("topics", []) or []
+ category_path = response.meta.get("category_path", "?")
+ logger.info(
+ "Category %s returned %d topics (%s)",
+ category_path, len(topics), response.url,
+ )
+
+ for topic in topics:
+ slug = topic.get("slug", "")
+ topic_id = topic.get("id")
+ if not slug or not topic_id:
+ continue
+ rss_url = f"{self.base_url}/t/{slug}/{topic_id}.rss"
+ yield Request(
+ url=rss_url,
+ callback=self.parse_topic,
+ errback=self.errback,
+ meta={
+ "topic_id": topic_id,
+ "slug": slug,
+ "title": topic.get("title", f"{slug} ({topic_id})"),
+ "tags": topic.get("tags", []),
+ "has_accepted_answer": topic.get("has_accepted_answer", False),
+ "created_at": topic.get("created_at", ""),
+ },
+ )
+
+ # Recurse: follow more_topics_url if present
+ more_url = topic_list.get("more_topics_url")
+ if more_url:
+ next_url = urljoin(response.url, more_url)
+ if ".json" not in next_url:
+ # Insert .json before the query string:
+ # /c/.../87?page=1 → /c/.../87.json?page=1
+ if "?" in next_url:
+ path, qs = next_url.split("?", 1)
+ next_url = f"{path}.json?{qs}"
+ else:
+ next_url += ".json"
+ yield Request(
+ url=next_url,
+ callback=self.parse_category,
+ errback=self.errback,
+ meta={"category_path": category_path},
+ )
+ else:
+ logger.info("Category %s exhausted (no more_topics_url)", category_path)
+
+ def _content_matches_keywords(self, text: str) -> bool:
+ """No keywords pattern means accept everything."""
+ if not self.keywords_re:
+ return True
+ return any(pattern.search(text) for pattern in self.keywords_re)
+
+ # ── Topic RSS → DiscourseTopicPageItem ───────────────────────────────
+ def parse_topic(self, response: Response) -> Iterator[DiscourseTopicPageItem]:
+ if not isinstance(response, TextResponse):
+ logger.debug("Skipping non-text response: %s", response.url)
+ return
+
+ if not self._content_matches_keywords(response.text):
+ logger.debug("Skipping topic (no keyword match): %s", response.url)
+ return
+
+ slug = response.meta.get("slug", "")
+ topic_id = response.meta.get("topic_id", "")
+ title = response.meta.get("title", "")
+ tags = response.meta.get("tags", [])
+
+ yield DiscourseTopicPageItem(
+ url=response.url.replace(".rss", ""), # This will redirect to normal browser view of the topic.
+ content=response.text,
+ suffix="rss",
+ source_type="web",
+ title=title,
+ content_type=response.headers.get("Content-Type", b"").decode(
+ "utf-8", errors="replace"
+ ),
+ encoding=response.encoding or "utf-8",
+ topic_id=topic_id,
+ slug=slug,
+ tags=tags,
+ has_accepted_answer=response.meta.get("has_accepted_answer", False),
+ created_at=response.meta.get("created_at", ""),
+ )
+
+ def errback(self, failure):
+ self.logger.error(
+ "Request failed: %s — %s",
+ failure.request.url,
+ repr(failure.value),
+ )
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/link.py b/src/data_manager/collectors/scrapers/spiders/link.py
new file mode 100644
index 000000000..ce9e82415
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/link.py
@@ -0,0 +1,115 @@
+from typing import Iterator, Callable
+from urllib.parse import urlparse
+from scrapy import Spider
+from scrapy.http import Response, Request
+from scrapy.linkextractors import LinkExtractor
+from scrapy.link import Link
+from src.data_manager.collectors.scrapers.utils import IMAGE_EXTENSIONS, IGNORED_DOCUMENT_EXTENSIONS
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.parsers.link import parse_link_page
+
+class LinkSpider(Spider):
+ """
+ Generic link-following spider for unauthenticated pages.
+ Stays within the hostnames of all start_urls, up to max_depth.
+ """
+
+ name = "link"
+
+ _DEFAULT_START_URLS = ["https://quotes.toscrape.com/"]
+
+ custom_settings = {
+ "DEPTH_LIMIT": 1, # Default max depth
+ "DOWNLOAD_DELAY": 2, # Default (download) delay
+ "CLOSESPIDER_PAGECOUNT": 500 # Default max pages
+ }
+
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ max_depth = kwargs.get("max_depth")
+ max_pages = kwargs.get("max_pages")
+ delay = kwargs.get("delay")
+ markitdown_enabled = kwargs.get("markitdown")
+ anonymize_data = kwargs.get("anonymize_data")
+ if max_depth:
+ crawler.settings.set("DEPTH_LIMIT", max_depth, priority="spider")
+ if max_pages:
+ crawler.settings.set("CLOSESPIDER_PAGECOUNT", max_pages, priority="spider")
+ if delay:
+ crawler.settings.set("DOWNLOAD_DELAY", delay, priority="spider")
+ if markitdown_enabled:
+ crawler.settings.set("MARKITDOWN_ENABLED", markitdown_enabled, priority="spider")
+ if anonymize_data:
+ crawler.settings.set("ANONYMIZE_DATA", anonymize_data, priority="spider")
+ return super().from_crawler(crawler, *args, **kwargs)
+
+ def __init__(self, start_urls: list[str] = None, max_depth: int = None, max_pages: int = None, allow: list[str] = None, deny: list[str] = None, delay: int = None, canonicalize: bool = False, process_value: Callable[[str], str] = None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._start_urls = start_urls or getattr(self, "_DEFAULT_START_URLS", [])
+ self._allowed_domains: set[str] = {
+ urlparse(u).netloc
+ for u in self._start_urls
+ if urlparse(u).netloc
+ }
+ default_deny = getattr(self, "_DEFAULT_DENY", [])
+ default_process_value = getattr(self, "_DEFAULT_PROCESS_VALUE", None)
+ self._le = LinkExtractor(
+ allow=allow or [],
+ deny=(deny or []) + default_deny,
+ allow_domains=list(self._allowed_domains),
+ deny_extensions=(IMAGE_EXTENSIONS + IGNORED_DOCUMENT_EXTENSIONS),
+ canonicalize=canonicalize,
+ process_value=process_value or default_process_value,
+ unique=True,
+ )
+
+ async def start(self):
+ """
+ Seed requests — validates start_urls at crawl time, not import time.
+ Building the habit: always attach errback here, never rely on
+ start_urls shortcut in production spiders.
+ """
+ if not self._start_urls:
+ raise ValueError("LinkSpider requires start_urls to be set")
+ for url in self._start_urls:
+ yield Request(url=url, callback=self.parse, errback=self.errback, meta={"depth": 0})
+
+ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
+ """
+ Extract one item per response, then yield follow Requests up to max_depth.
+ @url https://quotes.toscrape.com/
+ @returns items 1
+ @returns requests 1
+ @scrapes url title
+ """
+ yield from self.parse_item(response) # Yield Item
+ yield from self.follow_links(response) # Yield Requests
+
+
+ def follow_links(self, response: Response) -> Iterator[Request]:
+ current_depth = response.meta.get("depth", 0)
+ if current_depth >= self.settings.get("DEPTH_LIMIT"):
+ self.logger.info("Reached max depth %d", self.settings.get("DEPTH_LIMIT"))
+ return
+ for link in self.parse_follow_links(response):
+ self.logger.info("Following %s at depth %d", link.url, current_depth)
+ yield Request(link.url, callback=self.parse, errback=self.errback, meta={"depth": current_depth + 1})
+
+ def errback(self, failure):
+ self.logger.error(
+ "Request failed: %s — %s",
+ failure.request.url,
+ repr(failure.value),
+ )
+
+ # ------------------------------------------------------------------ #
+ # Extension points — pure, unit-testable/checkable without a reactor
+ # ------------------------------------------------------------------ #
+
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ yield from parse_link_page(response)
+
+ def parse_follow_links(self, response: Response) -> Iterator[Link]:
+ links = self._le.extract_links(response)
+ self.logger.info("Extracted %d links from %s", len(links), response.url)
+ yield from links
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/spiders/twiki.py b/src/data_manager/collectors/scrapers/spiders/twiki.py
new file mode 100644
index 000000000..d43314ddb
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/spiders/twiki.py
@@ -0,0 +1,78 @@
+from typing import Iterator
+from urllib.parse import urlparse
+from scrapy.http import Response, Request
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.spiders.link import LinkSpider
+from src.data_manager.collectors.scrapers.parsers.twiki import parse_twiki_page
+
+
+class TwikiSpider(LinkSpider):
+ """
+ Minimal Twiki spider against a real Twiki target.
+    Supports CERN SSO authentication.
+ """
+
+ name = "twiki"
+
+ auth_provider_name = "cern_sso"
+
+ _DEFAULT_START_URLS = [
+ "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideCrab", # public page
+ ]
+
+ _DEFAULT_DENY = [
+ # CGI endpoints — no content, mostly we allow just /bin/view/ or /bin/viewauth/
+ r"/bin/edit",
+ r"/bin/logon",
+ r"/bin/oops",
+ r"/bin/attach",
+ r"/bin/search",
+ r"/bin/rdiff",
+ r"/bin/history",
+ r"/bin/raw",
+ r"/bin/genpdf", # PDF generation — not content
+ r"/bin/view/Main", # user profile pages, not content
+ # Navigation/structural pages
+ r"LeftBar$", # just ignore all left bar pages
+ r"/bin/view/[^/]+/WebLeftBar", # sidebar navigation template
+ r"/bin/view/[^/]+/WebTopBar", # top navigation bar
+ r"/bin/view/[^/]+/WebChanges", # recent changes — floods with links
+ r"/bin/view/[^/]+/WebIndex", # alphabetical index — floods with links
+ r"/bin/view/[^/]+/WebStatistics", # statistics pages
+ r"/bin/view/[^/]+/WebNotify", # notification subscriptions
+ r"/bin/view/[^/]+/WebPreferences", # wiki preferences
+ # Discard Topic List page, too many links in https://twiki.cern.ch/twiki/bin/view/CMSPublic/WebTopicList
+        r"/bin/view/[^/]+/WebTopicList",  # too many links — should instead be added as seed URLs.
+ r"/bin/view/[^/]+/WebSearch", # search page — floods with links
+        # (duplicate r"/bin/view/[^/]+/WebChanges" entry removed — already denied above)
+ ]
+
+ custom_settings = {
+ "ROBOTSTXT_OBEY": False,
+ "DOWNLOAD_TIMEOUT": 120,
+ "RETRY_TIMES": 0, # Very Safe no retries
+ "DEPTH_LIMIT": 1, # Default max depth
+ "DOWNLOAD_DELAY": 60, # Default (download) delay
+ "CLOSESPIDER_PAGECOUNT": 2, # Very Safe Default max pages
+ "COOKIES_ENABLED": False, # disable CookiesMiddleware jar
+ }
+
+ @staticmethod
+ def _normalize_url(url: str) -> str:
+ """Keep TWiki URLs clean: only scheme + netloc + path — drop query params and fragment."""
+ return urlparse(url)._replace(query="", fragment="").geturl() # type: ignore
+
+ _DEFAULT_PROCESS_VALUE = _normalize_url
+
+ def parse(self, response: Response) -> Iterator[WebPageItem | Request]:
+ """
+ Twiki pages render their main content inside #patternMain or .twikiMain.
+ @url https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideCrab
+ @returns items 1 1
+ @scrapes url title
+ @returns requests 1 105
+ """
+ yield from super().parse(response)
+
+ def parse_item(self, response: Response) -> Iterator[WebPageItem]:
+ yield from parse_twiki_page(response)
\ No newline at end of file
diff --git a/src/data_manager/collectors/scrapers/utils.py b/src/data_manager/collectors/scrapers/utils.py
new file mode 100644
index 000000000..003003d77
--- /dev/null
+++ b/src/data_manager/collectors/scrapers/utils.py
@@ -0,0 +1,19 @@
+from scrapy.http import Response
+
+IMAGE_EXTENSIONS = [
+ "png", "jpg", "jpeg", "gif", "bmp", "svg", "ico", "webp"
+]
+
+# pdf, docx, xlsx, pptx are supported first-class by MarkItDown; skip legacy/archive formats
+IGNORED_DOCUMENT_EXTENSIONS = [
+ "doc",
+ "xls",
+ "ppt",
+ "zip",
+ "rar",
+]
+
+def get_content_type(response: Response) -> str:
+ """Decode the Content-Type header bytes to str."""
+ raw: bytes = response.headers.get("Content-Type", b"") or b""
+ return raw.decode("utf-8", errors="replace")
diff --git a/src/data_manager/collectors/utils/anonymizer.py b/src/data_manager/collectors/utils/anonymizer.py
index 72ac00456..782709ccd 100644
--- a/src/data_manager/collectors/utils/anonymizer.py
+++ b/src/data_manager/collectors/utils/anonymizer.py
@@ -3,20 +3,71 @@
"""
import re
-from typing import List, Set
+from typing import List, Set, Dict, Any
import spacy
from src.utils.config_access import get_data_manager_config
+from html import unescape
+
+# Generic markup patterns
+_TAG_RE = re.compile(r"<[^>]+>")
+_CDATA_RE = re.compile(r"")
+_DC_CREATOR_RE = re.compile(
+ r'(<dc:creator[^>]*>).*?(</dc:creator>)',  # NOTE(review): reconstructed — two groups required by the \1\2 substitution; verify
+ re.IGNORECASE,
+)
+_ATTR_TEXT_RE = re.compile(r'(?:title|alt|creator|author)=["\']([^"\']+)["\']', re.IGNORECASE)
+_CONTENT_TAG_RE = re.compile(
+ r'<(?:p|li|td|description|title|dc:creator)[^>]*>(.*?)(?:p|li|td|description|title|dc:creator)>',
+ re.DOTALL | re.IGNORECASE,
+)
+# e.g. <a href="/Main/AlbertEinstein">Albert-Einstein</a> → (removed)
+_DEFAULT_GENERIC_MARKUP_USER_LINK_RE = re.compile(
+ r'<a[^>]*href="[^"]*?/(?:Main|author|user|profile|members)/[^"]*"[^>]*>[^<]*</a>',
+ re.IGNORECASE,
+)
+# Generic author link, like Albert-Einstein
+# Stephenie Meyer
+# John Doe
+# Jane Smith
+# Bob
+_DEFAULT_GENERIC_MARKUP_AUTHOR_ELEMENT_RE = re.compile(
+ r'<[^>]*(?:itemprop=["\']author["\']|class=["\'][^"\']*\bauthor\b[^"\']*["\']|rel=["\']author["\'])[^>]*>[^<]*</[^>]+>',
+ re.IGNORECASE,
+)
+# e.g. <a href="/twiki/bin/view/Main/JohnDoe">JohnDoe</a> → (removed)
+_DEFAULT_MARKUP_TWIKI_USER_LINK_RE = re.compile(
+ r'<a[^>]*href="[^"]*?/twiki/bin/\w+/Main/\w+"[^>]*>\w+</a>',
+ re.IGNORECASE,
+)
+# <br/>John<br/> → (removed)
+# <br/>John Doe<br/> → (removed)
+_DEFAULT_MARKUP_SIGNOFF_TAG_RE = re.compile(
+ r'<br\s*/?>\s*(?:<b>)?\s*[A-Z][\w.]*(?:\s+[A-Z][\w.]*){0,2}\s*<br\s*/?>',  # NOTE(review): tags reconstructed (lost in extraction); verify
+ re.IGNORECASE,
+)
+# ..atm <br/>\nJohn<br/> → ..atm <br/>
+# Thanks,\nJohn → (removed)
+# Yours sincerely,\nJ.D.Doe]]> → ]]>
+_DEFAULT_MARKUP_TRAILING_SIGNOFF_TAG_RE = re.compile(
+ r'(?:'
+ r'<br\s*/?>\s*\n?\s*'
+ r'|(?:Thanks|Cheers|Best|Regards|HTH|Yours\s+sincerely)\s*,?\s*[\n\s]*'
+ r')'
+ r'[A-Z][\w.]*(?:\s+[A-Z][\w.]*){0,2}'
+ r'\s*(?=<br|</|\]\]>)',  # NOTE(review): lookahead alternatives reconstructed (lost in extraction); verify
+ re.IGNORECASE,
+)
class Anonymizer:
- def __init__(self):
+ def __init__(self, dm_config: Dict[str, Any]=None):
"""
Initialize the Anonymizer.
"""
- dm_config = get_data_manager_config()
+ dm_config = dm_config or get_data_manager_config()
data_manager_utils = dm_config.get("utils", {}) if isinstance(dm_config, dict) else {}
anonymizer_config = data_manager_utils.get("anonymizer", {}) if isinstance(data_manager_utils, dict) else {}
@@ -45,39 +96,91 @@ def __init__(self):
self.SIGNOFF_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in signoff_patterns]
self.EMAIL_PATTERN = re.compile(email_pattern)
self.USERNAME_PATTERN = re.compile(username_pattern)
+
+ def _discover_names(self, text: str) -> set:
+ """NER to discover names in the text."""
+ doc = self.nlp(text)
+ return {
+ ent.text for ent in doc.ents
+ if ent.label_ == "PERSON" and ent.text not in self.EXCLUDED_WORDS
+ }
+
+ def _discover_names_markup(self, markup: str) -> set:
+ # Full document: names with surrounding context (catches CDATA)
+ full_text = self._extract_text(markup)
+ names = self._discover_names(full_text)
+ # Per-chunk: focused paragraphs (catches standalone names inside <p>/<title>-style content tags)
+ for chunk in self._extract_text_chunks(markup):
+ names |= self._discover_names(chunk)
+ return names
def anonymize(self, text: str) -> str:
"""
Anonymize names, emails, usernames, greetings, and sign-offs from the text.
"""
- doc = self.nlp(text)
- names_to_replace = {
- ent.text for ent in doc.ents
- if ent.label_ == "PERSON" and ent.text not in self.EXCLUDED_WORDS
- }
+ names_to_replace = self._discover_names(text)
# Remove email addresses and usernames
text = self.EMAIL_PATTERN.sub("", text)
text = self.USERNAME_PATTERN.sub("", text)
- # Remove greetings and sign-offs
+ text = self._strip_greetings_signoffs(text)
+ return self._replace_names(text, names_to_replace)
+
+ def anonymize_markup(self, markup: str) -> str:
+ """
+ Anonymize names, emails, usernames, greetings, and sign-offs from markup,
+ including HTML, RSS, and other markup formats (especially TWiki and Discourse markup).
+ """
+ names_to_replace = self._discover_names_markup(markup)
+ # Remove email addresses and usernames
+ markup = self.EMAIL_PATTERN.sub("", markup)
+ markup = self.USERNAME_PATTERN.sub("", markup)
+ markup = _DC_CREATOR_RE.sub(r'\1\2', markup)
+ markup = _DEFAULT_GENERIC_MARKUP_AUTHOR_ELEMENT_RE.sub("", markup)
+ markup = _DEFAULT_GENERIC_MARKUP_USER_LINK_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_SIGNOFF_TAG_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_TRAILING_SIGNOFF_TAG_RE.sub("", markup)
+ markup = _DEFAULT_MARKUP_TWIKI_USER_LINK_RE.sub("", markup)
+ markup = self._strip_greetings_signoffs(markup)
+ return self._replace_names(markup, names_to_replace)
+
+ def _strip_greetings_signoffs(self, text: str) -> str:
lines = text.splitlines()
- filtered_lines: List[str] = []
+ filtered = []
for line in lines:
- stripped_line = line.strip()
- if any(p.match(stripped_line) for p in self.GREETING_PATTERNS):
+ stripped = line.strip()
+ if any(p.match(stripped) for p in self.GREETING_PATTERNS):
continue
- if any(p.match(stripped_line) for p in self.SIGNOFF_PATTERNS):
+ if any(p.match(stripped) for p in self.SIGNOFF_PATTERNS):
continue
- filtered_lines.append(line)
- text = "\n".join(filtered_lines)
-
- # Remove names (case-insensitive)
- for name in sorted(names_to_replace, key=len, reverse=True):
- pattern = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE)
- text = pattern.sub("", text)
-
- # Remove extra whitespace
- text = "\n".join(line for line in text.splitlines() if line.strip())
-
- return text
+ filtered.append(line)
+ return "\n".join(filtered)
+
+ def _replace_names(self, text: str, names: set) -> str:
+ for name in sorted(names, key=len, reverse=True):
+ text = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE).sub("", text)
+ return "\n".join(line for line in text.splitlines() if line.strip())
+
+ def _extract_text(self, markup: str) -> str:
+ """Strip markup to plain text for NER. Format-agnostic."""
+ attrs = " ".join(_ATTR_TEXT_RE.findall(markup))
+ clean = _CDATA_RE.sub(" ", markup)
+ clean = _TAG_RE.sub(" ", clean)
+ clean = unescape(clean)
+ return re.sub(r"\s+", " ", f"{clean} {attrs}").strip()
+
+ def _extract_text_chunks(self, markup: str) -> list:
+ chunks = []
+ # Text content from content tags (<p>, <li>, <td>, <description>, <title>, <dc:creator>)
+ for match in _CONTENT_TAG_RE.finditer(markup):
+ inner = _CDATA_RE.sub(" ", match.group(1))
+ clean = _TAG_RE.sub(" ", inner)
+ clean = unescape(clean).strip()
+ if clean:
+ chunks.append(clean)
+ # Text from attributes
+ attr_text = " ".join(_ATTR_TEXT_RE.findall(markup))
+ if attr_text.strip():
+ chunks.append(attr_text.strip())
+ return chunks
\ No newline at end of file
diff --git a/src/data_manager/collectors/utils/markitdown_convertor.py b/src/data_manager/collectors/utils/markitdown_convertor.py
new file mode 100644
index 000000000..560d0d43f
--- /dev/null
+++ b/src/data_manager/collectors/utils/markitdown_convertor.py
@@ -0,0 +1,32 @@
+import io
+from markitdown import MarkItDown
+from src.utils.logging import get_logger
+# from src.interfaces.llm.llm_client import LLMClient
+
+logger = get_logger(__name__)
+
+class MarkitdownConvertor:
+
+ def __init__(self):
+ self.markitdown = MarkItDown(
+ enable_plugins=True,
+ # llm_client=llm_client,
+ # llm_model=llm_model,
+ )
+
+ def convert(self, content: str, file_extension: str = ".html") -> str:
+ """
+ Convert the content to markdown using MarkItDown.
+ Args:
+ content: The content to convert.
+ file_extension: The file extension of the content.
+ Returns:
+ The converted content.
+ """
+ logger.info(f"Converting content to markdown: {content}")
+ result = self.markitdown.convert_stream(
+ io.BytesIO(content.encode("utf-8")),
+ file_extension=file_extension,
+ )
+ logger.info(f"Markitdown result: {result.text_content if hasattr(result, 'text_content') else str(result)}")
+ return result.text_content if hasattr(result, 'text_content') else str(result)
\ No newline at end of file
diff --git a/src/data_manager/data_manager.py b/src/data_manager/data_manager.py
index 1f4b01a32..da169b4fd 100644
--- a/src/data_manager/data_manager.py
+++ b/src/data_manager/data_manager.py
@@ -2,9 +2,11 @@
from typing import Callable, Optional
from src.data_manager.collectors.persistence import PersistenceService
-from src.data_manager.collectors.scrapers.scraper_manager import ScraperManager
+from src.data_manager.collectors.utils.anonymizer import Anonymizer
+from src.data_manager.collectors.scraper_manager import ScraperManager
from src.data_manager.collectors.tickets.ticket_manager import TicketManager
from src.data_manager.collectors.localfile_manager import LocalFileManager
+from src.data_manager.collectors.git_manager import GitManager
from src.data_manager.vectorstore.manager import VectorStoreManager
from src.utils.config_access import get_full_config
from src.utils.config_service import ConfigService
@@ -35,8 +37,10 @@ def __init__(self, *, run_ingestion: bool = True, factory=None):
raise RuntimeError("Static config missing sources_config; run deployment initialization first.")
self.config["data_manager"]["sources"] = static_config.sources_config
+ self.anonymizer = Anonymizer()
self.localfile_manager = LocalFileManager(dm_config=self.config["data_manager"])
- self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"])
+ self.git_manager = GitManager(dm_config=self.config["data_manager"])
+ self.scraper_manager = ScraperManager(dm_config=self.config["data_manager"], persistence=self.persistence, anonymizer=self.anonymizer)
self.ticket_manager = TicketManager(dm_config=self.config["data_manager"])
self.vector_manager = VectorStoreManager(
@@ -61,7 +65,8 @@ def run_ingestion(self, progress_callback: Optional[Callable[[str], None]] = Non
"""Execute initial ingestion and vectorstore update."""
source_aggregation = [
("Copying configured local files", lambda: self.localfile_manager.collect_all_from_config(self.persistence)),
- ("Scraping documents onto filesystem", lambda: self.scraper_manager.collect_all_from_config(self.persistence)),
+ ("Collecting git repos", lambda: self.git_manager.collect_all_from_config(self.persistence)),
+ ("Scraping web sources onto filesystem", lambda: self.scraper_manager.collect_all_from_config()),
("Fetching ticket data onto filesystem", lambda: self.ticket_manager.collect_all_from_config(self.persistence)),
]
diff --git a/src/data_manager/vectorstore/loader_utils.py b/src/data_manager/vectorstore/loader_utils.py
index 622386439..6aea94536 100644
--- a/src/data_manager/vectorstore/loader_utils.py
+++ b/src/data_manager/vectorstore/loader_utils.py
@@ -28,7 +28,7 @@ def select_loader(file_path: str | Path):
return TextLoader(str(path))
if file_extension == ".py":
return PythonLoader(str(path))
- if file_extension in {".html", ".htm"}:
+ if file_extension in {".html", ".htm", ".rss", ".xml"}:
return BSHTMLLoader(str(path), bs_kwargs={"features": "html.parser"})
if file_extension == ".pdf":
return PyPDFLoader(str(path))
diff --git a/src/interfaces/uploader_app/app.py b/src/interfaces/uploader_app/app.py
index f7fd20cc8..05c9fc77f 100644
--- a/src/interfaces/uploader_app/app.py
+++ b/src/interfaces/uploader_app/app.py
@@ -14,7 +14,8 @@
from src.data_manager.collectors.persistence import PersistenceService
from src.data_manager.collectors.localfile_manager import LocalFileManager
-from src.data_manager.collectors.scrapers.scraper_manager import ScraperManager
+from src.data_manager.collectors.scraper_manager import ScraperManager
+from src.data_manager.collectors.git_manager import GitManager
from src.data_manager.collectors.utils.catalog_postgres import PostgresCatalogService
from src.data_manager.collectors.tickets.ticket_manager import TicketManager
from src.data_manager.vectorstore.loader_utils import load_text_from_path
@@ -77,7 +78,8 @@ def __init__(
if not self.salt:
logger.warning("UPLOADER_SALT not set; account checks may fail.")
- self.scraper_manager = ScraperManager(dm_config=self.config.get("data_manager"))
+ self.scraper_manager = ScraperManager(dm_config=self.config.get("data_manager"), persistence=self.persistence)
+ self.git_manager = GitManager(dm_config=self.config.get("data_manager"))
self.ticket_manager = TicketManager(dm_config=self.config.get("data_manager"))
self.localfile_manager = LocalFileManager(dm_config=self.config.get("data_manager"))
self.post_update_hook = post_update_hook
@@ -174,7 +176,8 @@ def add_git_repo(self):
return jsonify({"error": "missing_repo_url"}), 400
try:
- self.scraper_manager.collect_git(self.persistence, [repo_url.strip()])
+ self.git_manager.collect([repo_url.strip()], self.persistence)
+ self.persistence.flush_index()
self._update_source_status("git", state="idle", last_run=self._now_iso())
self._notify_update()
return jsonify({"status": "ok"})
@@ -278,13 +281,12 @@ def upload_url(self):
return jsonify({"error": "invalid_depth"}), 400
if depth < 0:
return jsonify({"error": "invalid_depth"}), 400
- # LinkScraper currently uses max_depth >= 1 for the initial URL fetch.
if depth == 0:
depth = 1
if url:
logger.info("Uploading the following URL: %s", url)
try:
- scraped_count = self.scraper_manager.collect_links(self.persistence, link_urls=[url], max_depth=depth)
+ scraped_count = self.scraper_manager.collect("link", [url])
self.persistence.flush_index()
self._update_source_status("web", state="idle", last_run=self._now_iso())
added_to_urls = True
diff --git a/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html b/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html
new file mode 100644
index 000000000..667b81951
--- /dev/null
+++ b/tests/unit/fixtures/twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html
@@ -0,0 +1,993 @@
+
+
+
+
+
+
+
+
+ CRAB3ConfigurationFile < CMSPublic < TWiki
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
CRAB configuration file
+
+
+
+
+
+
CRAB configuration file
+
+
+For convenience, we suggest to place the CRAB configuration file in the same directory as the CMSSW parameter-set file to be used by CRAB.
+
+The expected default name of the CRAB configuration file is
crabConfig.py, but of course one can give it any name (respecting always the filename extension
.py and not adding other dots in the filename), as long as one specifies the name when required (e.g. when issuing the CRAB submission command).
+
+In CRAB3 the configuration file is in Python language. It consists of creating a
Configuration object imported from the
WMCore library:
+
+
+import CRABClient
+from WMCore.Configuration import Configuration
+config = Configuration()
+
+
+Once the
Configuration object is created, it is possible to add new sections to it with corresponding parameters. This is done using the following syntax:
+
+
+config.section_("<section-name>")
+config.<section-name>.<parameter-name> = <parameter-value>
+
+
+
Abbreviated configuration definition
+Those lines can be simplified a bit by using instead the following which already defines all the
config.<section-name> objects, but the more
+explicit format above is more clear and it is the one most commonly used
+
+import CRABClient
+from CRABClient.UserUtilities import config
+
+
+
+
+
+
+
+
CRAB configuration sections
+
+
+The table below shows what are the sections currently available for CRAB configuration.
+
+
+
+ Section
+ Description
+
+
+ General
+ In this section, the user specifies generic parameters about the request (e.g. request name).
+
+
+ JobType
+ This section aims to contain all the parameters of the user job type and related configurables (e.g. CMSSW parameter-set configuration file, additional input files, etc.).
+
+
+ Data
+ This section contains all the parameters related to the data to be analyzed, including the splitting parameters.
+
+
+ Site
+ Grid site parameters are defined in this section, including the stage out information (e.g. stage out destination site, white/black lists, etc.).
+
+
+ User
+ This section is dedicated to all the information relative to the user (e.g. voms information).
+
+
+ Debug
+ For experts use only.
+
+
+
+
Predefined CRAB configuration file with empty skeleton
+
+To simplify life a bit, CRAB provides a function
config that returns a
Configuration object with pre-defined sections. The function is in the
CRABClient.UserUtilities module. Users can import and use the function in their CRAB configuration file:
+
+
+from CRABClient.UserUtilities import config
+config = config()
+
+
+which, from the point of view of the
Configuration instance, is equivalent to:
+
+
+from WMCore.Configuration import Configuration
+config = Configuration()
+
+config.section_("General")
+config.section_("JobType")
+config.section_("Data")
+config.section_("Site")
+config.section_("User")
+config.section_("Debug")
+
+
+
+
CRAB configuration parameters
+
+
+The table below provides a list of all the available CRAB configuration parameters (organized by sections), including a short description. Mandatory parameters are marked with two stars (**). Other important parameters are marked with one star (*).
+
+
+
+ Parameter
+ Type
+ Description
+
+
+ Section General
+
+
+
+
+ requestName (*)
+ string
+ A name the user gives to it's request/task. In particular, it is used by CRAB to create a project directory (named crab_<requestName>) where files corresponding to this particular task will be stored. Defaults to <time-stamp>, where the time stamp is of the form <YYYYMMDD>_<hhmmss> and corresponds to the submission time. The maximum allowed length is 100 characters, according to the formati in RX_TASKNAME . Task submission will fail with "Incorrect 'workflow' parameter" if other characters are used.
+
+
+ workArea (*)
+ string
+ The area (full or relative path) where to create the CRAB project directory. If the area doesn't exist, CRAB will try to create it using the mkdir command. Defaults to the current working directory.
+
+
+ transferOutputs (*)
+ boolean
+ Whether or not to transfer the output files to the storage site. If set to False, the output files are discarded and the user can not recover them. Defaults to True.
+
+
+ transferLogs (*)
+ boolean
+ Whether or not to copy the jobs log files to the storage site. If set to False, the log files are discarded and the user can not recover them. Notice however that a short version of the log files containing the first 1000 lines and the last 3000 lines are still available through the monitoring web pages. Defaults to False.
+
+
+ failureLimit
+ integer
+ The number of jobs that may fail permanently before the entire task is cancelled. Disabled by default. Note: a very dangerous parameter, for expert use, do not touch it unless you are sure of what you are doing
+
+
+ instance (**)
+ string
+ The CRAB server instance where to submit the task. For users please use 'prod'.
+
+
+ activity
+ string
+ The activity name used when reporting to Dashboard. For experts use only.
+
+
+
+
+
+
+
+ Section JobType
+
+
+
+
+ pluginName (**)
+ string
+ Specifies if this task is running an analysis ('Analysis') on an existing dataset or is running MC event generation ('PrivateMC').
+
+
+ psetName (*)
+ string
+ The name of the CMSSW parameter-set configuration file that should be run via cmsRun. Defaults to 'pset.py'.
+
+
+ generator
+ string
+ This parameter should be set to 'lhe' when running MC generation on LHE files. Automatically set if an LHESource is present in the parameter-set.
+
+
+ pyCfgParams
+ list of strings
+ List of parameters to pass to the CMSSW parameter-set configuration file, as explained here . For example, if set to ['myOption','-param1=value1',--'param2=value2'], then the jobs will execute cmsRun JobType.psetName myOption -param1=value1 --param2=value2. NOTE: No blanks allowed in 'param=value' see the note about pyCfgParams below.
+
+
+ inputFiles
+ list of strings
+ List of private input files (and/or directories) needed by the jobs. They will be added to the input sandbox. The input sandbox can not exceed 120 MB. The input sandbox is shipped with each job. The input files will be placed in the working directory where the users' application (e.g. cmsRun) is launched regardless of a possible path indicated in this parameter (i.e. only the file name at right of last / is relevant). Directories are tarred and their subtree structure is preserved. Please check the FAQ for more details on how these files are handled.
+
+
+ disableAutomaticOutputCollection
+ boolean
+ Whether to disable or not the automatic recognition of output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration. If set to True, it becomes the user's responsibility to specify in the JobType.outputFiles parameter all the output files that need to be collected. Defaults to False.
+
+
+ outputFiles
+ list of strings
+ List of output files that need to be collected. If disableAutomaticOutputCollection = False (the default), output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration are automatically recognized by CRAB and don't need to be included in this parameter. If publication in DBS is requested (the default) file names must obey DBS lexicon rules, in particular end with .root. If Data.publication is set to False any reasonable string will do.
+
+
+ eventsPerLumi
+ integer
+ Deprecated. Use Data.lumisPerFile instead.
+
+
+ allowUndistributedCMSSW
+ boolean
+ Whether to allow or not using a CMSSW release possibly not available at sites. Defaults to False.
+
+
+ maxMemoryMB
+ integer
+ Maximum amount of memory (in MB) a job is allowed to use. Defaults to 2000. The more memory you request, the more difficult will be to find a slot where to run your jobs. Maximum for single core jobs is 5000. for multiple cores can be up to 2500*numCores
+
+
+ maxJobRuntimeMin
+ integer
+ The maximum runtime (in minutes) per job. Jobs running longer than this amount of time will be removed. Defaults to 1315 (21 hours 55 minutes), see the note about maxJobRuntimeMin below. Not compatible with Automatic splitting.
+
+
+ numCores
+ integer
+ Number of requested cores per job. Defaults to 1. If you increase this value to run multi-threaded cmsRun, you may need to increase maxMemoryMB as well. In the CMSSW parameter-set configuration you may require also the number of streams to be larger than one per thread, which affects the memory consumption too.
+
+
+ priority
+ integer
+ Task priority among the user's own tasks. Higher priority tasks will be processed before lower priority. Two tasks of equal priority will have their jobs start in an undefined order. The first five jobs in a task are given a priority boost of 10. Defaults to 10.
+
+
+ scriptExe
+ string
+ A user script that should be run on the worker node instead of the default cmsRun. It is up to the user to setup the script properly to run on the worker node enviroment. CRAB guarantees that the CMSSW environment is setup (e.g. scram is in the path) and that the modified CMSSW parameter-set configuration file will be placed in the working directory with name PSet.py. The user must ensure that a properly named framework job report file will be written; this can be done e.g. by calling cmsRun within the script as cmsRun -j FrameworkJobReport.xml -p PSet.py. The script itself will be added automatically to the input sandbox. Output files produced by PoolOutputModule or TFileService in the CMSSW parameter-set configuration file will be automatically collected (CRAB3 will look in the framework job report). The user needs to specify other output files to be collected in the JobType.outputFiles parameter. See CRAB3AdvancedTopic#Running_a_user_script_with_CRAB for more information.
+
+
+ scriptArgs
+ list of strings
+ Additional arguments (in the form param=value) to be passed to the script specified in the JobType.scriptExe parameter. The first argument passed to the script is always the job number
+
+
+ sendPythonFolder
+ boolean
+ Obsolete. The 'python' folder in the CMSSW release ($CMSSW_BASE/python) is always included in the sandbox
+
+
+ sendVenvFolder
+ boolean
+ Determine if the =venv= folder in the CMSSW release ($CMSSW_BASE/venv) is included in the sandbox or not. Contrary to other sandbox files, symbolic links found in venv are not dereferenced. Defaults to False.
+
+
+ sendExternalFolder
+ boolean
+ Determine if the 'external' folder in the CMSSW release ($CMSSW_BASE/external) is included in the sandbox or not. See https://hypernews.cern.ch/HyperNews/CMS/get/computing-tools/1972.html . Defaults to False.
+
+
+ externalPluginFile
+ string
+ Name of a plug-in provided by the user and which should be run instead of the standard CRAB plug-in Analysis or PrivateMC. Can not be specified together with pluginName; is either one or the other. Not supported yet.
+
+
+
+
+
+
+
+ Section Data
+
+
+
+
+ inputDataset (*)
+ string
+ When running an analysis over a dataset registered in DBS, this parameter specifies the name of the dataset. The dataset can be an official CMS dataset or a dataset produced by a user or a Rucio DID as explain in this FAQ .
+
+
+ inputBlocks
+ list
+ A list of DBS block names in the format datasetname#uuid. If present only those blocks will be processed, instead of the full dataset. The dataset in the block names must be the same as indicated in inputDataset.
+
+
+ allowNonValidInputDataset
+ boolean
+ Allow CRAB to run over (the valid files of) the input dataset given in Data.inputDataset even if its status in DBS is not VALID. Defaults to False.
+
+
+ outputPrimaryDataset (*)
+ string
+ When running an analysis over private input files or running MC generation, this parameter specifies the primary dataset name that should be used in the LFN of the output/log files and in the publication dataset name (see Data handling in CRAB ).
+
+
+ inputDBS (*)
+ string
+ The URL of the DBS reader instance where the input dataset is published. The URL is of the form 'https://cmsweb.cern.ch/dbs/prod/<instance>/DBSReader', where instance can be global, phys01, phys02 or phys03. The default is global instance. The aliases global, phys01, phys02 and phys03 in place of the whole URLs are also supported (and indeed recommended to avoid typos). For datasets that are not of USER tier, CRAB only allows to read them from global DBS.
+
+
+ splitting (*)
+ string
+ Mode to use to split the task in jobs. When JobType.pluginName = 'Analysis', the splitting mode can either be 'Automatic' (the default, please read the dedicated FAQ ), 'FileBased', 'LumiBased', or 'EventAwareLumiBased' (for Data the recommended mode is 'Automatic' or 'LumiBased'). For 'EventAwareLumiBased', CRAB will split the task by luminosity sections, where each job will contain a varying number of luminosity sections such that the number of events analyzed by each job is roughly unitsPerJob. When JobType.pluginName = 'PrivateMC', the splitting mode can only be 'EventBased'.
+
+
+ unitsPerJob (*)
+ integer
+ Mandatory when Data.splitting is not 'Automatic', suggests (but not impose) how many units (i.e. files, luminosity sections or events - depending on the splitting mode - see the note about Data.splitting below) to include in each job. When Data.splitting = 'Automatic' it represents the jobs target runtime in minutes and its minimum allowed value is 180 (i.e. 3 hours).
+
+
+ totalUnits (*)
+ integer
+ Mandatory when JobType.pluginName = 'PrivateMC', in which case the parameter tells how many events to generate in total. When JobType.pluginName = 'Analysis', this parameter tells how many files (when Data.splitting = 'FileBased'), luminosity sections (when Data.splitting = 'LumiBased') or events (when Data.splitting = 'EventAwareLumiBased' or Data.splitting = 'Automatic' - see the note about "Data.splitting" below) to analyze (after applying the lumi-mask and/or run range filters).
+
+
+ lumisPerFile
+ integer
+ When JobType.pluginName = 'PrivateMC', this parameter specifies how many luminosity section will be presetn in each output file. It should be used only in very special and well motivated use cases. Note that every job starts with a fresh luminosity section, which may lead to unevenly sized luminosity sections if Data.unitsPerJob is not a multiple of this parameter. Defaults to 1.
+
+
+ useParent
+ boolean
+ Adds corresponding parent dataset in DBS as secondary input source. Allows to gain access to more data tiers than present in the current dataset. This will not check for parent dataset availability; jobs may fail with xrootd errors or due to missing dataset access. Defaults to False.
+
+
+ secondaryInputDataset
+ string
+ An extension of the Data.useParent parameter. Allows to specify any grandparent dataset in DBS (same instance as the primary dataset) as secondary input source. CRAB will internally set this dataset as the parent and will set Data.useParent = True. Therefore, Data.useParent and Data.secondaryInputDataset can not be used together a priori .
+
+
+ lumiMask (*)
+ string
+ A lumi-mask to apply to the input dataset before analysis. Can either be a URL address or the path to a JSON file on disk. Default to an empty string (no lumi-sections filter).
+
+
+ runRange (*)
+ string
+ The runs and/or run ranges to process (e.g. '193093-193999,198050,199564'). It can be used together with a lumi-mask. Defaults to an empty string (no run filter).
+
+
+ outLFNDirBase (*)
+ string
+ The first part of the LFN of the output files (see Data handling in CRAB ). Accepted values are /store/user/<username>[/<subdir>*] (the trailing / after <username> can not be omitted if a subdir is not given) and /store/group/<groupname>[/<subgroupname>*] (and /store/local/<dir>[/<subdir>*] if Data.publication = False). Defaults to /store/user/<username>/. CRAB creates the outLFNDirBase path on the storage site if needed, do not create it yourself otherwise the file stage-out may fail due to permissions inconsistency. Note: even if publication is disabled, the LFN needs to be a valid LFN name for DBS. So keep in mind that 1) name of first subdir after username (or groupname ) must start with a letter, not a number 2) do not use dot as separator inside names 3) LFN must end with .root
+
+
+ publication (*)
+ boolean
+ Whether to publish or not the EDM output files (i.e. output files produced by PoolOutputModule) in DBS. Notice that for publication to be possible, the corresponding output files have to be transferred to the permanent storage element. Defaults to True.
+
+
+ publishDBS (*)
+ string
+ The URL of the DBS writer instance where to publish. The URL is of the form 'https://cmsweb.cern.ch/dbs/prod/<instance>/DBSWriter', where instance can so far only be phys03, and therefore it is set as the default, so the user doesn't have to specify this parameter. The alias phys03 in place of the whole URL is also supported.
+
+
+ outputDatasetTag (*)
+ string
+ A custom string used in both, the LFN of the output files (even if Data.publication = False) and the publication dataset name (if Data.publication = True) (see Data handling in CRAB ).
+
+
+ ignoreLocality
+ boolean
+ Defaults to False. DO NOT USE
+
+
+ userInputFiles
+ list of strings
+ This parameter serves to run an analysis over a set of input files, as opposed to run over an full dataset from DBS. Format is: Data.userInputFiles = ['file1', 'file2', 'etc']. When this parameter is used, the only allowed splitting mode is 'FileBased'. There are two ways to use this. 1) as a fileMask, analogous to inputBlocks. If specified together with Data.inputDataset it must contain a list of LFN's and only the listed files will be processed from that Dataset. An error will be raised if some LFN does not belong to the input dataset. 2) as a pure "list of files" in case Data.inputDataset is missing. In this case 'fileN' can be an LFN (i.e. a string starting with /store/), or PFN (i.e. a string starting with protocol-prefix://store/ like e.g. a pointer to an xrootd redirector ref ). One could also have a local text file containing the list of input files (one file per line; don't include quotation marks nor commas) and then specify in this parameter the following: Data.userInputFiles = open('/path/to/local/file.txt').readlines(). Also, since there is no input dataset from where to extract the primary dataset name, the user must use the parameter Data.outputPrimaryDataset to define it. CRAB will not do any data discovery and user must specify the locations where to run the jobs via the Site.whitelist parameter.
+
+
+ partialDataset
+ boolean
+ Allows processing an input dataset that is only partially on disk. Normally, when CRAB finds out that some files of the input dataset are not fully replicated on disk, CRAB will issue a tape recall to Rucio and wait for all files to be on disk before running the task. If partialDataset is True, CRAB will submit the task to condor immediately without requesting a tape recall and process the files currently on disk.
+
+
+
+
+
+
+
+ Section Site
+
+
+
+
+ storageSite (**)
+ string
+ Site where the output files should be permanently copied to. See the note about storageSite below.
+
+
+ whitelist
+ list of strings
+ A user-specified list of sites where the jobs can run. For example: ['T2_CH_CERN','T2_IT_Bari',...]. Jobs will not be assigned to a site that is not in the white list. Note that at times this list may not be respected, see this FAQ
+
+
+ blacklist
+ list of strings
+ A user-specified list of sites where the jobs should not run. Useful to avoid jobs to run on a site where the user knows they will fail (e.g. because of temporary problems with the site). Note that at times this list may not be respected, see this FAQ
+
+
+ ignoreGlobalBlacklist
+ boolean
+ Whether or not to ignore the global site blacklist provided by the Site Status Board. Should only be used in special cases with a custom whitelist or blacklist to make sure the jobs land on the intended sites.
+
+
+ requireAccelerator
+ boolean
+ Defaults to False. Set to True to request GPU node for the jobs. Please see CMS Submission Infrastructure: GPUs monitor dashboard to check sites and GPUs availability.
+
+
+ acceleratorParams
+ dictionary
+ Defaults to {}. When Site.requireAccelerator is True, this parameter dictionary will be used to specify detailed GPU resource requirements for the jobs. Please see CMS Submission Infrastructure: GPUs monitor dashboard to check GPU's Memory, Capacity and Runtime availability. See Example parameters below.
+
+
+config.Site.acceleratorParams = {
+ "GPUMemoryMB": "4000",
+ "GPUMinimumCapability": "7.0",
+ "GPUMaximumCapability": "8.0",
+ "GPURuntime": "12.1"
+}
+
+
+
+
+
+
+
+
+ Section User
+
+
+
+
+ voGroup
+ string
+ The VO group that should be used with the proxy and under which the task should be submitted.
+
+
+ voRole
+ string
+ The VO role that should be used with the proxy and under which the task should be submitted.
+
+
+
+
+
+
+
+ Section Debug
+
+
+
+
+ oneEventMode
+ boolean
+ For experts use only.
+
+
+ asoConfig
+ list of dictionaries
+ For experts use only.
+
+
+ scheddName
+ string
+ For experts use only. NB if you select a schedd on the ITB pool, remember to change the collector accordingly!
+
+
+ extraJDL
+ list of strings
+ For experts use only.
+
+
+ collector
+ string
+ For experts use only.
+
+
+
+
Note for Data.splitting = 'EventAwareLumiBased'
+When CRAB does data discovery of the input dataset in DBS, the number of events is only known per input file (because that's the information available on DBS) and not per luminosity section. CRAB can therefore only estimate the number of events per luminosity section in a given input file as the number of events in the file divided by the number of luminosity sections in the file. Because of that,
Data.unitsPerJob and
Data.totalUnits should not be considered by the user as rigorous limits, but as limits applicable on average.
+
+
Note for maxJobRuntimeMin
+We strongly encourage every user to tune their splitting parameters aiming for jobs to run for a few hours, ideally 8-10 hours, and set the
maxJobRuntimeMin accordingly.
+Having many jobs increases the chance of failure, since the number of problems is roughly proportional to the number of jobs run. Moreover, short jobs suffer from start/end overheads resulting in a poor CPU/wall-clock ratio, which impacts CMS negatively and makes it harder to secure additional resources.
+
+
Note for pyCfgParam
+Either
JobType.pyCfgParams=["arg=value" ] or
JobType.pyCfgParams=["arg", "value" ] are fine (the latter somehow more correct), but
JobType.pyCfgParams=["arg value" ] will not work. You can use python's
shlex.split to convert args from "the way you type them when running cmsRun interactively" to the correct format. Example
+
import shlex
+args = '--arg 1 --another-arg "my name"'
+config.JobType.pyCfgParams = shlex.split(args)
+results in
+
['--arg', '1', '--another-arg', 'my name'] i.e. it correctly preserves quoted spaces etc.
+
+N.B. there have been reports, years ago, that params with double dashes may break things. Most likely it was due to (now very old and unsupported) python and/or CMSSW versions. But if you get odd errors you may check for this and report.
+
+
+
+
Note for storageSite
+In CRAB3 the output files of each job are transferred first to a temporary storage element in the site where the job ran and later from there to a permanent storage element in a destination site. The transfer to the permanent storage element is done asynchronously by a service called AsyncStageOut (ASO). The destination site must be specified in the
Site.storageSite parameter in the form
'Tx_yy_zzzzz' (e.g.
'T2_IT_Bari',
'T2_US_Nebraska', etc.). The official names of CMS sites can be found in the
CRIC web page.
The user MUST have write permission in the storage site.
+
+
+
Passing CRAB configuration parameters from the command line
+
+
+It is possible to define/overwrite CRAB configuration parameters by passing them through the command line when the
crab submit command is executed. Parameters can be set with the convention
<parameter-name>=<parameter-value> and can be sequentially listed separating them with a blank space. Here is an example on how one would pass the request name and the publication name:
+
+
+crab submit -c my_crab_config_file.py General.requestName=my_request_name Data.outputDatasetTag=my_publication_name
+
+
+
Note : Currently it is only possible to overwrite the parameters that take as value a string, an integer, a float or a boolean. Parameters that take a list can not be overwritten this way.
+
+
+
Converting a CRAB2 configuration file into a CRAB3 configuration file
+
+
+CRAB3 is essentially new compared to CRAB2; it is not just a re-write. As a consequence, the configuration is different and there is no direct trivial translation that can be done automatically for every CRAB2 configuration file into a CRAB3 one. There is only a basic CRAB3 utility, called
crab2cfgTOcrab3py, meant to help the user to convert an existing CRAB2 configuration file into a CRAB3 configuration file template. The user has to provide the name of the CRAB2 configuration file he/she wants to convert and the name he/she wants to give to the CRAB3 configuration file (both arguments have default values;
crab.cfg and
crabConfig.py respectively).
+
+
+crab2cfgTOcrab3py [crab2configName.cfg] [crab3configName.py]
+
+
+Instead of blindly taking the produced CRAB3 configuration file and running it, the user should always inspect the produced file, understand what each parameter means, edit them and add other parameters that might be needed, etc.
+
+Here we give a usage example. Suppose we have the following CRAB2 configuration file with the default name
crab.cfg:
+
+
+[CRAB]
+jobtype = cmssw
+scheduler = remoteGlidein
+use_server = 0
+
+[CMSSW]
+datasetpath = /GenericTTbar/HC-CMSSW_5_3_1_START53_V5-v1/GEN-SIM-RECO
+dbs_url = global
+pset = my_CMSSW_config.py
+number_of_jobs = 100
+events_per_job = 20
+output_file = output.root
+
+[GRID]
+se_white_list = T2_IT_Bari
+se_black_list = T2_IT_Legnaro
+data_location_override = T2_IT_Bari
+
+[USER]
+ui_working_dir = my_CRAB_project_directory
+return_data = 0
+copy_data = 1
+storage_element = T2_IT_Legnaro
+user_remote_dir = my_remote_directory
+publish_data = 1
+publish_data_name = my_publication_name
+dbs_url_for_publication = phys03
+
+
+If we run the tool without specifying any input parameters:
+
+
+crab2cfgTOcrab3py
+
+
+it will create a file
crabConfig.py with the following content:
+
+
+from WMCore.Configuration import Configuration
+config = Configuration()
+config.section_('General')
+config.General.transferOutputs = True
+config.General.requestName = 'my_CRAB_project_directory'
+config.section_('JobType')
+config.JobType.psetName = 'my_CMSSW_config.py'
+config.JobType.pluginName = 'Analysis'
+config.JobType.outputFiles = ['output.root']
+config.section_('Data')
+config.Data.inputDataset = '/GenericTTbar/HC-CMSSW_5_3_1_START53_V5-v1/GEN-SIM-RECO'
+config.Data.publication = True
+config.Data.unitsPerJob = 20
+config.Data.publishDBS = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
+config.Data.splitting = 'EventBased'
+config.Data.inputDBS = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader/'
+config.Data.outputDatasetTag = 'my_publication_name'
+config.section_('Site')
+config.Site.blacklist = ['T2_IT_Legnaro']
+config.Site.whitelist = ['T2_IT_Bari']
+config.Site.storageSite = 'T2_IT_Legnaro'
+
+
+and it will show the following screen output:
+
+
+Convertion done!
+crab2cfgTOcrab3py report:
+CRAB2 parameters not YET supported in CRAB3:
+ data_location_override,user_remote_dir
+CRAB2 parameters obsolete in CRAB3:
+ return_data,jobtype,scheduler,use_server
+
+
+As we already emphasized, the template configuration file produced by the
crab2cfgTOcrab3py utility should not be used before carefully looking into its content. Along this line, one can see for example that the parameter
JobType.outputFiles was set to
['output.root']. If
output.root is defined in the CMSSW parameter-set configuration file in an output module, then it doesn't have to be included in the
JobType.outputFiles list (although it doesn't harm).
+
+
+
+
+--
AndresTanasijczuk - 02 Oct 2014
+
+
+
+
+
+
+
+
+
+
+
Copyright © 2008-2026 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
or Ideas, requests, problems regarding TWiki? use
Discourse or
Send feedback
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/unit/test_scrapers_resource_adapter.py b/tests/unit/test_scrapers_resource_adapter.py
new file mode 100644
index 000000000..95b4c79d8
--- /dev/null
+++ b/tests/unit/test_scrapers_resource_adapter.py
@@ -0,0 +1,39 @@
+import pytest
+
+from src.data_manager.collectors.scrapers.adapters import to_scraped_resource
+from src.data_manager.collectors.scrapers.items import WebPageItem
+from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
+
+# ---------------------------------------------------------------------------
+# WebPageItem adapter
+# ---------------------------------------------------------------------------
+
+class TestWebPageItemAdapter:
+    """Adapter contract: a well-formed WebPageItem converts to a ScrapedResource."""
+
+    def _make_item(self, **overrides) -> WebPageItem:
+        # Minimal field set for a scraped web page; any field can be
+        # replaced via keyword arguments for per-test variations.
+        base = {
+            "url": "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+            "content": "CRAB3ConfigurationFile",
+            "title": "CRAB3ConfigurationFile",
+            "suffix": "html",
+            "source_type": "web",
+            "content_type": "text/html",
+            "encoding": "utf-8",
+        }
+        # Later keys win in the dict merge, so overrides take precedence over base.
+        return WebPageItem({**base, **overrides})
+
+    def test_returns_scraped_resource(self):
+        # Smoke test: the adapter must return a ScrapedResource for a valid item.
+        assert isinstance(to_scraped_resource(self._make_item()), ScrapedResource)
+
+# ---------------------------------------------------------------------------
+# Unregistered item type — must fail loudly
+# ---------------------------------------------------------------------------
+
+class TestUnregisteredItem:
+    """Items with no registered adapter must be rejected, not silently coerced."""
+
+    def test_raises_type_error_for_unknown_item(self):
+        """Adapter must raise, never silently return None or a half-baked resource."""
+
+        # Structurally dict-like (has url/content keys) but not a registered
+        # item type — the adapter must refuse it based on type, not shape.
+        class UnknownItem(dict):
+            pass
+
+        with pytest.raises(TypeError, match="No adapter registered"):
+            to_scraped_resource(UnknownItem({"url": "x", "content": "y"}))
diff --git a/tests/unit/test_twiki_parser.py b/tests/unit/test_twiki_parser.py
new file mode 100644
index 000000000..9c9ab3e5f
--- /dev/null
+++ b/tests/unit/test_twiki_parser.py
@@ -0,0 +1,54 @@
+# tests/unit/test_twiki_parser.py
+from pathlib import Path
+
+from scrapy.http import HtmlResponse, Request, Response
+
+from src.data_manager.collectors.scrapers.parsers.twiki import parse_twiki_page
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+def fake_html_response(url: str, fixture_name: str, charset: str) -> HtmlResponse:
+    """Build an HtmlResponse from an on-disk fixture, mimicking a real download.
+
+    The Content-Type header is only set when ``charset`` is non-empty; the
+    response encoding is deliberately left for Scrapy to infer.
+    """
+    body = (FIXTURES / fixture_name).read_bytes()
+    headers = {}
+    if charset:
+        headers[b"Content-Type"] = [f"text/html; charset={charset}".encode("ascii")]
+    # No `encoding=`: let Scrapy infer from headers + HTML meta (like a real download).
+    return HtmlResponse(
+        url=url,
+        status=200,
+        body=body,
+        headers=headers,
+        request=Request(url=url),
+    )
+
+class TestParseTwikiPage:
+    """Behavioral checks for parse_twiki_page on HTML pages and binary attachments."""
+
+    def test_conventional_twiki_page(self):
+        # Fixture captured from a real public TWiki page, served as iso-8859-1.
+        response = fake_html_response(
+            "https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3ConfigurationFile",
+            "twiki_twiki_bin_view_cmspublic_crab3_configuration_file.html",
+            "iso-8859-1",
+        )
+        item = next(parse_twiki_page(response))
+        assert item['title'] == "CRAB3ConfigurationFile"
+        assert item['suffix'] == "html"
+        assert item['source_type'] == "web"
+        assert item['content_type'] == "text/html; charset=iso-8859-1"
+        # Scrapy reports cp1252 here — it decodes iso-8859-1 bodies with the
+        # cp1252 superset (w3lib encoding normalization).
+        assert item['encoding'] == "cp1252"
+        # HTML fragment (outer tag + children), not flattened text — for MarkItDown etc.
+        assert "<" in item["content"] and ">" in item["content"]
+        assert "patternMainContents" in item["content"]
+        assert "href=" in item["content"]
+
+    def test_pdf_yields_bytes_like_link_parser(self):
+        # Non-HTML attachment: plain Response (not HtmlResponse); the parser
+        # must pass the raw bytes through instead of decoding them.
+        url = "https://twiki.cern.ch/twiki/pub/CMSPublic/Topic/file.pdf"
+        response = Response(
+            url=url,
+            body=b"%PDF-1.4 minimal",
+            headers={b"Content-Type": [b"application/pdf"]},
+            request=Request(url=url),
+        )
+        item = next(parse_twiki_page(response))
+        assert item["suffix"] == "pdf"
+        assert item["content"] == b"%PDF-1.4 minimal"
+        # Title appears to derive from the filename stem — TODO confirm against parser.
+        assert item["title"] == "file"
\ No newline at end of file