From cc2a712ff0a0d3df76f3e2ac802a12fb4a4268d3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 9 May 2026 10:58:06 +0200 Subject: [PATCH 1/6] feat: Add HTTP and Adaptive crawler templates Add `http`, `adaptive-beautifulsoup`, and `adaptive-parsel` to the cookiecutter project template (`crawlee create`), plus matching markers, parametrization in the e2e templates tests, and entries in the scheduled-tests CI matrix so the new types get the same coverage as existing ones. --- .github/workflows/on_schedule_tests.yaml | 2 +- .../project_template/cookiecutter.json | 2 +- .../hooks/post_gen_project.py | 4 +-- .../templates/main_adaptive_beautifulsoup.py | 12 +++++++ .../templates/main_adaptive_parsel.py | 12 +++++++ .../project_template/templates/main_http.py | 12 +++++++ .../routes_adaptive_beautifulsoup.py | 19 ++++++++++ .../templates/routes_adaptive_parsel.py | 19 ++++++++++ .../project_template/templates/routes_http.py | 35 +++++++++++++++++++ .../{{cookiecutter.project_name}}/Dockerfile | 15 ++++---- .../pyproject.toml | 16 ++++++--- .../requirements.txt | 17 +++++---- tests/e2e/conftest.py | 3 ++ .../test_static_crawlers_templates.py | 3 ++ 14 files changed, 149 insertions(+), 22 deletions(-) create mode 100644 src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py create mode 100644 src/crawlee/project_template/templates/main_adaptive_parsel.py create mode 100644 src/crawlee/project_template/templates/main_http.py create mode 100644 src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py create mode 100644 src/crawlee/project_template/templates/routes_adaptive_parsel.py create mode 100644 src/crawlee/project_template/templates/routes_http.py diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index bf041faa08..83ceba4943 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -27,7 +27,7 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "http", "adaptive_beautifulsoup", "adaptive_parsel"] http-client: ["httpx", "curl_impersonate", "impit"] package-manager: ["pip", "uv", "poetry"] diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index 53e0c8f445..f3ec54e97a 100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], + "crawler_type": ["beautifulsoup", "parsel", "http", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index dc90ec516c..fd24e168d2 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -12,7 +12,7 @@ subprocess.check_call(['uv', 'sync']) # % endif -# % if cookiecutter.crawler_type == 'playwright' +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') manager = "{{ cookiecutter.package_manager }}" subprocess.check_call([manager, 'run', 'playwright', 'install']) # % endif @@ -38,7 +38,7 @@ subprocess.check_output([str(path / 'pip'), 'freeze']).decode() ) -# % if cookiecutter.crawler_type == 'playwright' +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') subprocess.check_call([str(path / 'playwright'), 'install']) # % endif # % endif diff --git a/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py b/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py new file mode 100644 index 0000000000..7c536498b9 --- /dev/null +++ b/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import AdaptivePlaywrightCrawler +# % endblock + +# % block instantiation +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + request_handler=router, + max_requests_per_crawl=10, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_adaptive_parsel.py b/src/crawlee/project_template/templates/main_adaptive_parsel.py new file mode 100644 index 0000000000..ff789928e0 --- /dev/null +++ b/src/crawlee/project_template/templates/main_adaptive_parsel.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import AdaptivePlaywrightCrawler +# % endblock + +# % block instantiation +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + request_handler=router, + max_requests_per_crawl=10, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_http.py b/src/crawlee/project_template/templates/main_http.py new file mode 100644 index 0000000000..73b7141f25 --- /dev/null +++ b/src/crawlee/project_template/templates/main_http.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import HttpCrawler +# % endblock + +# % block instantiation +crawler = HttpCrawler( + request_handler=router, + max_requests_per_crawl=10, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py b/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py new file mode 100644 index 0000000000..b0d8333853 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py @@ -0,0 +1,19 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[AdaptivePlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.parsed_content.find('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title.text if title else None, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_adaptive_parsel.py b/src/crawlee/project_template/templates/routes_adaptive_parsel.py new file mode 100644 index 0000000000..95e767075d --- /dev/null +++ b/src/crawlee/project_template/templates/routes_adaptive_parsel.py @@ -0,0 +1,19 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[AdaptivePlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.parsed_content.xpath('//title/text()').get() + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_http.py b/src/crawlee/project_template/templates/routes_http.py new file mode 100644 index 0000000000..381697d3e1 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_http.py @@ -0,0 +1,35 @@ +import re +from urllib.parse import urljoin, urlparse + +from crawlee.crawlers import HttpCrawlingContext +from crawlee.router import Router + +router = Router[HttpCrawlingContext]() + + +@router.default_handler +async def default_handler(context: HttpCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + body = (await context.http_response.read()).decode(errors='replace') + + title_match = re.search(r']*>(.*?)', body, re.DOTALL | re.IGNORECASE) + title = title_match.group(1).strip() if title_match else None + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title, + } + ) + + # HttpCrawler has no HTML parser, so links are extracted with a regex below. + # For real HTML scraping, prefer BeautifulSoupCrawler or ParselCrawler. + base_url = context.request.loaded_url or context.request.url + base_host = urlparse(base_url).hostname + new_requests: list[str] = [] + for match in re.finditer(r']*\bhref=["\']([^"\']+)["\']', body, re.IGNORECASE): + absolute = urljoin(base_url, match.group(1)) + parsed = urlparse(absolute) + if parsed.scheme in ('http', 'https') and parsed.hostname == base_host: + new_requests.append(absolute) + await context.add_requests(new_requests) diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index 323181d058..280bca82c7 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -1,19 +1,20 @@ # First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. -# % if cookiecutter.crawler_type == 'playwright' -FROM apify/actor-python-playwright:3.13 +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') +# % set base_image = 'apify/actor-python-playwright:3.13' # % elif cookiecutter.crawler_type == 'playwright-camoufox' -FROM apify/actor-python-playwright-camoufox:3.13 +# % set base_image = 'apify/actor-python-playwright-camoufox:3.13' # % elif cookiecutter.crawler_type == 'playwright-chrome' -FROM apify/actor-python-playwright-chrome:3.13 +# % set base_image = 'apify/actor-python-playwright-chrome:3.13' # % elif cookiecutter.crawler_type == 'playwright-firefox' -FROM apify/actor-python-playwright-firefox:3.13 +# % set base_image = 'apify/actor-python-playwright-firefox:3.13' # % elif cookiecutter.crawler_type == 'playwright-webkit' -FROM apify/actor-python-playwright-webkit:3.13 +# % set base_image = 'apify/actor-python-playwright-webkit:3.13' # % else -FROM apify/actor-python:3.13 +# % set base_image = 'apify/actor-python:3.13' # % endif +FROM {{ base_image }} RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/* diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 2de6aa2532..23fa68a0f0 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -1,12 +1,14 @@ -# % if cookiecutter.crawler_type.startswith('playwright') +# % if cookiecutter.crawler_type == 'http' +# % set extras = [] +# % elif cookiecutter.crawler_type.startswith('adaptive-') +# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] +# % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] # % endif -# % if cookiecutter.http_client == 'curl-impersonate' -# % do extras.append('curl-impersonate') -# % elif cookiecutter.http_client == 'httpx' -# % do extras.append('httpx') +# % if cookiecutter.http_client in ('httpx', 'curl-impersonate') +# % do extras.append(cookiecutter.http_client) # % endif [project] @@ -19,7 +21,11 @@ authors = [ readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ + # % if extras "crawlee[{{ extras|join(',') }}]", + # % else + "crawlee", + # % endif # % if cookiecutter.crawler_type == 'playwright-camoufox' "camoufox[geoip]~=0.4.5", # % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index 8ac28ed5e4..cf61f73e29 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -1,7 +1,11 @@ # % if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 # % endif -# % if cookiecutter.crawler_type.startswith('playwright') +# % if cookiecutter.crawler_type == 'http' +# % set extras = [] +# % elif cookiecutter.crawler_type.startswith('adaptive-') +# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] +# % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] # % else # % set extras = [cookiecutter.crawler_type] @@ -9,10 +13,11 @@ camoufox[geoip]~=0.4.5 # % if cookiecutter.enable_apify_integration apify # % endif -# % if cookiecutter.http_client == 'curl-impersonate' -# % do extras.append('curl-impersonate') -# % endif -# % if cookiecutter.http_client == 'httpx' -# % do extras.append('httpx') +# % if cookiecutter.http_client in ('httpx', 'curl-impersonate') +# % do extras.append(cookiecutter.http_client) # % endif +# % if extras crawlee[{{ extras | join(',') }}] +# % else +crawlee +# % endif diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 81945a760a..c000ba1bba 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -20,6 +20,9 @@ def pytest_configure(config: Config) -> None: 'playwright_webkit', 'parsel', 'beautifulsoup', + 'http', + 'adaptive_beautifulsoup', + 'adaptive_parsel', 'uv', 'poetry', 'pip', diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index bd40f0c8ac..7bec34bba6 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -26,6 +26,9 @@ pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), + pytest.param('http', marks=pytest.mark.http), + pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup), + pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel), ], ) @pytest.mark.parametrize( From b11f51d5cbb0d7f398edae8772b04cea9463a0e1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 May 2026 14:58:37 +0200 Subject: [PATCH 2/6] rm HTTP crawler --- .github/workflows/on_schedule_tests.yaml | 2 +- .../project_template/cookiecutter.json | 2 +- .../project_template/templates/main_http.py | 12 ------- .../project_template/templates/routes_http.py | 35 ------------------- .../pyproject.toml | 8 +---- .../requirements.txt | 8 +---- tests/e2e/conftest.py | 1 - .../test_static_crawlers_templates.py | 1 - 8 files changed, 4 insertions(+), 65 deletions(-) delete mode 100644 src/crawlee/project_template/templates/main_http.py delete mode 100644 src/crawlee/project_template/templates/routes_http.py diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index 83ceba4943..7a0fd5b612 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -27,7 +27,7 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "http", "adaptive_beautifulsoup", "adaptive_parsel"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel"] http-client: ["httpx", "curl_impersonate", "impit"] package-manager: ["pip", "uv", "poetry"] diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index f3ec54e97a..710adeef9f 100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "http", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], + "crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/templates/main_http.py b/src/crawlee/project_template/templates/main_http.py deleted file mode 100644 index 73b7141f25..0000000000 --- a/src/crawlee/project_template/templates/main_http.py +++ /dev/null @@ -1,12 +0,0 @@ -# % extends 'main.py' - -# % block import -from crawlee.crawlers import HttpCrawler -# % endblock - -# % block instantiation -crawler = HttpCrawler( - request_handler=router, - max_requests_per_crawl=10, - {{ self.http_client_instantiation() }}) -# % endblock diff --git a/src/crawlee/project_template/templates/routes_http.py b/src/crawlee/project_template/templates/routes_http.py deleted file mode 100644 index 381697d3e1..0000000000 --- a/src/crawlee/project_template/templates/routes_http.py +++ /dev/null @@ -1,35 +0,0 @@ -import re -from urllib.parse import urljoin, urlparse - -from crawlee.crawlers import HttpCrawlingContext -from crawlee.router import Router - -router = Router[HttpCrawlingContext]() - - -@router.default_handler -async def default_handler(context: HttpCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - body = (await context.http_response.read()).decode(errors='replace') - - title_match = re.search(r']*>(.*?)', body, re.DOTALL | re.IGNORECASE) - title = title_match.group(1).strip() if title_match else None - await context.push_data( - { - 'url': context.request.loaded_url, - 'title': title, - } - ) - - # HttpCrawler has no HTML parser, so links are extracted with a regex below. - # For real HTML scraping, prefer BeautifulSoupCrawler or ParselCrawler. - base_url = context.request.loaded_url or context.request.url - base_host = urlparse(base_url).hostname - new_requests: list[str] = [] - for match in re.finditer(r']*\bhref=["\']([^"\']+)["\']', body, re.IGNORECASE): - absolute = urljoin(base_url, match.group(1)) - parsed = urlparse(absolute) - if parsed.scheme in ('http', 'https') and parsed.hostname == base_host: - new_requests.append(absolute) - await context.add_requests(new_requests) diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 23fa68a0f0..633f8acbef 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -1,6 +1,4 @@ -# % if cookiecutter.crawler_type == 'http' -# % set extras = [] -# % elif cookiecutter.crawler_type.startswith('adaptive-') +# % if cookiecutter.crawler_type.startswith('adaptive-') # % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] # % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] @@ -21,11 +19,7 @@ authors = [ readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ - # % if extras "crawlee[{{ extras|join(',') }}]", - # % else - "crawlee", - # % endif # % if cookiecutter.crawler_type == 'playwright-camoufox' "camoufox[geoip]~=0.4.5", # % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index cf61f73e29..2891a42351 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -1,9 +1,7 @@ # % if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 # % endif -# % if cookiecutter.crawler_type == 'http' -# % set extras = [] -# % elif cookiecutter.crawler_type.startswith('adaptive-') +# % if cookiecutter.crawler_type.startswith('adaptive-') # % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] # % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] @@ -16,8 +14,4 @@ apify # % if cookiecutter.http_client in ('httpx', 'curl-impersonate') # % do extras.append(cookiecutter.http_client) # % endif -# % if extras crawlee[{{ extras | join(',') }}] -# % else -crawlee -# % endif diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index c000ba1bba..3f1a707636 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -20,7 +20,6 @@ def pytest_configure(config: Config) -> None: 'playwright_webkit', 'parsel', 'beautifulsoup', - 'http', 'adaptive_beautifulsoup', 'adaptive_parsel', 'uv', diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index 7bec34bba6..df66cb8862 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -26,7 +26,6 @@ pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), - pytest.param('http', marks=pytest.mark.http), pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup), pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel), ], From 464fc58fe26ec055dc26f093c14414792abe2cc7 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 May 2026 15:42:20 +0200 Subject: [PATCH 3/6] Add stagehand --- .github/workflows/on_schedule_tests.yaml | 2 +- .../project_template/cookiecutter.json | 2 +- .../hooks/post_gen_project.py | 4 ++-- .../templates/main_stagehand.py | 18 +++++++++++++++ .../templates/routes_stagehand.py | 22 +++++++++++++++++++ .../{{cookiecutter.project_name}}/Dockerfile | 2 +- .../pyproject.toml | 2 ++ .../requirements.txt | 2 ++ tests/e2e/conftest.py | 1 + .../test_static_crawlers_templates.py | 10 +++++++++ 10 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 src/crawlee/project_template/templates/main_stagehand.py create mode 100644 src/crawlee/project_template/templates/routes_stagehand.py diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index 7a0fd5b612..96c4edb98d 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -27,7 +27,7 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel", "stagehand"] http-client: ["httpx", "curl_impersonate", "impit"] package-manager: ["pip", "uv", "poetry"] diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index 710adeef9f..0d50dbd766 100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], + "crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit", "stagehand"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index fd24e168d2..7cd71c49ce 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -12,7 +12,7 @@ subprocess.check_call(['uv', 'sync']) # % endif -# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' manager = "{{ cookiecutter.package_manager }}" subprocess.check_call([manager, 'run', 'playwright', 'install']) # % endif @@ -38,7 +38,7 @@ subprocess.check_output([str(path / 'pip'), 'freeze']).decode() ) -# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' subprocess.check_call([str(path / 'playwright'), 'install']) # % endif # % endif diff --git a/src/crawlee/project_template/templates/main_stagehand.py b/src/crawlee/project_template/templates/main_stagehand.py new file mode 100644 index 0000000000..3112e3fb40 --- /dev/null +++ b/src/crawlee/project_template/templates/main_stagehand.py @@ -0,0 +1,18 @@ +# % extends 'main.py' + +# % block import +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler +# % endblock + +# % block instantiation +crawler = StagehandCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + stagehand_options=StagehandOptions( + model_api_key='', + ), + {{ self.http_client_instantiation() }} +) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_stagehand.py b/src/crawlee/project_template/templates/routes_stagehand.py new file mode 100644 index 0000000000..d0cbe5f6ab --- /dev/null +++ b/src/crawlee/project_template/templates/routes_stagehand.py @@ -0,0 +1,22 @@ +from crawlee.crawlers import StagehandCrawlingContext +from crawlee.router import Router + +router = Router[StagehandCrawlingContext]() + + +@router.default_handler +async def default_handler(context: StagehandCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + + # Use Stagehand AI to extract structured data from the page. + data = await context.page.extract(instruction='Get the page title and main heading.') + + await context.push_data( + { + 'url': context.request.loaded_url, + 'data': data.model_dump() if hasattr(data, 'model_dump') else data, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index 280bca82c7..4f958871e2 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -1,7 +1,7 @@ # First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. -# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' # % set base_image = 'apify/actor-python-playwright:3.13' # % elif cookiecutter.crawler_type == 'playwright-camoufox' # % set base_image = 'apify/actor-python-playwright-camoufox:3.13' diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 633f8acbef..f74127ec6f 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -2,6 +2,8 @@ # % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] # % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] +# % elif cookiecutter.crawler_type == 'stagehand' +# % set extras = ['stagehand'] # % else # % set extras = [cookiecutter.crawler_type] # % endif diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index 2891a42351..47c6091d77 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -5,6 +5,8 @@ camoufox[geoip]~=0.4.5 # % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] # % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] +# % elif cookiecutter.crawler_type == 'stagehand' +# % set extras = ['stagehand'] # % else # % set extras = [cookiecutter.crawler_type] # % endif diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 3f1a707636..25c5ab8695 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -22,6 +22,7 @@ def pytest_configure(config: Config) -> None: 'beautifulsoup', 'adaptive_beautifulsoup', 'adaptive_parsel', + 'stagehand', 'uv', 'poetry', 'pip', diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index df66cb8862..43387178c1 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -28,6 +28,7 @@ pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup), pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel), + pytest.param('stagehand', marks=pytest.mark.stagehand), ], ) @pytest.mark.parametrize( @@ -108,6 +109,15 @@ async def test_static_crawler_actor_at_apify( client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN')) actor = client.actor(actor_id) + # Stagehand requires an OpenAI API key at runtime; the template ships a placeholder, so we only + # validate that the actor builds successfully and skip the actual run. + if crawler_type == 'stagehand': + try: + assert build_process.returncode == 0 + finally: + await actor.delete() + return + # Run actor try: assert build_process.returncode == 0 From 6986eefc810d68c6248a7442829be4794c993d6c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 May 2026 15:47:28 +0200 Subject: [PATCH 4/6] Simplification --- src/crawlee/project_template/hooks/post_gen_project.py | 5 +++-- src/crawlee/project_template/templates/main_stagehand.py | 3 +-- src/crawlee/project_template/templates/routes_stagehand.py | 3 +-- tests/e2e/project_template/test_static_crawlers_templates.py | 3 +-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index 7cd71c49ce..c7e69b030b 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -1,3 +1,4 @@ +# % set needs_playwright = cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' import platform import subprocess from pathlib import Path @@ -12,7 +13,7 @@ subprocess.check_call(['uv', 'sync']) # % endif -# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' +# % if needs_playwright manager = "{{ cookiecutter.package_manager }}" subprocess.check_call([manager, 'run', 'playwright', 'install']) # % endif @@ -38,7 +39,7 @@ subprocess.check_output([str(path / 'pip'), 'freeze']).decode() ) -# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' +# % if needs_playwright subprocess.check_call([str(path / 'playwright'), 'install']) # % endif # % endif diff --git a/src/crawlee/project_template/templates/main_stagehand.py b/src/crawlee/project_template/templates/main_stagehand.py index 3112e3fb40..399406ffb7 100644 --- a/src/crawlee/project_template/templates/main_stagehand.py +++ b/src/crawlee/project_template/templates/main_stagehand.py @@ -13,6 +13,5 @@ stagehand_options=StagehandOptions( model_api_key='', ), - {{ self.http_client_instantiation() }} -) + {{ self.http_client_instantiation() }}) # % endblock diff --git a/src/crawlee/project_template/templates/routes_stagehand.py b/src/crawlee/project_template/templates/routes_stagehand.py index d0cbe5f6ab..b197250b16 100644 --- a/src/crawlee/project_template/templates/routes_stagehand.py +++ b/src/crawlee/project_template/templates/routes_stagehand.py @@ -9,13 +9,12 @@ async def default_handler(context: StagehandCrawlingContext) -> None: """Default request handler.""" context.log.info(f'Processing {context.request.url} ...') - # Use Stagehand AI to extract structured data from the page. data = await context.page.extract(instruction='Get the page title and main heading.') await context.push_data( { 'url': context.request.loaded_url, - 'data': data.model_dump() if hasattr(data, 'model_dump') else data, + 'data': data.model_dump(), } ) diff --git a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index 43387178c1..358f05fe50 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -109,8 +109,7 @@ async def test_static_crawler_actor_at_apify( client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN')) actor = client.actor(actor_id) - # Stagehand requires an OpenAI API key at runtime; the template ships a placeholder, so we only - # validate that the actor builds successfully and skip the actual run. + # The template ships a placeholder API key, so only validate the build and skip the run. if crawler_type == 'stagehand': try: assert build_process.returncode == 0 From 5f331df20e0d6fdd8155ba3cd057241531d29016 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 May 2026 20:50:35 +0200 Subject: [PATCH 5/6] feat: Read Stagehand OpenAI key from OPENAI_API_KEY env var in template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why: Picks up the API key from CI/local env automatically, with a clear error if it's missing — instead of leaving a placeholder string that silently fails at runtime. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/crawlee/project_template/templates/main_stagehand.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/crawlee/project_template/templates/main_stagehand.py b/src/crawlee/project_template/templates/main_stagehand.py index 399406ffb7..4f408b7c56 100644 --- a/src/crawlee/project_template/templates/main_stagehand.py +++ b/src/crawlee/project_template/templates/main_stagehand.py @@ -1,17 +1,23 @@ # % extends 'main.py' # % block import +import os + from crawlee.browsers import StagehandOptions from crawlee.crawlers import StagehandCrawler # % endblock # % block instantiation +model_api_key = os.environ.get('OPENAI_API_KEY') +if model_api_key is None: + raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') + crawler = StagehandCrawler( request_handler=router, headless=True, max_requests_per_crawl=10, stagehand_options=StagehandOptions( - model_api_key='', + model_api_key=model_api_key, ), {{ self.http_client_instantiation() }}) # % endblock From 9e706c4a1614e9d8b7e2a5f45fcecdacee73e997 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 May 2026 20:52:18 +0200 Subject: [PATCH 6/6] feat: Use ValueError for missing OPENAI_API_KEY in Stagehand template Co-Authored-By: Claude Opus 4.7 (1M context) --- src/crawlee/project_template/templates/main_stagehand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/project_template/templates/main_stagehand.py b/src/crawlee/project_template/templates/main_stagehand.py index 4f408b7c56..0a63a5153e 100644 --- a/src/crawlee/project_template/templates/main_stagehand.py +++ b/src/crawlee/project_template/templates/main_stagehand.py @@ -10,7 +10,7 @@ # % block instantiation model_api_key = os.environ.get('OPENAI_API_KEY') if model_api_key is None: - raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') + raise ValueError('The OPENAI_API_KEY environment variable is not set.') crawler = StagehandCrawler( request_handler=router,