Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/on_schedule_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
fail-fast: false
max-parallel: 12
matrix:
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"]
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel", "stagehand"]
http-client: ["httpx", "curl_impersonate", "impit"]
package-manager: ["pip", "uv", "poetry"]

Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/project_template/cookiecutter.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"project_name": "crawlee-python-project",
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"],
"crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit", "stagehand"],
"__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
"http_client": ["impit", "httpx", "curl-impersonate"],
"package_manager": ["poetry", "pip", "uv"],
Expand Down
5 changes: 3 additions & 2 deletions src/crawlee/project_template/hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# % set needs_playwright = cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
import platform
import subprocess
from pathlib import Path
Expand All @@ -12,7 +13,7 @@
subprocess.check_call(['uv', 'sync'])
# % endif

# % if cookiecutter.crawler_type == 'playwright'
# % if needs_playwright
manager = "{{ cookiecutter.package_manager }}"
subprocess.check_call([manager, 'run', 'playwright', 'install'])
# % endif
Expand All @@ -38,7 +39,7 @@
subprocess.check_output([str(path / 'pip'), 'freeze']).decode()
)

# % if cookiecutter.crawler_type == 'playwright'
# % if needs_playwright
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# Child template of 'main.py' that supplies its `import` and `instantiation` blocks.
# The crawler is an AdaptivePlaywrightCrawler built via the BeautifulSoup static-parser
# factory, wired to `router` and limited to 10 requests per crawl.
# NOTE(review): `self.http_client_instantiation()` renders a block that is presumably
# defined in 'main.py' - confirm against the parent template.

# % block import
from crawlee.crawlers import AdaptivePlaywrightCrawler
# % endblock

# % block instantiation
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=10,
{{ self.http_client_instantiation() }})
# % endblock
12 changes: 12 additions & 0 deletions src/crawlee/project_template/templates/main_adaptive_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# Child template of 'main.py' that supplies its `import` and `instantiation` blocks.
# The crawler is an AdaptivePlaywrightCrawler built via the Parsel static-parser
# factory, wired to `router` and limited to 10 requests per crawl.
# NOTE(review): `self.http_client_instantiation()` renders a block that is presumably
# defined in 'main.py' - confirm against the parent template.

# % block import
from crawlee.crawlers import AdaptivePlaywrightCrawler
# % endblock

# % block instantiation
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
request_handler=router,
max_requests_per_crawl=10,
{{ self.http_client_instantiation() }})
# % endblock
17 changes: 17 additions & 0 deletions src/crawlee/project_template/templates/main_stagehand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# % extends 'main.py'

# % block import
from crawlee.browsers import StagehandOptions
from crawlee.crawlers import StagehandCrawler
# % endblock

# % block instantiation
crawler = StagehandCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=10,
stagehand_options=StagehandOptions(
model_api_key='<YOUR_OPENAI_API_KEY>',
Comment thread
vdusek marked this conversation as resolved.
Outdated
),
{{ self.http_client_instantiation() }})
# % endblock
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Router whose handlers receive an adaptive (static or browser) crawling context.
router = Router[AdaptivePlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Store the page URL and title, then enqueue the links found on the page.

    Args:
        context: Crawling context of the request currently being handled.
    """
    context.log.info(f'Processing {context.request.url} ...')

    # `find` yields a falsy value when the document has no <title> element.
    title_tag = context.parsed_content.find('title')

    loaded_url = context.request.loaded_url
    page_title = title_tag.text if title_tag else None
    record = {
        'url': loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()
19 changes: 19 additions & 0 deletions src/crawlee/project_template/templates/routes_adaptive_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Router whose handlers receive an adaptive (static or browser) crawling context.
router = Router[AdaptivePlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Store the page URL and title, then enqueue the links found on the page.

    Args:
        context: Crawling context of the request currently being handled.
    """
    context.log.info(f'Processing {context.request.url} ...')

    # Select the text node of <title>; `get()` returns the first match, if any.
    title_nodes = context.parsed_content.xpath('//title/text()')
    page_title = title_nodes.get()

    record = {
        'url': context.request.loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()
21 changes: 21 additions & 0 deletions src/crawlee/project_template/templates/routes_stagehand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from crawlee.crawlers import StagehandCrawlingContext
from crawlee.router import Router

# Router whose handlers receive a Stagehand crawling context.
router = Router[StagehandCrawlingContext]()


@router.default_handler
async def default_handler(context: StagehandCrawlingContext) -> None:
    """Extract data from the page via an instruction, store it, and enqueue links.

    Args:
        context: Crawling context of the request currently being handled.
    """
    context.log.info(f'Processing {context.request.url} ...')

    # The extraction is driven by a natural-language instruction passed to the page.
    extracted = await context.page.extract(instruction='Get the page title and main heading.')

    record = {
        'url': context.request.loaded_url,
        'data': extracted.model_dump(),
    }
    await context.push_data(record)

    await context.enqueue_links()
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.13
# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
# % set base_image = 'apify/actor-python-playwright:3.13'
# % elif cookiecutter.crawler_type == 'playwright-camoufox'
FROM apify/actor-python-playwright-camoufox:3.13
# % set base_image = 'apify/actor-python-playwright-camoufox:3.13'
# % elif cookiecutter.crawler_type == 'playwright-chrome'
FROM apify/actor-python-playwright-chrome:3.13
# % set base_image = 'apify/actor-python-playwright-chrome:3.13'
# % elif cookiecutter.crawler_type == 'playwright-firefox'
FROM apify/actor-python-playwright-firefox:3.13
# % set base_image = 'apify/actor-python-playwright-firefox:3.13'
# % elif cookiecutter.crawler_type == 'playwright-webkit'
FROM apify/actor-python-playwright-webkit:3.13
# % set base_image = 'apify/actor-python-playwright-webkit:3.13'
# % else
FROM apify/actor-python:3.13
# % set base_image = 'apify/actor-python:3.13'
# % endif
FROM {{ base_image }}

RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# % if cookiecutter.crawler_type.startswith('playwright')
# % if cookiecutter.crawler_type.startswith('adaptive-')
# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel']
# % elif cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % elif cookiecutter.crawler_type == 'stagehand'
# % set extras = ['stagehand']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % elif cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % if cookiecutter.http_client in ('httpx', 'curl-impersonate')
# % do extras.append(cookiecutter.http_client)
# % endif

[project]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
# % if cookiecutter.crawler_type == 'playwright-camoufox'
camoufox[geoip]~=0.4.5
# % endif
# % if cookiecutter.crawler_type.startswith('playwright')
# % if cookiecutter.crawler_type.startswith('adaptive-')
# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel']
# % elif cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % elif cookiecutter.crawler_type == 'stagehand'
# % set extras = ['stagehand']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.enable_apify_integration
apify
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % endif
# % if cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % if cookiecutter.http_client in ('httpx', 'curl-impersonate')
# % do extras.append(cookiecutter.http_client)
# % endif
crawlee[{{ extras | join(',') }}]
3 changes: 3 additions & 0 deletions tests/e2e/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def pytest_configure(config: Config) -> None:
'playwright_webkit',
'parsel',
'beautifulsoup',
'adaptive_beautifulsoup',
'adaptive_parsel',
'stagehand',
'uv',
'poetry',
'pip',
Expand Down
11 changes: 11 additions & 0 deletions tests/e2e/project_template/test_static_crawlers_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
pytest.param('playwright', marks=pytest.mark.playwright),
pytest.param('parsel', marks=pytest.mark.parsel),
pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),
pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup),
pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel),
pytest.param('stagehand', marks=pytest.mark.stagehand),
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -106,6 +109,14 @@ async def test_static_crawler_actor_at_apify(
client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN'))
actor = client.actor(actor_id)

# The template ships a placeholder API key, so only validate the build and skip the run.
if crawler_type == 'stagehand':
try:
assert build_process.returncode == 0
finally:
await actor.delete()
return

# Run actor
try:
assert build_process.returncode == 0
Expand Down
Loading