From 1febb655df57a44753ed86779ce6320883a0d7bc Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 22 Apr 2026 22:44:55 +0000 Subject: [PATCH 01/10] add stagehand plugin --- .../playwright_crawler_stagehand/__init__.py | 0 .../browser_classes.py | 101 -------- .../stagehand_run.py | 66 ------ .../support_classes.py | 57 ----- docs/guides/playwright_crawler_stagehand.mdx | 66 ------ pyproject.toml | 3 + .../browsers/_stagehand_browser_controller.py | 193 +++++++++++++++ .../browsers/_stagehand_browser_plugin.py | 221 ++++++++++++++++++ src/crawlee/browsers/_types.py | 93 +++++++- src/crawlee/crawlers/_stagehand/__init__.py | 3 + .../crawlers/_stagehand/_stagehand_crawler.py | 99 ++++++++ uv.lock | 46 +++- 12 files changed, 654 insertions(+), 294 deletions(-) delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/__init__.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py delete mode 100644 docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py delete mode 100644 docs/guides/playwright_crawler_stagehand.mdx create mode 100644 src/crawlee/browsers/_stagehand_browser_controller.py create mode 100644 src/crawlee/browsers/_stagehand_browser_plugin.py create mode 100644 src/crawlee/crawlers/_stagehand/__init__.py create mode 100644 src/crawlee/crawlers/_stagehand/_stagehand_crawler.py diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py b/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py deleted file mode 100644 index 67b76f1f47..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py +++ /dev/null @@ -1,101 
+0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, cast - -from stagehand.context import StagehandContext -from typing_extensions import override - -from crawlee.browsers import ( - PlaywrightBrowserController, - PlaywrightBrowserPlugin, - PlaywrightPersistentBrowser, -) - -from .support_classes import CrawleeStagehandPage - -if TYPE_CHECKING: - from collections.abc import Mapping - - from playwright.async_api import Page - from stagehand import Stagehand - - from crawlee.proxy_configuration import ProxyInfo - - -class StagehandBrowserController(PlaywrightBrowserController): - @override - def __init__( - self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any - ) -> None: - # Initialize with browser context instead of browser instance - super().__init__(browser, **kwargs) - - self._stagehand = stagehand - self._stagehand_context: StagehandContext | None = None - - @override - async def new_page( - self, - browser_new_context_options: Mapping[str, Any] | None = None, - proxy_info: ProxyInfo | None = None, - ) -> Page: - # Initialize browser context if not already done - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - - # Initialize Stagehand context if not already done - if not self._stagehand_context: - self._stagehand_context = await StagehandContext.init( - self._browser_context, self._stagehand - ) - - # Create a new page using Stagehand context - page = await self._stagehand_context.new_page() - - pw_page = page._page # noqa: SLF001 - - # Handle page close event - pw_page.on(event='close', f=self._on_page_close) - - # Update internal state - self._pages.append(pw_page) - self._last_page_opened_at = datetime.now(timezone.utc) - - self._total_opened_pages += 1 - - # Wrap StagehandPage to provide Playwright Page interface - return 
cast('Page', CrawleeStagehandPage(page)) - - -class StagehandPlugin(PlaywrightBrowserPlugin): - """Browser plugin that integrates Stagehand with Crawlee's browser management.""" - - @override - def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None: - super().__init__(**kwargs) - - self._stagehand = stagehand - - @override - async def new_browser(self) -> StagehandBrowserController: - if not self._playwright: - raise RuntimeError('Playwright browser plugin is not initialized.') - - browser = PlaywrightPersistentBrowser( - # Stagehand can run only on a Chromium-based browser. - self._playwright.chromium, - self._user_data_dir, - self._browser_launch_options, - ) - - # Return custom controller with Stagehand - return StagehandBrowserController( - browser=browser, - stagehand=self._stagehand, - header_generator=None, - fingerprint_generator=None, - ) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py b/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py deleted file mode 100644 index 6cf8cc2689..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from typing import cast - -from stagehand import StagehandConfig, StagehandPage - -from crawlee import ConcurrencySettings -from crawlee.browsers import BrowserPool -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -from .browser_classes import StagehandPlugin -from .support_classes import CrawleeStagehand - - -async def main() -> None: - # Configure local Stagehand with Gemini model - config = StagehandConfig( - env='LOCAL', - model_name='google/gemini-2.5-flash-preview-05-20', - model_api_key=os.getenv('GEMINI_API_KEY'), - ) - - # Create Stagehand instance - stagehand = CrawleeStagehand(config) - - # Create crawler with custom browser pool using Stagehand - crawler = PlaywrightCrawler( - # Limit the 
crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=10, - # Custom browser pool. Gives users full control over browsers used by the crawler. - concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10), - browser_pool=BrowserPool( - plugins=[ - StagehandPlugin(stagehand, browser_launch_options={'headless': True}) - ], - ), - ) - - # Define the default request handler, which will be called for every request. - @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - context.log.info(f'Processing {context.request.url} ...') - - # Cast to StagehandPage for proper type hints in IDE - page = cast('StagehandPage', context.page) - - # Use regular Playwright method - playwright_title = await page.title() - context.log.info(f'Playwright page title: {playwright_title}') - - # highlight-start - # Use AI-powered extraction with natural language - gemini_title = await page.extract('Extract page title') - context.log.info(f'Gemini page title: {gemini_title}') - # highlight-end - - await context.enqueue_links() - - # Run the crawler with the initial list of URLs. 
- await crawler.run(['https://crawlee.dev/']) - - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py deleted file mode 100644 index cccb62e989..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from stagehand import Stagehand, StagehandPage - -if TYPE_CHECKING: - from types import TracebackType - - -class CrawleeStagehandPage: - """StagehandPage wrapper for Crawlee.""" - - def __init__(self, page: StagehandPage) -> None: - self._page = page - - async def goto( - self, - url: str, - *, - referer: str | None = None, - timeout: int | None = None, - wait_until: str | None = None, - ) -> Any: - """Navigate to the specified URL.""" - # Override goto to return navigation result that `PlaywrightCrawler` expects - return await self._page._page.goto( # noqa: SLF001 - url, - referer=referer, - timeout=timeout, - wait_until=wait_until, - ) - - def __getattr__(self, name: str) -> Any: - """Delegate all other methods to the underlying StagehandPage.""" - return getattr(self._page, name) - - async def __aenter__(self) -> CrawleeStagehandPage: - """Enter the context manager.""" - return self - - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - exc_traceback: TracebackType | None, - ) -> None: - await self._page.close() - - -class CrawleeStagehand(Stagehand): - """Stagehand wrapper for Crawlee to disable the launch of Playwright.""" - - async def init(self) -> None: - # Skip Stagehand's own Playwright initialization - # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle - self._initialized = True diff --git a/docs/guides/playwright_crawler_stagehand.mdx b/docs/guides/playwright_crawler_stagehand.mdx 
deleted file mode 100644 index 59a34b4cd2..0000000000 --- a/docs/guides/playwright_crawler_stagehand.mdx +++ /dev/null @@ -1,66 +0,0 @@ ---- -id: playwright-crawler-stagehand -title: Playwright with Stagehand -description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler. ---- - -import ApiLink from '@site/src/components/ApiLink'; -import CodeBlock from '@theme/CodeBlock'; - -import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py'; -import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py'; -import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py'; - -[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic. - -Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider. - -:::info - -This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions. - -::: - -## Get Gemini API key - -You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key. - -## Create support classes for Stagehand - -To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle. 
- -Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance. - -Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`. - - - {SupportClasses} - - -## Create browser integration classes - -You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`. - -Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances. - -Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand. - - - {BrowserClasses} - - -## Create a crawler - -Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands: - - - {StagehandRun} - - -The integration works through several key components: -- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle -- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances -- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities -- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations - -In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors. 
diff --git a/pyproject.toml index 4a5444ea44..bd91dd98cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,9 @@ sql_postgres = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "asyncpg>=0.24.0" ] +stagehand = [ + "stagehand>=3.19.0", +] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "aiosqlite>=0.21.0", diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py new file mode 100644 index 0000000000..c391471f2c --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -0,0 +1,193 @@ +from __future__ import annotations + +from asyncio import Lock +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Any, cast + +from playwright.async_api import Browser, BrowserContext, Page, ProxySettings +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.browsers._browser_controller import BrowserController +from crawlee.browsers._types import StagehandPage + +if TYPE_CHECKING: + from collections.abc import Mapping + + from stagehand import AsyncSession + + from crawlee.browsers._types import BrowserType + from crawlee.proxy_configuration import ProxyInfo + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserController(BrowserController): + """Controller for managing a Stagehand-controlled browser instance. + + Bridges Crawlee's browser management with Stagehand: provides page creation via + Playwright (connected to Stagehand's browser via CDP) and exposes the Stagehand + session so the crawling context can access AI methods (act/extract/observe). + """ + + AUTOMATION_LIBRARY = 'stagehand' + + def __init__( + self, + browser: Browser, + session: AsyncSession, + *, + max_open_pages_per_browser: int = 20, + ) -> None: + """Initialize a new instance. + + Args: + browser: Playwright browser connected to Stagehand via CDP. 
+ session: Active Stagehand session used for AI operations. + max_open_pages_per_browser: Maximum number of pages open at the same time. + """ + self._browser = browser + self._session = session + self._max_open_pages_per_browser = max_open_pages_per_browser + + self._browser_context: BrowserContext | None = None + self._pages = list[Page]() + self._total_opened_pages = 0 + self._opening_pages_count = 0 + self._last_page_opened_at = datetime.now(timezone.utc) + self._context_creation_lock: Lock | None = None + + @property + @override + def pages(self) -> list[Page]: + return self._pages # type: ignore[return-value] + + @property + @override + def total_opened_pages(self) -> int: + return self._total_opened_pages + + @property + @override + def pages_count(self) -> int: + return len(self._pages) + + @property + @override + def last_page_opened_at(self) -> datetime: + return self._last_page_opened_at + + @property + @override + def idle_time(self) -> timedelta: + return datetime.now(timezone.utc) - self._last_page_opened_at + + @property + @override + def has_free_capacity(self) -> bool: + return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser + + @property + @override + def is_browser_connected(self) -> bool: + return self._browser.is_connected() + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + async def _get_context_creation_lock(self) -> Lock: + if self._context_creation_lock is None: + self._context_creation_lock = Lock() + return self._context_creation_lock + + @override + async def new_page( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> StagehandPage: + """Create a new page in the Stagehand-managed browser. + + Args: + browser_new_context_options: Ignored. Context is managed by Stagehand via CDP. + proxy_info: Proxy configuration applied when creating the shared browser context. 
+ All pages share one context, so proxy is fixed on the first call. + + Returns: + The newly created page. + + Raises: + ValueError: If the browser has reached the maximum number of open pages. + """ + if not self.has_free_capacity: + raise ValueError('Cannot open more pages in this browser.') + + if browser_new_context_options: + logger.warning( + 'browser_new_context_options are ignored by StagehandBrowserController. ' + 'The existing CDP context is reused.' + ) + + self._opening_pages_count += 1 + + try: + async with await self._get_context_creation_lock(): + if self._browser_context is None: + if proxy_info: + self._browser_context = await self._browser.new_context( + proxy=ProxySettings( + server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + username=proxy_info.username, + password=proxy_info.password, + ) + ) + elif self._browser.contexts: + # Reuse the existing CDP context when no proxy is needed. + self._browser_context = self._browser.contexts[0] + else: + self._browser_context = await self._browser.new_context() + elif proxy_info: + logger.warning( + 'proxy_info is ignored for subsequent pages — all pages share the same browser context.' + ) + + raw_page = await self._browser_context.new_page() + page = StagehandPage(raw_page, self._session) + raw_page.on('close', lambda _: self._on_page_close(cast('Page', page))) + + self._pages.append(page) + self._last_page_opened_at = datetime.now(timezone.utc) + self._total_opened_pages += 1 + finally: + self._opening_pages_count -= 1 + + return page + + @override + async def close(self, *, force: bool = False) -> None: + """End the Stagehand session and close the browser connection. + + Args: + force: Whether to force close all open pages before closing. + + Raises: + ValueError: If there are still open pages when closing without force. 
+ """ + if self.pages_count > 0 and not force: + raise ValueError('Cannot close the browser while there are open pages.') + + try: + await self._session.end() + except Exception: + logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) + + if self._browser.is_connected(): + await self._browser.close() + + def _on_page_close(self, page: Page) -> None: + """Handle actions after a page is closed.""" + self._pages.remove(page) diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py new file mode 100644 index 0000000000..65a65dbf76 --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from playwright.async_api import Playwright, async_playwright +from stagehand import AsyncStagehand +from typing_extensions import override + +from crawlee import service_locator +from crawlee._utils.context import ensure_context +from crawlee._utils.docs import docs_group +from crawlee.browsers._browser_plugin import BrowserPlugin +from crawlee.browsers._stagehand_browser_controller import StagehandBrowserController +from crawlee.browsers._types import StagehandOptions + +if TYPE_CHECKING: + from collections.abc import Mapping + from types import TracebackType + + from crawlee.browsers._browser_controller import BrowserController + from crawlee.browsers._types import BrowserType + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserPlugin(BrowserPlugin): + """A plugin for managing Stagehand AI-powered browser automation. + + Stagehand creates and manages the browser instance (local binary or Browserbase cloud). + Playwright then connects to it via CDP, enabling both standard Playwright automation + and AI-powered operations in the same crawling context. 
+ + Only Chromium is supported because Stagehand relies on the Chrome DevTools Protocol. + """ + + AUTOMATION_LIBRARY = 'stagehand' + + def __init__( + self, + *, + stagehand_options: StagehandOptions | None = None, + browser_new_context_options: dict[str, Any] | None = None, + max_open_pages_per_browser: int = 20, + local_ready_timeout_s: float = 30.0, + ) -> None: + """Initialize a new instance. + + Args: + stagehand_options: Stagehand-specific configuration. Defaults to + ``StagehandOptions()`` if not provided. + browser_new_context_options: Options passed to Playwright's + ``browser.new_context`` after connecting via CDP. Refer to the + Playwright documentation for available options: + https://playwright.dev/python/docs/api/class-browser#browser-new-context. + max_open_pages_per_browser: Maximum number of pages that can be open per browser. + local_ready_timeout_s: Seconds to wait for the local Stagehand binary to + become ready. Only relevant when ``env='LOCAL'``. + """ + opts = stagehand_options or StagehandOptions() + config = service_locator.get_configuration() + + self._opts = opts + self._browser_new_context_options = browser_new_context_options or {} + self._max_open_pages_per_browser = max_open_pages_per_browser + + # headless comes from Configuration, same as PlaywrightBrowserPlugin. + # chrome_path is resolved lazily in __aenter__ once Playwright is available. 
+ self._headless = config.headless + self._chrome_path: str | None = config.default_browser_path + + is_local = opts.env == 'LOCAL' + self._stagehand_init_kwargs: dict[str, Any] = { + 'server': 'local' if is_local else 'remote', + 'local_headless': self._headless, + 'local_ready_timeout_s': local_ready_timeout_s, + } + if is_local: + self._stagehand_init_kwargs['model_api_key'] = opts.api_key + else: + self._stagehand_init_kwargs['browserbase_api_key'] = opts.api_key + self._stagehand_init_kwargs['browserbase_project_id'] = opts.project_id + + # AsyncStagehand is created lazily in __aenter__ so that chrome_path + # can be resolved from playwright.chromium.executable_path if not set. + self._stagehand_context_manager: AsyncStagehand | None = None + self._stagehand_client: AsyncStagehand | None = None + + self._playwright_context_manager = async_playwright() + self._playwright: Playwright | None = None + + # Flag to indicate the context state. + self._active = False + + @property + @override + def active(self) -> bool: + return self._active + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + @property + @override + def browser_launch_options(self) -> Mapping[str, Any]: + """Return an empty mapping. + + Browser launch is managed by Stagehand, not Playwright directly. + """ + return {} + + @property + @override + def browser_new_context_options(self) -> Mapping[str, Any]: + """Return the options for the ``browser.new_context`` method. + + These options are passed to Playwright's ``browser.new_context`` after + connecting to the Stagehand-managed browser via CDP. Refer to the Playwright + documentation for available options: + https://playwright.dev/python/docs/api/class-browser#browser-new-context. 
+ """ + return self._browser_new_context_options + + @property + @override + def max_open_pages_per_browser(self) -> int: + return self._max_open_pages_per_browser + + @override + async def __aenter__(self) -> StagehandBrowserPlugin: + if self._active: + raise RuntimeError(f'The {self.__class__.__name__} is already active.') + + self._active = True + self._playwright = await self._playwright_context_manager.__aenter__() + + # Resolve Chromium path from Playwright's own installation when not set + # explicitly via Configuration. The stagehand binary needs an explicit path. + if self._chrome_path is None and self._opts.env == 'LOCAL': + self._chrome_path = self._playwright.chromium.executable_path + self._stagehand_init_kwargs['local_chrome_path'] = self._chrome_path + logger.debug(f'Resolved Chromium path from Playwright: {self._chrome_path}') + + self._stagehand_context_manager = AsyncStagehand(**self._stagehand_init_kwargs) + self._stagehand_client = await self._stagehand_context_manager.__aenter__() + + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + if not self._active: + raise RuntimeError(f'The {self.__class__.__name__} is not active.') + + if self._stagehand_context_manager is not None: + await self._stagehand_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + + await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + + self._stagehand_context_manager = None + self._playwright_context_manager = async_playwright() + self._stagehand_client = None + self._playwright = None + self._active = False + + @override + @ensure_context + async def new_browser(self) -> BrowserController: + if not self._playwright or not self._stagehand_client: + raise RuntimeError(f'{self.__class__.__name__} is not initialized.') + + session = await 
self._stagehand_client.sessions.start(**self._build_session_kwargs()) + + cdp_url = session.data.cdp_url + if not cdp_url: + raise RuntimeError( + f'No cdp_url returned from Stagehand (env={self._opts.env!r}). ' + 'Cannot connect Playwright to the browser.' + ) + + browser = await self._playwright.chromium.connect_over_cdp(cdp_url) + + return StagehandBrowserController( + browser, + session, + max_open_pages_per_browser=self._max_open_pages_per_browser, + ) + + def _build_session_kwargs(self) -> dict[str, Any]: + """Build keyword arguments for ``sessions.start``.""" + opts = self._opts + + if opts.env == 'BROWSERBASE': + browser_param: dict[str, Any] = {'type': 'browserbase'} + else: + launch_options: dict[str, Any] = {'headless': self._headless} + browser_param = { + 'type': 'local', + 'launchOptions': launch_options, + } + + kwargs: dict[str, Any] = { + 'model_name': opts.model, + 'browser': browser_param, + 'verbose': opts.verbose, + 'self_heal': opts.self_heal, + } + + if opts.dom_settle_timeout_ms is not None: + kwargs['dom_settle_timeout_ms'] = opts.dom_settle_timeout_ms + if opts.system_prompt is not None: + kwargs['system_prompt'] = opts.system_prompt + + return kwargs diff --git a/src/crawlee/browsers/_types.py index c5976b086a..083c542268 100644 --- a/src/crawlee/browsers/_types.py +++ b/src/crawlee/browsers/_types.py @@ -1,10 +1,22 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal + +from playwright.async_api import Page if TYPE_CHECKING: - from playwright.async_api import Page + from stagehand import AsyncSession + from stagehand.types.session_act_params import SessionActParams + from stagehand.types.session_act_response import SessionActResponse + from stagehand.types.session_execute_params import SessionExecuteParams + from stagehand.types.session_execute_response 
import SessionExecuteResponse + from stagehand.types.session_extract_params import SessionExtractParams + from stagehand.types.session_extract_response import SessionExtractResponse + from stagehand.types.session_observe_params import SessionObserveParams + from stagehand.types.session_observe_response import SessionObserveResponse + from typing_extensions import Unpack + BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome'] @@ -16,3 +28,80 @@ class CrawleePage: id: str browser_type: BrowserType page: Page + + +@dataclass +class StagehandOptions: + env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL' + api_key: str | None = None + project_id: str | None = None + model: str = 'openai/gpt-4.1-mini' + verbose: Literal[0, 1, 2] = 0 + self_heal: bool = True + dom_settle_timeout_ms: float | None = None + system_prompt: str | None = None + + +class StagehandPage(Page): + """A Playwright `Page` enhanced with Stagehand AI methods. + + Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright + methods transparently while adding `act()`, `extract()`, and `observe()` AI operations + bound to the current page. + """ + + def __init__(self, page: Page, session: AsyncSession) -> None: + self._page = page + self._session = session + + def __getattr__(self, name: str) -> Any: + return getattr(self._page, name) + + async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: + """Perform an action on the page using natural language. + + Args: + **kwargs: Parameters passed to ``AsyncSession.act()``. + The most common is ``instruction`` — a natural language description + of the action to perform, e.g. ``instruction='click the login button'``. + + Returns: + The action result from Stagehand. + """ + return await self._session.act(page=self._page, **kwargs) + + async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: + """Observe the page and get AI-suggested actions. 
+ + Args: + **kwargs: Parameters passed to ``AsyncSession.observe()``. + Optionally pass ``instruction`` to narrow the observation scope. + + Returns: + Observation result with suggested actions. + """ + return await self._session.observe(page=self._page, **kwargs) + + async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: + """Extract structured data from the page using natural language. + + Args: + **kwargs: Parameters passed to ``AsyncSession.extract()``. + Common parameters: ``instruction`` and ``schema`` (JSON Schema dict). + + Returns: + Extracted data matching the requested schema. + """ + return await self._session.extract(page=self._page, **kwargs) + + async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: + """Execute arbitrary code on the page via natural language instructions. + + Args: + **kwargs: Parameters passed to ``AsyncSession.execute()``. + Common parameters: ``instruction`` describing the code to execute. + + Returns: + The result of the executed code. 
+ """ + return await self._session.execute(page=self._page, **kwargs) diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py new file mode 100644 index 0000000000..1199f0cf2b --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/__init__.py @@ -0,0 +1,3 @@ +from crawlee.crawlers._stagehand._stagehand_crawler import StagehandCrawler + +__all__ = ['StagehandCrawler'] diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py new file mode 100644 index 0000000000..b367cc727a --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING, Any + +from crawlee._utils.docs import docs_group +from crawlee.browsers import BrowserPool +from crawlee.browsers._stagehand_browser_plugin import StagehandBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +if TYPE_CHECKING: + from typing_extensions import Unpack + + from crawlee.browsers._types import StagehandOptions + from crawlee.crawlers._basic import BasicCrawlerOptions + from crawlee.statistics import StatisticsState + + +@docs_group('Crawlers') +class StagehandCrawler(PlaywrightCrawler): + """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee. + + Extends `PlaywrightCrawler` with a `StagehandBrowserPlugin` that manages a Stagehand + session per browser instance. 
 Each page in the crawling context is a `StagehandPage`,
+    which exposes AI methods alongside all standard Playwright `Page` methods:
+
+    - `page.act(**kwargs)` — perform actions using natural language
+    - `page.extract(**kwargs)` — extract structured data with AI
+    - `page.observe(**kwargs)` — get AI-suggested actions on the page
+    - `page.execute(**kwargs)` — run an autonomous multi-step agent
+
+    ### Usage
+
+    ```python
+    from crawlee.crawlers import StagehandCrawler
+    from crawlee.crawlers._stagehand import StagehandCrawlingContext
+
+    crawler = StagehandCrawler()
+
+    @crawler.router.default_handler
+    async def handler(context: StagehandCrawlingContext) -> None:
+        await context.page.act(instruction='Click the login button')
+        data = await context.page.extract(instruction='Get the page title')
+        await context.push_data(data)
+
+    await crawler.run(['https://example.com'])
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        stagehand_options: StagehandOptions | None = None,
+        browser_pool: BrowserPool | None = None,
+        browser_new_context_options: dict[str, Any] | None = None,
+        max_open_pages_per_browser: int = 20,
+        **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Args:
+            stagehand_options: Stagehand-specific configuration (model, API key, env, etc.).
+                Ignored if `browser_pool` is provided.
+            browser_pool: A pre-configured `BrowserPool`. All plugins must be instances of
+                `StagehandBrowserPlugin` (or its subclasses). If omitted, a pool is created
+                automatically from `stagehand_options`.
+            browser_new_context_options: Options passed to Playwright's `browser.new_context`
+                after connecting via CDP. Ignored if `browser_pool` is provided.
+            max_open_pages_per_browser: Maximum pages open per browser instance.
+                Ignored if `browser_pool` is provided.
+            kwargs: Additional keyword arguments forwarded to `BasicCrawler`.
+ """ + if browser_pool is not None: + self._validate_browser_pool(browser_pool) + if stagehand_options is not None: + warnings.warn( + '`stagehand_options` is ignored when `browser_pool` is provided.', + stacklevel=2, + ) + else: + browser_pool = BrowserPool( + plugins=[ + StagehandBrowserPlugin( + stagehand_options=stagehand_options, + browser_new_context_options=browser_new_context_options, + max_open_pages_per_browser=max_open_pages_per_browser, + ) + ] + ) + + super().__init__(browser_pool=browser_pool, **kwargs) + + @staticmethod + def _validate_browser_pool(pool: BrowserPool) -> None: + invalid = [p for p in pool.plugins if not isinstance(p, StagehandBrowserPlugin)] + if invalid: + raise ValueError( + f'All BrowserPool plugins must be StagehandBrowserPlugin instances. Invalid plugins: {invalid}' + ) diff --git a/uv.lock b/uv.lock index f7df250dc9..6f53d2c868 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-15T07:01:49.228326682Z" +exclude-newer = "2026-04-18T21:01:24.3365857Z" exclude-newer-span = "PT24H" [[package]] @@ -873,6 +873,9 @@ sql-sqlite = [ { name = "aiosqlite" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] +stragehard = [ + { name = "stagehand" }, +] [package.dev-dependencies] dev = [ @@ -948,13 +951,14 @@ requires-dist = [ { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-mysql'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-postgres'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" }, + { name = "stagehand", marker = "extra == 'stragehard'", specifier = ">=3.19.0" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, { name = "typing-extensions", specifier = ">=4.1.0" }, { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { 
name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "sql-sqlite", "sql-mysql", "redis"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stragehard", "sql-sqlite", "sql-mysql", "redis"] [package.metadata.requires-dev] dev = [ @@ -1148,6 +1152,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "docspec" version = "2.2.1" @@ -3704,6 +3717,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3787,6 +3809,26 @@ asyncio = [ { name = "greenlet" }, ] +[[package]] +name = "stagehand" +version = "3.19.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/f8/ccd2bb2758a4eaf0af3846e097ff206e0aa76c8d3b5aa2bded77fb47825e/stagehand-3.19.5.tar.gz", hash = "sha256:3cb8279ac82051e584b34d26e87dc764f0ccad766a01625198ca578eb35f0b6c", size = 281033, upload-time = "2026-04-03T20:21:09.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/6f/a47bad258bfafc193ebb8e0e8c440e8028c9ab28b54a333b46aa3c0cff53/stagehand-3.19.5-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:14f39a4f8d30d77c089166185c705f66aade25432b903a663a937b3747439c26", size = 34495874, upload-time = "2026-04-03T20:21:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/72/f7/e39868903121f1a80ae6eda088383362cd2d3a578c04493a2f83c1aac1da/stagehand-3.19.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80ed0d732cb9c3e952ad851e071dad5775a9ea88d2787c006289d61097fd2609", size = 33193535, upload-time = "2026-04-03T20:21:18.536Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/0b/35cb92bb53e9539c0147892dbd0a227b43bf0d8adcd0a8e867dc5f2bf7fd/stagehand-3.19.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:aa947a5f6241f5953ac238cd9b0ab72e0cb87f559f97e5ee875f83dbc0c351d1", size = 37273148, upload-time = "2026-04-03T20:21:11.939Z" }, + { url = "https://files.pythonhosted.org/packages/7c/c7/dccf63cba1941b5710dc9968218e2883a937cf6534d644bb0c5222d3f40a/stagehand-3.19.5-py3-none-win_amd64.whl", hash = "sha256:e37bf630b99b4a9b7d95f151c56b296940db88b3049b68f0abb56f9e31cc6095", size = 30758357, upload-time = "2026-04-03T20:21:15.121Z" }, +] + [[package]] name = "text-unidecode" version = "1.3" From 85f18dea19d4134ae7e8a41365f2df80caed924d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 27 Apr 2026 19:16:30 +0000 Subject: [PATCH 02/10] update typing for stagehand --- src/crawlee/browsers/__init__.py | 4 +- src/crawlee/crawlers/__init__.py | 18 ++ .../_playwright/_playwright_crawler.py | 211 +++++++++++------- src/crawlee/crawlers/_stagehand/__init__.py | 12 +- .../crawlers/_stagehand/_stagehand_crawler.py | 34 ++- .../_stagehand/_stagehand_crawling_context.py | 29 +++ 6 files changed, 217 insertions(+), 91 deletions(-) create mode 100644 src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index 0ea1101df4..a71d6bc909 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -1,7 +1,7 @@ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import -from ._types import BrowserType, CrawleePage +from ._types import BrowserType, CrawleePage, StagehandOptions, StagehandPage _install_import_hook(__name__) @@ -25,4 +25,6 @@ 'PlaywrightBrowserController', 'PlaywrightBrowserPlugin', 'PlaywrightPersistentBrowser', + 'StagehandOptions', + 'StagehandPage', ] diff --git a/src/crawlee/crawlers/__init__.py 
b/src/crawlee/crawlers/__init__.py index 6445ad5d64..ac97581bb0 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -51,6 +51,20 @@ RenderingTypePredictor, ) +with _try_import( + __name__, + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', +): + from ._stagehand import ( + StagehandCrawler, + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, + ) + __all__ = [ 'AbstractHttpCrawler', @@ -81,4 +95,8 @@ 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor', + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', ] diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 0d6951d86d..0d8e052b5d 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -5,7 +5,7 @@ import warnings from datetime import timedelta from functools import partial -from typing import TYPE_CHECKING, Any, Generic, Literal +from typing import TYPE_CHECKING, Any, Generic, Literal, overload import playwright.async_api from more_itertools import partition @@ -35,14 +35,11 @@ from ._types import GotoOptions from ._utils import block_requests, infinite_scroll -TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) -TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) - if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping from pathlib import Path - from playwright.async_api import Page, Route + from playwright.async_api import Page, Response, Route from playwright.async_api import Request as PlaywrightRequest from typing_extensions import Unpack @@ -57,8 +54,21 @@ from crawlee.browsers._types import 
BrowserType +TPreNavContext = TypeVar( + 'TPreNavContext', bound=PlaywrightPreNavCrawlingContext, default=PlaywrightPreNavCrawlingContext +) +TPostNavContext = TypeVar( + 'TPostNavContext', bound=PlaywrightPostNavCrawlingContext, default=PlaywrightPostNavCrawlingContext +) +TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext, default=PlaywrightCrawlingContext) +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) + + @docs_group('Crawlers') -class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]): +class PlaywrightCrawler( + BasicCrawler[TCrawlingContext, StatisticsState], + Generic[TPreNavContext, TPostNavContext, TCrawlingContext], +): """A web crawler that leverages the `Playwright` browser automation library. The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. @@ -98,6 +108,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: ``` """ + # Define the context classes used in context factory. + _PRE_NAV_CONTEXT_CLASS = PlaywrightPreNavCrawlingContext + _POST_NAV_CONTEXT_CLASS = PlaywrightPostNavCrawlingContext + _CRAWLING_CONTEXT_CLASS = PlaywrightCrawlingContext + def __init__( self, *, @@ -111,7 +126,7 @@ def __init__( headless: bool | None = None, use_incognito_pages: bool | None = None, navigation_timeout: timedelta | None = None, - **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]], + **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], ) -> None: """Initialize a new instance. 
@@ -203,8 +218,8 @@ def __init__( ) kwargs['_additional_context_managers'] = [self._browser_pool] kwargs.setdefault('_logger', logging.getLogger(__name__)) - self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] - self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = [] + self._pre_navigation_hooks: list[Callable[[TPreNavContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[TPostNavContext], Awaitable[None]]] = [] kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client'] @@ -229,31 +244,112 @@ async def _manage_shared_navigation_timeout( finally: self._shared_navigation_timeouts.pop(request_id, None) + @overload + def _build_context( + self, + context: BasicCrawlingContext, + *, + page: Page, + response: None = None, + ) -> TPreNavContext: ... + + @overload + def _build_context( + self, + context: TPreNavContext, + *, + response: Response, + ) -> TPostNavContext: ... + + @overload + def _build_context( + self, + context: TPostNavContext, + ) -> TCrawlingContext: ... 
+ + def _build_context( + self, + context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext, + *, + page: Page | None = None, + response: Response | None = None, + ) -> PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext | PlaywrightCrawlingContext: + """Build the crawling context by adding Playwright-specific properties.""" + match context: + case self._POST_NAV_CONTEXT_CLASS(): + extract_links = self._create_extract_links_function(context) + return self._CRAWLING_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + page=context.page, + goto_options=context.goto_options, + response=context.response, + infinite_scroll=lambda: infinite_scroll(context.page), + extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), + block_requests=context.block_requests, + ) + case self._PRE_NAV_CONTEXT_CLASS(): + if response is None: + raise ValueError('Response must be provided for post-navigation context.') + + return self._POST_NAV_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + page=context.page, + block_requests=context.block_requests, + goto_options=context.goto_options, + response=response, + ) + case BasicCrawlingContext(): + if page is None: + raise ValueError('Page must be provided for pre-navigation context.') + + return 
self._PRE_NAV_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + page=page, + block_requests=partial(block_requests, page=page), + goto_options=GotoOptions(**self._goto_options), + ) + case _: + raise ValueError('Invalid context type') + async def _open_page( self, context: BasicCrawlingContext, - ) -> AsyncGenerator[PlaywrightPreNavCrawlingContext, None]: + ) -> AsyncGenerator[TPreNavContext, None]: if self._browser_pool is None: raise ValueError('Browser pool is not initialized.') # Create a new browser page crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) - pre_navigation_context = PlaywrightPreNavCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=crawlee_page.page, - block_requests=partial(block_requests, page=crawlee_page.page), - goto_options=GotoOptions(**self._goto_options), - ) + pre_navigation_context = self._build_context(context, page=crawlee_page.page) request_id = id(pre_navigation_context.request) @@ -290,8 +386,8 @@ async def route_handler(route: Route, _: PlaywrightRequest) -> None: async def _navigate( self, - context: PlaywrightPreNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]: + context: TPreNavContext, + ) -> AsyncGenerator[TPostNavContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the 
`Playwright` library. Args: @@ -352,24 +448,9 @@ async def _navigate( # Set the loaded URL to the actual URL after redirection. context.request.loaded_url = context.page.url - yield PlaywrightPostNavCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=context.page, - block_requests=context.block_requests, - goto_options=context.goto_options, - response=response, - ) + yield self._build_context(context, response=response) - def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: + def _create_extract_links_function(self, context: TPostNavContext) -> ExtractLinksFunction: """Create a callback function for extracting links from context. Args: @@ -451,9 +532,7 @@ async def extract_links( return extract_links - async def _handle_status_code_response( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + async def _handle_status_code_response(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]: """Validate the HTTP status code and raise appropriate exceptions if needed. Args: @@ -475,8 +554,8 @@ async def _handle_status_code_response( async def _handle_blocked_request_by_content( self, - context: PlaywrightPostNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + context: TPostNavContext, + ) -> AsyncGenerator[TPostNavContext, None]: """Try to detect if the request is blocked based on the response content. 
Args: @@ -502,9 +581,7 @@ async def _handle_blocked_request_by_content( yield context - async def _execute_post_navigation_hooks( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + async def _execute_post_navigation_hooks(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]: request_id = id(context.request) for hook in self._post_navigation_hooks: @@ -513,36 +590,14 @@ async def _execute_post_navigation_hooks( yield context - async def _create_crawling_context( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: - extract_links = self._create_extract_links_function(context) - - yield PlaywrightCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=context.page, - goto_options=context.goto_options, - response=context.response, - infinite_scroll=lambda: infinite_scroll(context.page), - extract_links=extract_links, - enqueue_links=self._create_enqueue_links_function(context, extract_links), - block_requests=partial(block_requests, page=context.page), - ) + async def _create_crawling_context(self, context: TPostNavContext) -> AsyncGenerator[TCrawlingContext, None]: + yield self._build_context(context) if context.session: pw_cookies = await self._get_cookies(context.page) context.session.cookies.set_cookies_from_playwright_format(pw_cookies) - def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: + def pre_navigation_hook(self, hook: Callable[[TPreNavContext], Awaitable[None]]) -> None: """Register a hook to be called before each navigation. 
Args: @@ -550,7 +605,7 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], """ self._pre_navigation_hooks.append(hook) - def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + def post_navigation_hook(self, hook: Callable[[TPostNavContext], Awaitable[None]]) -> None: """Register a hook to be called after each navigation. Args: diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py index 1199f0cf2b..684851349b 100644 --- a/src/crawlee/crawlers/_stagehand/__init__.py +++ b/src/crawlee/crawlers/_stagehand/__init__.py @@ -1,3 +1,13 @@ from crawlee.crawlers._stagehand._stagehand_crawler import StagehandCrawler +from crawlee.crawlers._stagehand._stagehand_crawling_context import ( + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, +) -__all__ = ['StagehandCrawler'] +__all__ = [ + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', +] diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index b367cc727a..a44e70195c 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -1,12 +1,17 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING, Any from crawlee._utils.docs import docs_group from crawlee.browsers import BrowserPool from crawlee.browsers._stagehand_browser_plugin import StagehandBrowserPlugin -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.crawlers import PlaywrightCrawler + +from ._stagehand_crawling_context import ( + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, +) if TYPE_CHECKING: from typing_extensions import Unpack @@ -17,7 +22,13 @@ 
 @docs_group('Crawlers')
-class StagehandCrawler(PlaywrightCrawler):
+class StagehandCrawler(
+    PlaywrightCrawler[
+        StagehandPreNavCrawlingContext,
+        StagehandPostNavCrawlingContext,
+        StagehandCrawlingContext,
+    ]
+):
     """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee.
 
     Extends `PlaywrightCrawler` with a `StagehandBrowserPlugin` that manages a Stagehand
@@ -47,6 +58,10 @@ async def handler(context: StagehandCrawlingContext) -> None:
         ```
     """
 
+    _PRE_NAV_CONTEXT_CLASS = StagehandPreNavCrawlingContext
+    _POST_NAV_CONTEXT_CLASS = StagehandPostNavCrawlingContext
+    _CRAWLING_CONTEXT_CLASS = StagehandCrawlingContext
+
     def __init__(
         self,
         *,
@@ -54,29 +69,26 @@ def __init__(
         browser_pool: BrowserPool | None = None,
         browser_new_context_options: dict[str, Any] | None = None,
         max_open_pages_per_browser: int = 20,
-        **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
+        **kwargs: Unpack[BasicCrawlerOptions[StagehandCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
 
         Args:
             stagehand_options: Stagehand-specific configuration (model, API key, env, etc.).
-                Ignored if `browser_pool` is provided.
+                Cannot be specified if `browser_pool` is provided.
             browser_pool: A pre-configured `BrowserPool`. All plugins must be instances
                 of `StagehandBrowserPlugin` (or its subclasses). If omitted, a pool is
                 created automatically from `stagehand_options`.
             browser_new_context_options: Options passed to Playwright's `browser.new_context`
-                after connecting via CDP. Ignored if `browser_pool` is provided.
+                after connecting via CDP. Has no effect if `browser_pool` is provided.
             max_open_pages_per_browser: Maximum pages open per browser instance.
-                Ignored if `browser_pool` is provided.
+                Has no effect if `browser_pool` is provided.
             kwargs: Additional keyword arguments forwarded to `BasicCrawler`.
""" if browser_pool is not None: self._validate_browser_pool(browser_pool) if stagehand_options is not None: - warnings.warn( - '`stagehand_options` is ignored when `browser_pool` is provided.', - stacklevel=2, - ) + raise ValueError('Cannot specify both browser_pool and stagehand_options') else: browser_pool = BrowserPool( plugins=[ diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py new file mode 100644 index 0000000000..645fdbd357 --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass + +from crawlee._utils.docs import docs_group +from crawlee.browsers import StagehandPage +from crawlee.crawlers import ( + PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandPreNavCrawlingContext(PlaywrightPreNavCrawlingContext): + """The pre navigation crawling context used by the `StagehandCrawler`.""" + + page: StagehandPage + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandPostNavCrawlingContext(PlaywrightPostNavCrawlingContext, StagehandPreNavCrawlingContext): + """The post navigation crawling context used by the `StagehandCrawler`.""" + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandCrawlingContext(PlaywrightCrawlingContext, StagehandPostNavCrawlingContext): + """The crawling context used by the `StagehandCrawler`.""" From a47b836cc69c7e4f65e60f04bf730045244e2f49 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 27 Apr 2026 23:48:14 +0000 Subject: [PATCH 03/10] update plugin --- pyproject.toml | 4 +- .../browsers/_stagehand_browser_controller.py | 150 ++++++++++------- .../browsers/_stagehand_browser_plugin.py | 157 +++++++----------- src/crawlee/browsers/_types.py | 16 +- uv.lock | 11 +- 5 files changed, 160 
insertions(+), 178 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bd91dd98cd..22447e49e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ ] [project.optional-dependencies] -all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"] +all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,stagehand,redis]"] adaptive-crawler = [ "jaro-winkler>=2.0.3", "playwright>=1.27.0", @@ -77,7 +77,7 @@ sql_postgres = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "asyncpg>=0.24.0" ] -stragehard = [ +stagehand = [ "stagehand>=3.19.0", ] sql_sqlite = [ diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index c391471f2c..ed1b21f61b 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -3,9 +3,8 @@ from asyncio import Lock from datetime import datetime, timedelta, timezone from logging import getLogger -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any -from playwright.async_api import Browser, BrowserContext, Page, ProxySettings from typing_extensions import override from crawlee._utils.docs import docs_group @@ -15,11 +14,13 @@ if TYPE_CHECKING: from collections.abc import Mapping - from stagehand import AsyncSession + from playwright.async_api import Browser, BrowserContext, Playwright + from stagehand import AsyncSession, AsyncStagehand - from crawlee.browsers._types import BrowserType + from crawlee.browsers._types import BrowserType, StagehandOptions from crawlee.proxy_configuration import ProxyInfo + logger = getLogger(__name__) @@ -27,42 +28,51 @@ class StagehandBrowserController(BrowserController): """Controller for managing a Stagehand-controlled browser instance. 
- Bridges Crawlee's browser management with Stagehand: provides page creation via - Playwright (connected to Stagehand's browser via CDP) and exposes the Stagehand - session so the crawling context can access AI methods (act/extract/observe). + Bridges Crawlee's browser management with Stagehand: lazily creates a Stagehand + session on the first page request (injecting proxy at that point), then connects + Playwright to it via CDP. All pages share a single browser context per controller. """ AUTOMATION_LIBRARY = 'stagehand' def __init__( self, - browser: Browser, - session: AsyncSession, *, + playwright: Playwright, + stagehand_client: AsyncStagehand, + stagehand_options: StagehandOptions, + base_launch_options: dict[str, Any], max_open_pages_per_browser: int = 20, ) -> None: """Initialize a new instance. Args: - browser: Playwright browser connected to Stagehand via CDP. - session: Active Stagehand session used for AI operations. + playwright: Active Playwright instance used to connect to the browser via CDP. + stagehand_client: Active Stagehand client used to start sessions. + stagehand_options: Stagehand-specific configuration. + base_launch_options: Browser launch options (without proxy) built by the plugin. max_open_pages_per_browser: Maximum number of pages open at the same time. 
""" - self._browser = browser - self._session = session + self._playwright = playwright + self._stagehand_client = stagehand_client + self._stagehand_options = stagehand_options + self._base_launch_options = base_launch_options self._max_open_pages_per_browser = max_open_pages_per_browser + self._session: AsyncSession | None = None + self._browser: Browser | None = None self._browser_context: BrowserContext | None = None - self._pages = list[Page]() + self._session_init_lock = Lock() + + self._pages = list[StagehandPage]() self._total_opened_pages = 0 self._opening_pages_count = 0 self._last_page_opened_at = datetime.now(timezone.utc) - self._context_creation_lock: Lock | None = None @property @override - def pages(self) -> list[Page]: - return self._pages # type: ignore[return-value] + def pages(self) -> list[StagehandPage]: + return self._pages @property @override @@ -92,7 +102,8 @@ def has_free_capacity(self) -> bool: @property @override def is_browser_connected(self) -> bool: - return self._browser.is_connected() + # Session not yet started — controller is available for new pages. + return self._browser is None or self._browser.is_connected() @property @override @@ -110,19 +121,6 @@ async def new_page( browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None, ) -> StagehandPage: - """Create a new page in the Stagehand-managed browser. - - Args: - browser_new_context_options: Ignored. Context is managed by Stagehand via CDP. - proxy_info: Proxy configuration applied when creating the shared browser context. - All pages share one context, so proxy is fixed on the first call. - - Returns: - The newly created page. - - Raises: - ValueError: If the browser has reached the maximum number of open pages. 
- """ if not self.has_free_capacity: raise ValueError('Cannot open more pages in this browser.') @@ -133,31 +131,15 @@ async def new_page( ) self._opening_pages_count += 1 - try: - async with await self._get_context_creation_lock(): - if self._browser_context is None: - if proxy_info: - self._browser_context = await self._browser.new_context( - proxy=ProxySettings( - server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', - username=proxy_info.username, - password=proxy_info.password, - ) - ) - elif self._browser.contexts: - # Reuse the existing CDP context when no proxy is needed. - self._browser_context = self._browser.contexts[0] - else: - self._browser_context = await self._browser.new_context() - elif proxy_info: - logger.warning( - 'proxy_info is ignored for subsequent pages — all pages share the same browser context.' - ) + await self._ensure_session(proxy_info) + + if self._browser is None or self._session is None or self._browser_context is None: + raise RuntimeError('Failed to initialize the browser session.') raw_page = await self._browser_context.new_page() page = StagehandPage(raw_page, self._session) - raw_page.on('close', lambda _: self._on_page_close(cast('Page', page))) + raw_page.on('close', lambda _: self._on_page_close(page)) self._pages.append(page) self._last_page_opened_at = datetime.now(timezone.utc) @@ -169,25 +151,67 @@ async def new_page( @override async def close(self, *, force: bool = False) -> None: - """End the Stagehand session and close the browser connection. - - Args: - force: Whether to force close all open pages before closing. - - Raises: - ValueError: If there are still open pages when closing without force. 
- """ if self.pages_count > 0 and not force: raise ValueError('Cannot close the browser while there are open pages.') + if self._session is None: + return + try: await self._session.end() except Exception: logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) - if self._browser.is_connected(): + if self._browser is not None and self._browser.is_connected(): await self._browser.close() - def _on_page_close(self, page: Page) -> None: - """Handle actions after a page is closed.""" + def _on_page_close(self, page: StagehandPage) -> None: self._pages.remove(page) + + async def _ensure_session(self, proxy_info: ProxyInfo | None = None) -> None: + if self._session is not None: + return + async with self._session_init_lock: + if self._session is not None: + return + + opts = self._stagehand_options + start_kwargs: dict[str, Any] = { + 'model_name': opts.model, + 'verbose': opts.verbose, + 'self_heal': opts.self_heal, + } + if opts.dom_settle_timeout_ms is not None: + start_kwargs['dom_settle_timeout_ms'] = opts.dom_settle_timeout_ms + if opts.system_prompt is not None: + start_kwargs['system_prompt'] = opts.system_prompt + + if opts.env == 'LOCAL': + launch_options: dict[str, Any] = dict(self._base_launch_options) + if proxy_info: + launch_options['proxy'] = { + 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + start_kwargs['browser'] = {'type': 'local', 'launch_options': launch_options} + elif proxy_info: + logger.warning( + 'Proxy support in BROWSERBASE mode requires configuring proxies via ' + 'browserbase_session_create_params. proxy_info will be ignored.' + ) + + session: AsyncSession = await self._stagehand_client.sessions.start(**start_kwargs) + + cdp_url = session.data.cdp_url + if not cdp_url: + raise RuntimeError( + f'No cdp_url returned from Stagehand (env={self._stagehand_options.env!r}). 
' + 'Cannot connect Playwright to the browser.' + ) + + self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url) + self._browser_context = ( + self._browser.contexts[0] if self._browser.contexts else await self._browser.new_context() + ) + self._session = session diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py index 65a65dbf76..8d2faf77e9 100644 --- a/src/crawlee/browsers/_stagehand_browser_plugin.py +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -10,16 +10,19 @@ from crawlee import service_locator from crawlee._utils.context import ensure_context from crawlee._utils.docs import docs_group -from crawlee.browsers._browser_plugin import BrowserPlugin -from crawlee.browsers._stagehand_browser_controller import StagehandBrowserController -from crawlee.browsers._types import StagehandOptions + +from ._browser_plugin import BrowserPlugin +from ._stagehand_browser_controller import StagehandBrowserController +from ._types import StagehandOptions if TYPE_CHECKING: from collections.abc import Mapping + from pathlib import Path from types import TracebackType - from crawlee.browsers._browser_controller import BrowserController - from crawlee.browsers._types import BrowserType + from ._browser_controller import BrowserController + from ._types import BrowserType + logger = getLogger(__name__) @@ -40,57 +43,59 @@ class StagehandBrowserPlugin(BrowserPlugin): def __init__( self, *, + user_data_dir: str | Path | None = None, stagehand_options: StagehandOptions | None = None, + browser_launch_options: dict[str, Any] | None = None, browser_new_context_options: dict[str, Any] | None = None, max_open_pages_per_browser: int = 20, - local_ready_timeout_s: float = 30.0, ) -> None: """Initialize a new instance. Args: - stagehand_options: Stagehand-specific configuration. Defaults to - ``StagehandOptions()`` if not provided. 
- browser_new_context_options: Options passed to Playwright's - ``browser.new_context`` after connecting via CDP. Refer to the - Playwright documentation for available options: - https://playwright.dev/python/docs/api/class-browser#browser-new-context. + user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local + storage. + stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). + browser_launch_options: Keyword arguments for browser launch. Supported options are + a subset of Playwright's ``browser_type.launch`` options that map to Stagehand's + ``BrowserLaunchOptions``. Unsupported keys are logged as warnings and ignored. + browser_new_context_options: Keyword arguments for browser context creation. + Options that map to Stagehand's ``BrowserLaunchOptions`` are merged with + ``browser_launch_options``. Unsupported keys are logged as warnings and ignored. max_open_pages_per_browser: Maximum number of pages that can be open per browser. - local_ready_timeout_s: Seconds to wait for the local Stagehand binary to - become ready. Only relevant when ``env='LOCAL'``. """ - opts = stagehand_options or StagehandOptions() config = service_locator.get_configuration() - self._opts = opts - self._browser_new_context_options = browser_new_context_options or {} self._max_open_pages_per_browser = max_open_pages_per_browser - # headless comes from Configuration, same as PlaywrightBrowserPlugin. - # chrome_path is resolved lazily in __aenter__ once Playwright is available. 
- self._headless = config.headless - self._chrome_path: str | None = config.default_browser_path + self.stagehand_options = stagehand_options or StagehandOptions() + self._browser_new_context_options = browser_new_context_options or {} + + is_local = self.stagehand_options.env == 'LOCAL' + + self._base_launch_options: dict[str, Any] = { + 'headless': config.headless, + 'chromium_sandbox': not config.disable_browser_sandbox, + } + if config.default_browser_path: + self._base_launch_options['executable_path'] = config.default_browser_path + + self._base_launch_options = {**self._base_launch_options, **(browser_launch_options or {})} - is_local = opts.env == 'LOCAL' self._stagehand_init_kwargs: dict[str, Any] = { 'server': 'local' if is_local else 'remote', - 'local_headless': self._headless, - 'local_ready_timeout_s': local_ready_timeout_s, + 'local_headless': self._base_launch_options.get('headless', config.headless), + 'local_ready_timeout_s': self.stagehand_options.local_ready_timeout_s, + 'user_data_dir': str(user_data_dir) if user_data_dir else None, } if is_local: - self._stagehand_init_kwargs['model_api_key'] = opts.api_key + self._stagehand_init_kwargs['model_api_key'] = self.stagehand_options.api_key else: - self._stagehand_init_kwargs['browserbase_api_key'] = opts.api_key - self._stagehand_init_kwargs['browserbase_project_id'] = opts.project_id + self._stagehand_init_kwargs['browserbase_api_key'] = self.stagehand_options.api_key + self._stagehand_init_kwargs['browserbase_project_id'] = self.stagehand_options.project_id - # AsyncStagehand is created lazily in __aenter__ so that chrome_path - # can be resolved from playwright.chromium.executable_path if not set. - self._stagehand_context_manager: AsyncStagehand | None = None self._stagehand_client: AsyncStagehand | None = None - self._playwright_context_manager = async_playwright() self._playwright: Playwright | None = None - - # Flag to indicate the context state. 
self._active = False @property @@ -106,23 +111,12 @@ def browser_type(self) -> BrowserType: @property @override def browser_launch_options(self) -> Mapping[str, Any]: - """Return an empty mapping. - - Browser launch is managed by Stagehand, not Playwright directly. - """ - return {} + return self._base_launch_options @property @override def browser_new_context_options(self) -> Mapping[str, Any]: - """Return the options for the ``browser.new_context`` method. - - These options are passed to Playwright's ``browser.new_context`` after - connecting to the Stagehand-managed browser via CDP. Refer to the Playwright - documentation for available options: - https://playwright.dev/python/docs/api/class-browser#browser-new-context. - """ - return self._browser_new_context_options + return {} @property @override @@ -137,15 +131,18 @@ async def __aenter__(self) -> StagehandBrowserPlugin: self._active = True self._playwright = await self._playwright_context_manager.__aenter__() - # Resolve Chromium path from Playwright's own installation when not set - # explicitly via Configuration. The stagehand binary needs an explicit path. - if self._chrome_path is None and self._opts.env == 'LOCAL': - self._chrome_path = self._playwright.chromium.executable_path - self._stagehand_init_kwargs['local_chrome_path'] = self._chrome_path - logger.debug(f'Resolved Chromium path from Playwright: {self._chrome_path}') + # Resolve Chromium path for LOCAL mode. 
+ if self.stagehand_options.env == 'LOCAL': + if 'executable_path' not in self._base_launch_options: + chrome_path = self._playwright.chromium.executable_path + self._base_launch_options['executable_path'] = chrome_path + logger.debug(f'Resolved Chromium path from Playwright: {chrome_path}') - self._stagehand_context_manager = AsyncStagehand(**self._stagehand_init_kwargs) - self._stagehand_client = await self._stagehand_context_manager.__aenter__() + self._stagehand_init_kwargs['local_chrome_path'] = self._base_launch_options['executable_path'] + + client = AsyncStagehand(**self._stagehand_init_kwargs) + await client.__aenter__() + self._stagehand_client = client return self @@ -159,14 +156,12 @@ async def __aexit__( if not self._active: raise RuntimeError(f'The {self.__class__.__name__} is not active.') - if self._stagehand_context_manager is not None: - await self._stagehand_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + if self._stagehand_client is not None: + await self._stagehand_client.__aexit__(exc_type, exc_value, exc_traceback) + self._stagehand_client = None await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback) - - self._stagehand_context_manager = None self._playwright_context_manager = async_playwright() - self._stagehand_client = None self._playwright = None self._active = False @@ -176,46 +171,10 @@ async def new_browser(self) -> BrowserController: if not self._playwright or not self._stagehand_client: raise RuntimeError(f'{self.__class__.__name__} is not initialized.') - session = await self._stagehand_client.sessions.start(**self._build_session_kwargs()) - - cdp_url = session.data.cdp_url - if not cdp_url: - raise RuntimeError( - f'No cdp_url returned from Stagehand (env={self._opts.env!r}). ' - 'Cannot connect Playwright to the browser.' 
- ) - - browser = await self._playwright.chromium.connect_over_cdp(cdp_url) - return StagehandBrowserController( - browser, - session, + playwright=self._playwright, + stagehand_client=self._stagehand_client, + stagehand_options=self.stagehand_options, + base_launch_options=self._base_launch_options, max_open_pages_per_browser=self._max_open_pages_per_browser, ) - - def _build_session_kwargs(self) -> dict[str, Any]: - """Build keyword arguments for ``sessions.start``.""" - opts = self._opts - - if opts.env == 'BROWSERBASE': - browser_param: dict[str, Any] = {'type': 'browserbase'} - else: - launch_options: dict[str, Any] = {'headless': self._headless} - browser_param = { - 'type': 'local', - 'launchOptions': launch_options, - } # , 'local_chrome_path': self._chrome_path} - - kwargs: dict[str, Any] = { - 'model_name': opts.model, - 'browser': browser_param, - 'verbose': opts.verbose, - 'self_heal': opts.self_heal, - } - - if opts.dom_settle_timeout_ms is not None: - kwargs['dom_settle_timeout_ms'] = opts.dom_settle_timeout_ms - if opts.system_prompt is not None: - kwargs['system_prompt'] = opts.system_prompt - - return kwargs diff --git a/src/crawlee/browsers/_types.py b/src/crawlee/browsers/_types.py index 083c542268..7f0268e0f8 100644 --- a/src/crawlee/browsers/_types.py +++ b/src/crawlee/browsers/_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal from playwright.async_api import Page @@ -40,6 +40,7 @@ class StagehandOptions: self_heal: bool = True dom_settle_timeout_ms: float | None = None system_prompt: str | None = None + local_ready_timeout_s: float = 15.0 class StagehandPage(Page): @@ -51,12 +52,9 @@ class StagehandPage(Page): """ def __init__(self, page: Page, session: AsyncSession) -> None: - self._page = page + super().__init__(page._impl_obj) # noqa: SLF001 self._session = session - def __getattr__(self, name: str) -> Any: 
- return getattr(self._page, name) - async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: """Perform an action on the page using natural language. @@ -68,7 +66,7 @@ async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: Returns: The action result from Stagehand. """ - return await self._session.act(page=self._page, **kwargs) + return await self._session.act(page=self, **kwargs) async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: """Observe the page and get AI-suggested actions. @@ -80,7 +78,7 @@ async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserv Returns: Observation result with suggested actions. """ - return await self._session.observe(page=self._page, **kwargs) + return await self._session.observe(page=self, **kwargs) async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: """Extract structured data from the page using natural language. @@ -92,7 +90,7 @@ async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtrac Returns: Extracted data matching the requested schema. """ - return await self._session.extract(page=self._page, **kwargs) + return await self._session.extract(page=self, **kwargs) async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: """Execute arbitrary code on the page via natural language instructions. @@ -104,4 +102,4 @@ async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecut Returns: The result of the executed code. 
""" - return await self._session.execute(page=self._page, **kwargs) + return await self._session.execute(page=self, **kwargs) diff --git a/uv.lock b/uv.lock index 6f53d2c868..bbbfa1d2aa 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-18T21:01:24.3365857Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "PT24H" [[package]] @@ -819,6 +819,7 @@ all = [ { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "sqlalchemy", extra = ["asyncio"] }, + { name = "stagehand" }, { name = "typer" }, { name = "wrapt" }, ] @@ -873,7 +874,7 @@ sql-sqlite = [ { name = "aiosqlite" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] -stragehard = [ +stagehand = [ { name = "stagehand" }, ] @@ -922,7 +923,7 @@ requires-dist = [ { name = "cachetools", specifier = ">=5.5.0" }, { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, - { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "redis"], marker = "extra == 'all'" }, + { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "redis", "stagehand"], marker = "extra == 'all'" }, { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, @@ -951,14 +952,14 @@ 
requires-dist = [ { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-mysql'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-postgres'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" }, - { name = "stagehand", marker = "extra == 'stragehard'", specifier = ">=3.19.0" }, + { name = "stagehand", marker = "extra == 'stagehand'", specifier = ">=3.19.0" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, { name = "typing-extensions", specifier = ">=4.1.0" }, { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stragehard", "sql-sqlite", "sql-mysql", "redis"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stagehand", "sql-sqlite", "sql-mysql", "redis"] [package.metadata.requires-dev] dev = [ From 62b0c66c155d590892bfad716b2fc496d829c384 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 28 Apr 2026 23:21:21 +0000 Subject: [PATCH 04/10] synchronize params between modules --- src/crawlee/browsers/__init__.py | 7 ++ .../browsers/_stagehand_browser_controller.py | 101 ++++++++------- .../browsers/_stagehand_browser_plugin.py | 90 ++++++++------ src/crawlee/crawlers/_stagehand/__init__.py | 24 +++- .../crawlers/_stagehand/_stagehand_crawler.py | 115 ++++++++++++++---- 5 files changed, 234 insertions(+), 103 deletions(-) diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index a71d6bc909..6bfcf3ea53 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -17,6 +17,11 @@ 
with _try_import(__name__, 'PlaywrightPersistentBrowser'): from ._playwright_browser import PlaywrightPersistentBrowser +with _try_import(__name__, 'StagehandBrowserController'): + from ._stagehand_browser_controller import StagehandBrowserController +with _try_import(__name__, 'StagehandBrowserPlugin'): + from ._stagehand_browser_plugin import StagehandBrowserPlugin + __all__ = [ 'BrowserPool', @@ -25,6 +30,8 @@ 'PlaywrightBrowserController', 'PlaywrightBrowserPlugin', 'PlaywrightPersistentBrowser', + 'StagehandBrowserController', + 'StagehandBrowserPlugin', 'StagehandOptions', 'StagehandPage', ] diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index ed1b21f61b..01c1c07ea6 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -28,9 +28,10 @@ class StagehandBrowserController(BrowserController): """Controller for managing a Stagehand-controlled browser instance. - Bridges Crawlee's browser management with Stagehand: lazily creates a Stagehand - session on the first page request (injecting proxy at that point), then connects - Playwright to it via CDP. All pages share a single browser context per controller. + It creates and connects to the browser lazily on the first ``new_page`` call: Stagehand + starts a session, and Playwright then connects to it via CDP. All pages share a single + browser context, as Stagehand creates the browser and its context together during session + initialisation. """ AUTOMATION_LIBRARY = 'stagehand' @@ -41,22 +42,20 @@ def __init__( playwright: Playwright, stagehand_client: AsyncStagehand, stagehand_options: StagehandOptions, - base_launch_options: dict[str, Any], max_open_pages_per_browser: int = 20, ) -> None: """Initialize a new instance. Args: playwright: Active Playwright instance used to connect to the browser via CDP. - stagehand_client: Active Stagehand client used to start sessions. 
-            stagehand_options: Stagehand-specific configuration.
-            base_launch_options: Browser launch options (without proxy) built by the plugin.
-            max_open_pages_per_browser: Maximum number of pages open at the same time.
+            stagehand_client: Active Stagehand REST client used to start and end sessions.
+            stagehand_options: Stagehand-specific configuration (model, env, self-heal, etc.).
+            max_open_pages_per_browser: Maximum number of pages that can be open
+                at the same time.
         """
         self._playwright = playwright
         self._stagehand_client = stagehand_client
         self._stagehand_options = stagehand_options
         self._max_open_pages_per_browser = max_open_pages_per_browser
 
         self._session: AsyncSession | None = None
@@ -110,29 +109,35 @@ def is_browser_connected(self) -> bool:
     def browser_type(self) -> BrowserType:
         return 'chromium'
 
-    async def _get_context_creation_lock(self) -> Lock:
-        if self._context_creation_lock is None:
-            self._context_creation_lock = Lock()
-        return self._context_creation_lock
-
     @override
     async def new_page(
         self,
         browser_new_context_options: Mapping[str, Any] | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> StagehandPage:
+        """Create a new page in the Stagehand-managed browser.
+
+        On the first call, starts the Stagehand session with the provided options. On subsequent
+        calls, ``browser_new_context_options`` are silently ignored because the browser
+        context cannot be reconfigured once the session is running.
+
+        Args:
+            browser_new_context_options: Options merged on top of the plugin's launch options
+                when creating the first session. Ignored if the session already exists.
+            proxy_info: Proxy injected into the session on first creation.
+
+        Raises:
+            ValueError: If the browser has reached the maximum number of open pages.
+            RuntimeError: If the Stagehand session fails to initialise.
+ """ if not self.has_free_capacity: raise ValueError('Cannot open more pages in this browser.') - if browser_new_context_options: - logger.warning( - 'browser_new_context_options are ignored by StagehandBrowserController. ' - 'The existing CDP context is reused.' - ) - self._opening_pages_count += 1 try: - await self._ensure_session(proxy_info) + # Lazily start a Stagehand session on the first page request, passing proxy and context options + # at that point. + await self._ensure_session(browser_new_context_options, proxy_info) if self._browser is None or self._session is None or self._browser_context is None: raise RuntimeError('Failed to initialize the browser session.') @@ -168,40 +173,54 @@ async def close(self, *, force: bool = False) -> None: def _on_page_close(self, page: StagehandPage) -> None: self._pages.remove(page) - async def _ensure_session(self, proxy_info: ProxyInfo | None = None) -> None: + async def _ensure_session( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> None: if self._session is not None: return async with self._session_init_lock: if self._session is not None: return - opts = self._stagehand_options - start_kwargs: dict[str, Any] = { - 'model_name': opts.model, - 'verbose': opts.verbose, - 'self_heal': opts.self_heal, + browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} + + # Parameters for sessions.start() — AI model settings from stagehand_options. 
+ session_start_params: dict[str, Any] = { + 'model_name': self._stagehand_options.model, + 'verbose': self._stagehand_options.verbose, + 'self_heal': self._stagehand_options.self_heal, } - if opts.dom_settle_timeout_ms is not None: - start_kwargs['dom_settle_timeout_ms'] = opts.dom_settle_timeout_ms - if opts.system_prompt is not None: - start_kwargs['system_prompt'] = opts.system_prompt + if self._stagehand_options.dom_settle_timeout_ms is not None: + session_start_params['dom_settle_timeout_ms'] = self._stagehand_options.dom_settle_timeout_ms - if opts.env == 'LOCAL': - launch_options: dict[str, Any] = dict(self._base_launch_options) + if self._stagehand_options.system_prompt is not None: + session_start_params['system_prompt'] = self._stagehand_options.system_prompt + + if self._stagehand_options.env == 'LOCAL': if proxy_info: - launch_options['proxy'] = { + browser_new_context_options['proxy'] = { 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', 'username': proxy_info.username or '', 'password': proxy_info.password or '', } - start_kwargs['browser'] = {'type': 'local', 'launch_options': launch_options} - elif proxy_info: - logger.warning( - 'Proxy support in BROWSERBASE mode requires configuring proxies via ' - 'browserbase_session_create_params. proxy_info will be ignored.' 
- ) + session_start_params['browser'] = {'type': 'local', 'launch_options': browser_new_context_options} + else: + session_start_params['browser'] = {'type': 'browserbase'} + if proxy_info: + session_start_params['browserbase_session_create_params'] = { + 'proxies': [ + { + 'type': 'external', + 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + ] + } - session: AsyncSession = await self._stagehand_client.sessions.start(**start_kwargs) + session: AsyncSession = await self._stagehand_client.sessions.start(**session_start_params) cdp_url = session.data.cdp_url if not cdp_url: diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py index 8d2faf77e9..25e39176b8 100644 --- a/src/crawlee/browsers/_stagehand_browser_plugin.py +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -31,11 +31,16 @@ class StagehandBrowserPlugin(BrowserPlugin): """A plugin for managing Stagehand AI-powered browser automation. - Stagehand creates and manages the browser instance (local binary or Browserbase cloud). - Playwright then connects to it via CDP, enabling both standard Playwright automation - and AI-powered operations in the same crawling context. - - Only Chromium is supported because Stagehand relies on the Chrome DevTools Protocol. + It acts as a factory for creating `StagehandBrowserController` instances and manages the + lifecycle of the shared `AsyncStagehand` REST client and the Playwright context. Depending + on the configured environment, the browser runs locally via a bundled Chromium binary + (``env='LOCAL'``) or in the Browserbase cloud (``env='BROWSERBASE'``). Playwright connects + to the running browser via CDP, so only Chromium is supported. + + Not all Playwright browser and context options are supported — only those accepted by + Stagehand's ``BrowserLaunchOptions``. 
Because Stagehand creates the browser and its context + together in a single ``sessions.start()`` call, both ``browser_launch_options`` and + ``browser_new_context_options`` are merged into one set of options applied at session start. """ AUTOMATION_LIBRARY = 'stagehand' @@ -52,46 +57,50 @@ def __init__( """Initialize a new instance. Args: - user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local - storage. + user_data_dir: Path to a user data directory, which stores browser session data like + cookies and local storage. stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). - browser_launch_options: Keyword arguments for browser launch. Supported options are - a subset of Playwright's ``browser_type.launch`` options that map to Stagehand's - ``BrowserLaunchOptions``. Unsupported keys are logged as warnings and ignored. - browser_new_context_options: Keyword arguments for browser context creation. - Options that map to Stagehand's ``BrowserLaunchOptions`` are merged with - ``browser_launch_options``. Unsupported keys are logged as warnings and ignored. - max_open_pages_per_browser: Maximum number of pages that can be open per browser. + browser_launch_options: Keyword arguments passed to Stagehand's ``BrowserLaunchOptions`` + on session start. Supported keys are a subset of Playwright's ``browser_type.launch`` + options. These take priority over ``browser_new_context_options`` for shared keys. + browser_new_context_options: Additional options merged with ``browser_launch_options`` + at lower priority. Subject to the same ``BrowserLaunchOptions`` constraints. + max_open_pages_per_browser: The maximum number of pages that can be open in a single + browser instance. Once reached, a new browser instance will be launched. 
""" config = service_locator.get_configuration() self._max_open_pages_per_browser = max_open_pages_per_browser - self.stagehand_options = stagehand_options or StagehandOptions() - self._browser_new_context_options = browser_new_context_options or {} is_local = self.stagehand_options.env == 'LOCAL' - self._base_launch_options: dict[str, Any] = { + # browser_launch_options take priority over browser_new_context_options for shared keys. + self._browser_launch_options: dict[str, Any] = { 'headless': config.headless, 'chromium_sandbox': not config.disable_browser_sandbox, + **(browser_new_context_options or {}), + **(browser_launch_options or {}), } + if config.default_browser_path: - self._base_launch_options['executable_path'] = config.default_browser_path + self._browser_launch_options.setdefault('executable_path', config.default_browser_path) - self._base_launch_options = {**self._base_launch_options, **(browser_launch_options or {})} + if user_data_dir is not None: + self._browser_launch_options['user_data_dir'] = str(user_data_dir) - self._stagehand_init_kwargs: dict[str, Any] = { + # Parameters for AsyncStagehand. 
+ self._stagehand_init_params: dict[str, Any] = { 'server': 'local' if is_local else 'remote', - 'local_headless': self._base_launch_options.get('headless', config.headless), + 'local_headless': self._browser_launch_options['headless'], 'local_ready_timeout_s': self.stagehand_options.local_ready_timeout_s, - 'user_data_dir': str(user_data_dir) if user_data_dir else None, } - if is_local: - self._stagehand_init_kwargs['model_api_key'] = self.stagehand_options.api_key - else: - self._stagehand_init_kwargs['browserbase_api_key'] = self.stagehand_options.api_key - self._stagehand_init_kwargs['browserbase_project_id'] = self.stagehand_options.project_id + + self._stagehand_init_params['model_api_key'] = self.stagehand_options.api_key + + if not is_local: + self._stagehand_init_params['browserbase_api_key'] = self.stagehand_options.api_key + self._stagehand_init_params['browserbase_project_id'] = self.stagehand_options.project_id self._stagehand_client: AsyncStagehand | None = None self._playwright_context_manager = async_playwright() @@ -111,12 +120,25 @@ def browser_type(self) -> BrowserType: @property @override def browser_launch_options(self) -> Mapping[str, Any]: - return self._base_launch_options + """Return the options passed to Stagehand's ``BrowserLaunchOptions`` on session start. + + These are a subset of Playwright's ``browser_type.launch`` options — only keys recognised + by Stagehand's ``BrowserLaunchOptions`` take effect. + """ + return self._browser_launch_options @property @override def browser_new_context_options(self) -> Mapping[str, Any]: - return {} + """Return the browser context options passed to Stagehand's ``BrowserLaunchOptions``. + + Stagehand creates the browser and its context together in a single ``sessions.start()`` + call, so context-level options such as ``viewport`` and ``locale`` are part of + ``BrowserLaunchOptions`` and share the same dictionary as ``browser_launch_options``. 
+ Pre-navigation hooks that modify these options before the first page will take effect, + because session creation is deferred until the first ``new_page`` call. + """ + return self._browser_launch_options @property @override @@ -131,16 +153,15 @@ async def __aenter__(self) -> StagehandBrowserPlugin: self._active = True self._playwright = await self._playwright_context_manager.__aenter__() - # Resolve Chromium path for LOCAL mode. if self.stagehand_options.env == 'LOCAL': - if 'executable_path' not in self._base_launch_options: + if 'executable_path' not in self._browser_launch_options: chrome_path = self._playwright.chromium.executable_path - self._base_launch_options['executable_path'] = chrome_path + self._browser_launch_options['executable_path'] = chrome_path logger.debug(f'Resolved Chromium path from Playwright: {chrome_path}') - self._stagehand_init_kwargs['local_chrome_path'] = self._base_launch_options['executable_path'] + self._stagehand_init_params['local_chrome_path'] = self._browser_launch_options['executable_path'] - client = AsyncStagehand(**self._stagehand_init_kwargs) + client = AsyncStagehand(**self._stagehand_init_params) await client.__aenter__() self._stagehand_client = client @@ -175,6 +196,5 @@ async def new_browser(self) -> BrowserController: playwright=self._playwright, stagehand_client=self._stagehand_client, stagehand_options=self.stagehand_options, - base_launch_options=self._base_launch_options, max_open_pages_per_browser=self._max_open_pages_per_browser, ) diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py index 684851349b..d64e39549b 100644 --- a/src/crawlee/crawlers/_stagehand/__init__.py +++ b/src/crawlee/crawlers/_stagehand/__init__.py @@ -1,9 +1,21 @@ -from crawlee.crawlers._stagehand._stagehand_crawler import StagehandCrawler -from crawlee.crawlers._stagehand._stagehand_crawling_context import ( - StagehandCrawlingContext, - StagehandPostNavCrawlingContext, - 
StagehandPreNavCrawlingContext, -) +from crawlee._utils.try_import import install_import_hook as _install_import_hook +from crawlee._utils.try_import import try_import as _try_import + +_install_import_hook(__name__) + +# The following imports are wrapped in try_import to handle optional dependencies, +# ensuring the module can still function even if these dependencies are missing. +with _try_import(__name__, 'StagehandCrawler'): + from ._stagehand_crawler import StagehandCrawler +with _try_import( + __name__, 'StagehandCrawlingContext', 'StagehandPostNavCrawlingContext', 'StagehandPreNavCrawlingContext' +): + from ._stagehand_crawling_context import ( + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, + ) + __all__ = [ 'StagehandCrawler', diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index a44e70195c..534bd31156 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -14,10 +14,14 @@ ) if TYPE_CHECKING: + from datetime import timedelta + from pathlib import Path + from typing_extensions import Unpack - from crawlee.browsers._types import StagehandOptions + from crawlee.browsers import StagehandOptions from crawlee.crawlers._basic import BasicCrawlerOptions + from crawlee.crawlers._playwright._types import GotoOptions from crawlee.statistics import StatisticsState @@ -31,30 +35,55 @@ class StagehandCrawler( ): """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee. - Extends `PlaywrightCrawler` with a `StagehandBrowserPlugin` that manages a Stagehand - session per browser instance. Each page in the crawling context is a `StagehandPage`, - which exposes AI methods alongside all standard Playwright `Page` methods: + `StagehandCrawler` builds on top of `PlaywrightCrawler`, inheriting all of its features. 
+ It uses `StagehandBrowserPlugin` to manage Stagehand sessions. Stagehand creates and manages + the browser instance — either locally via a bundled Chromium binary, or remotely via Browserbase + cloud — and Playwright connects to it via the Chrome DevTools Protocol (CDP). + + Because Stagehand relies on CDP, only Chromium is supported. Not all Playwright browser and + context configuration options are available — browser settings are limited to the subset accepted + by Stagehand's ``BrowserLaunchOptions`` (such as ``headless``, ``args``, ``viewport``, ``proxy``, + ``locale``, and ``executable_path``). Features like fingerprint generation and incognito pages + are not supported. - - `page.act(**kwargs)` — perform actions using natural language - - `page.extract(**kwargs)` — extract structured data with AI - - `page.observe(**kwargs)` — get AI-suggested actions on the page + Each page in the crawling context is a `StagehandPage`, which extends the standard Playwright + `Page` with the following AI methods: + + - `page.act(**kwargs)` — perform an action on the page using natural language + - `page.extract(**kwargs)` — extract structured data from the page with AI + - `page.observe(**kwargs)` — get AI-suggested actions available on the page - `page.execute(**kwargs)` — run an autonomous multi-step agent + Stagehand configuration (model, API key, environment) is provided via `stagehand_options`. + By default, the crawler runs locally using the `openai/gpt-4.1-mini` model. 
+ ### Usage ```python + import asyncio from crawlee.crawlers import StagehandCrawler from crawlee.crawlers._stagehand import StagehandCrawlingContext + from crawlee.browsers._types import StagehandOptions - crawler = StagehandCrawler() + crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + api_key='sk-...', + model='openai/gpt-4.1-mini', + ), + ) @crawler.router.default_handler async def handler(context: StagehandCrawlingContext) -> None: - await context.page.act(input='Click the login button') - data = await context.page.extract(instruction='Get the page title') + context.log.info(f'Processing {context.request.url} ...') + + # Use standard Playwright methods alongside AI methods. + await context.page.act(instruction='Click the accept cookies button if present') + + data = await context.page.extract(instruction='Get the article title and author') + await context.push_data(data) - await crawler.run(['https://example.com']) + asyncio.run(crawler.run(['https://example.com'])) ``` """ @@ -67,8 +96,13 @@ def __init__( *, stagehand_options: StagehandOptions | None = None, browser_pool: BrowserPool | None = None, + user_data_dir: str | Path | None = None, + headless: bool | None = None, + browser_launch_options: dict[str, Any] | None = None, browser_new_context_options: dict[str, Any] | None = None, - max_open_pages_per_browser: int = 20, + goto_options: GotoOptions | None = None, + navigation_timeout: timedelta | None = None, + max_open_pages_per_browser: int | None = None, **kwargs: Unpack[BasicCrawlerOptions[StagehandCrawlingContext, StatisticsState]], ) -> None: """Initialize a new instance. @@ -77,30 +111,69 @@ def __init__( stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). Cannot be specified if `browser_pool` is provided. browser_pool: A pre-configured `BrowserPool`. All plugins must be instances of - `StagehandBrowserPlugin` (or its subclasses). If omitted, a pool is created - automatically from `stagehand_options`. 
- browser_new_context_options: Options passed to Playwright's `browser.new_context` - after connecting via CDP. Cannot be specified if `browser_pool` is provided. - max_open_pages_per_browser: Maximum pages open per browser instance. + `StagehandBrowserPlugin`. If omitted, a pool is created automatically from the + other browser arguments. + user_data_dir: Path to a user data directory, which stores browser session data like + cookies and local storage. Cannot be specified if `browser_pool` is provided. + headless: Whether to run the browser in headless mode. Defaults to the value from + Crawlee's global `Configuration`. Cannot be specified if `browser_pool` is provided. + browser_launch_options: Keyword arguments for browser launch passed to Stagehand's + `BrowserLaunchOptions` (a subset of Playwright's launch options). Supported keys + include `args`, `executable_path`, `proxy`, `viewport`, `locale`, and others. + Cannot be specified if `browser_pool` is provided. + browser_new_context_options: Keyword arguments for browser context creation, merged + with `browser_launch_options`. Options that map to `BrowserLaunchOptions` take + effect on the first page; subsequent pages reuse the existing session context. + Cannot be specified if `browser_pool` is provided. + goto_options: Additional options passed to Stagehand's `Page.goto()`. The `timeout` + option is not supported — use `navigation_timeout` instead. + navigation_timeout: Timeout for the navigation phase (from opening the page to calling + the request handler). Defaults to one minute. + max_open_pages_per_browser: Maximum number of pages open per browser instance. Cannot be specified if `browser_pool` is provided. kwargs: Additional keyword arguments forwarded to `BasicCrawler`. 
""" if browser_pool is not None: self._validate_browser_pool(browser_pool) - if stagehand_options is not None: - raise ValueError('Cannot specify both browser_pool and stagehand_options') + if any( + param is not None + for param in ( + stagehand_options, + user_data_dir, + headless, + browser_launch_options, + browser_new_context_options, + max_open_pages_per_browser, + ) + ): + raise ValueError( + 'Cannot specify `stagehand_options`, `user_data_dir`, `headless`, ' + '`browser_launch_options`, `browser_new_context_options` or ' + '`max_open_pages_per_browser` when `browser_pool` is provided.' + ) else: + launch_options = dict(browser_launch_options or {}) + if headless is not None: + launch_options['headless'] = headless + browser_pool = BrowserPool( plugins=[ StagehandBrowserPlugin( stagehand_options=stagehand_options, + user_data_dir=user_data_dir, + browser_launch_options=launch_options or None, browser_new_context_options=browser_new_context_options, - max_open_pages_per_browser=max_open_pages_per_browser, + max_open_pages_per_browser=max_open_pages_per_browser or 20, ) ] ) - super().__init__(browser_pool=browser_pool, **kwargs) + super().__init__( + browser_pool=browser_pool, + goto_options=goto_options, + navigation_timeout=navigation_timeout, + **kwargs, + ) @staticmethod def _validate_browser_pool(pool: BrowserPool) -> None: From 0424afcca880a41a438067385d34801ea7e197dd Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 29 Apr 2026 23:40:39 +0000 Subject: [PATCH 05/10] fix docs --- pyproject.toml | 1 + src/crawlee/browsers/__init__.py | 4 +- .../browsers/_stagehand_browser_controller.py | 5 +- .../browsers/_stagehand_browser_plugin.py | 8 +- src/crawlee/browsers/_stagehand_types.py | 128 ++++++++++++++++++ src/crawlee/browsers/_types.py | 89 +----------- .../_playwright/_playwright_crawler.py | 5 + .../crawlers/_stagehand/_stagehand_crawler.py | 2 +- uv.lock | 10 +- 9 files changed, 155 insertions(+), 97 deletions(-) create mode 100644 
src/crawlee/browsers/_stagehand_types.py diff --git a/pyproject.toml b/pyproject.toml index 5f5dbd0f54..ed0de57607 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ sql_postgres = [ ] stagehand = [ "stagehand>=3.19.0", + "playwright>=1.27.0", ] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index 6bfcf3ea53..cef3446690 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -1,7 +1,7 @@ from crawlee._utils.try_import import install_import_hook as _install_import_hook from crawlee._utils.try_import import try_import as _try_import -from ._types import BrowserType, CrawleePage, StagehandOptions, StagehandPage +from ._types import BrowserType, CrawleePage _install_import_hook(__name__) @@ -21,6 +21,8 @@ from ._stagehand_browser_controller import StagehandBrowserController with _try_import(__name__, 'StagehandBrowserPlugin'): from ._stagehand_browser_plugin import StagehandBrowserPlugin +with _try_import(__name__, 'StagehandOptions', 'StagehandPage'): + from ._stagehand_types import StagehandOptions, StagehandPage __all__ = [ diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index 01c1c07ea6..5906ea8b38 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group from crawlee.browsers._browser_controller import BrowserController -from crawlee.browsers._types import StagehandPage +from crawlee.browsers._stagehand_types import StagehandPage if TYPE_CHECKING: from collections.abc import Mapping @@ -17,7 +17,8 @@ from playwright.async_api import Browser, BrowserContext, Playwright from stagehand import AsyncSession, AsyncStagehand - from crawlee.browsers._types import BrowserType, StagehandOptions + from crawlee.browsers._stagehand_types import 
StagehandOptions + from crawlee.browsers._types import BrowserType from crawlee.proxy_configuration import ProxyInfo diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py index 25e39176b8..cb8d81d855 100644 --- a/src/crawlee/browsers/_stagehand_browser_plugin.py +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -13,7 +13,7 @@ from ._browser_plugin import BrowserPlugin from ._stagehand_browser_controller import StagehandBrowserController -from ._types import StagehandOptions +from ._stagehand_types import StagehandOptions if TYPE_CHECKING: from collections.abc import Mapping @@ -95,11 +95,11 @@ def __init__( 'local_headless': self._browser_launch_options['headless'], 'local_ready_timeout_s': self.stagehand_options.local_ready_timeout_s, } - - self._stagehand_init_params['model_api_key'] = self.stagehand_options.api_key + if self.stagehand_options.model_api_key is not None: + self._stagehand_init_params['model_api_key'] = self.stagehand_options.model_api_key if not is_local: - self._stagehand_init_params['browserbase_api_key'] = self.stagehand_options.api_key + self._stagehand_init_params['browserbase_api_key'] = self.stagehand_options.browserbase_api_key self._stagehand_init_params['browserbase_project_id'] = self.stagehand_options.project_id self._stagehand_client: AsyncStagehand | None = None diff --git a/src/crawlee/browsers/_stagehand_types.py b/src/crawlee/browsers/_stagehand_types.py new file mode 100644 index 0000000000..fbc6cb5972 --- /dev/null +++ b/src/crawlee/browsers/_stagehand_types.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Literal + +from playwright.async_api import Page + +if TYPE_CHECKING: + from stagehand import AsyncSession + from stagehand.types.session_act_params import SessionActParams + from stagehand.types.session_act_response import SessionActResponse + from 
stagehand.types.session_execute_params import SessionExecuteParams + from stagehand.types.session_execute_response import SessionExecuteResponse + from stagehand.types.session_extract_params import SessionExtractParams + from stagehand.types.session_extract_response import SessionExtractResponse + from stagehand.types.session_observe_params import SessionObserveParams + from stagehand.types.session_observe_response import SessionObserveResponse + from typing_extensions import Unpack + + +@dataclass +class StagehandOptions: + """Configuration options for Stagehand AI-powered browser automation. + + Controls the execution environment (local or Browserbase cloud), the AI model used + for page operations, and session-level behaviour such as self-healing and DOM settling. + """ + + env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL' + """Execution environment. + 'LOCAL' - Stagehand runs a local Chromium browser. + 'BROWSERBASE' - Stagehand uses a Browserbase cloud browser session. + """ + + browserbase_api_key: str | None = None + """API key for authenticating with Browserbase when `env='BROWSERBASE'`. If not provided, read from + the `BROWSERBASE_API_KEY` environment variable.""" + + model_api_key: str | None = None + """API key for the AI model provider (e.g. OpenAI, Anthropic). Must be provided explicitly - unlike + Browserbase credentials""" + + project_id: str | None = None + """Browserbase project ID, required when `env='BROWSERBASE'`. If not provided, read from + the `BROWSERBASE_PROJECT_ID` environment variable.""" + + model: str = 'openai/gpt-4.1-mini' + """The AI model to use for page operations.""" + + verbose: Literal[0, 1, 2] = 0 + """Verbosity level for logging Stagehand session activity. 
+ 0 - quiet + 1 - normal + 2 - debug""" + + self_heal: bool = True + """When `True`, Stagehand automatically retries failed actions.""" + + dom_settle_timeout_ms: float | None = None + """Maximum time to wait for the DOM to settle before performing an action, in milliseconds.""" + + system_prompt: str | None = None + """Optional system prompt to guide the AI's behavior in sessions.""" + + local_ready_timeout_s: float = 10.0 + """Maximum time to wait for the local Stagehand browser to be ready, in seconds.""" + + +class StagehandPage(Page): + """A Playwright `Page` enhanced with Stagehand AI methods. + + Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright + methods transparently while adding `act()`, `extract()`, and `observe()` AI operations + bound to the current page. + """ + + def __init__(self, page: Page, session: AsyncSession) -> None: + super().__init__(page._impl_obj) # noqa: SLF001 + self._session = session + + async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: + """Perform an action on the page using natural language. + + Args: + **kwargs: Parameters passed to ``AsyncSession.act()``. + The most common is ``instruction`` — a natural language description + of the action to perform, e.g. ``instruction='click the login button'``. + + Returns: + The action result from Stagehand. + """ + return await self._session.act(page=self, **kwargs) + + async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: + """Observe the page and get AI-suggested actions. + + Args: + **kwargs: Parameters passed to ``AsyncSession.observe()``. + Optionally pass ``instruction`` to narrow the observation scope. + + Returns: + Observation result with suggested actions. + """ + return await self._session.observe(page=self, **kwargs) + + async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: + """Extract structured data from the page using natural language. 
+ + Args: + **kwargs: Parameters passed to ``AsyncSession.extract()``. + Common parameters: ``instruction`` and ``schema`` (JSON Schema dict). + + Returns: + Extracted data matching the requested schema. + """ + return await self._session.extract(page=self, **kwargs) + + async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: + """Execute arbitrary code on the page via natural language instructions. + + Args: + **kwargs: Parameters passed to ``AsyncSession.execute()``. + Common parameters: ``instruction`` describing the code to execute. + + Returns: + The result of the executed code. + """ + return await self._session.execute(page=self, **kwargs) diff --git a/src/crawlee/browsers/_types.py b/src/crawlee/browsers/_types.py index 7f0268e0f8..c5976b086a 100644 --- a/src/crawlee/browsers/_types.py +++ b/src/crawlee/browsers/_types.py @@ -3,20 +3,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Literal -from playwright.async_api import Page - if TYPE_CHECKING: - from stagehand import AsyncSession - from stagehand.types.session_act_params import SessionActParams - from stagehand.types.session_act_response import SessionActResponse - from stagehand.types.session_execute_params import SessionExecuteParams - from stagehand.types.session_execute_response import SessionExecuteResponse - from stagehand.types.session_extract_params import SessionExtractParams - from stagehand.types.session_extract_response import SessionExtractResponse - from stagehand.types.session_observe_params import SessionObserveParams - from stagehand.types.session_observe_response import SessionObserveResponse - from typing_extensions import Unpack - + from playwright.async_api import Page BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome'] @@ -28,78 +16,3 @@ class CrawleePage: id: str browser_type: BrowserType page: Page - - -@dataclass -class StagehandOptions: - env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL' - api_key: str | None 
= None - project_id: str | None = None - model: str = 'openai/gpt-4.1-mini' - verbose: Literal[0, 1, 2] = 0 - self_heal: bool = True - dom_settle_timeout_ms: float | None = None - system_prompt: str | None = None - local_ready_timeout_s: float = 15.0 - - -class StagehandPage(Page): - """A Playwright `Page` enhanced with Stagehand AI methods. - - Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright - methods transparently while adding `act()`, `extract()`, and `observe()` AI operations - bound to the current page. - """ - - def __init__(self, page: Page, session: AsyncSession) -> None: - super().__init__(page._impl_obj) # noqa: SLF001 - self._session = session - - async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: - """Perform an action on the page using natural language. - - Args: - **kwargs: Parameters passed to ``AsyncSession.act()``. - The most common is ``instruction`` — a natural language description - of the action to perform, e.g. ``instruction='click the login button'``. - - Returns: - The action result from Stagehand. - """ - return await self._session.act(page=self, **kwargs) - - async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: - """Observe the page and get AI-suggested actions. - - Args: - **kwargs: Parameters passed to ``AsyncSession.observe()``. - Optionally pass ``instruction`` to narrow the observation scope. - - Returns: - Observation result with suggested actions. - """ - return await self._session.observe(page=self, **kwargs) - - async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: - """Extract structured data from the page using natural language. - - Args: - **kwargs: Parameters passed to ``AsyncSession.extract()``. - Common parameters: ``instruction`` and ``schema`` (JSON Schema dict). - - Returns: - Extracted data matching the requested schema. 
- """ - return await self._session.extract(page=self, **kwargs) - - async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: - """Execute arbitrary code on the page via natural language instructions. - - Args: - **kwargs: Parameters passed to ``AsyncSession.execute()``. - Common parameters: ``instruction`` describing the code to execute. - - Returns: - The result of the executed code. - """ - return await self._session.execute(page=self, **kwargs) diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 0d8e052b5d..1f06564649 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -275,7 +275,10 @@ def _build_context( response: Response | None = None, ) -> PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext | PlaywrightCrawlingContext: """Build the crawling context by adding Playwright-specific properties.""" + # Order is important, as all context subclasses each other, + # so the most specific context should be checked first. 
match context: + # Create the final crawling context from `PostNavCrawlingContext` case self._POST_NAV_CONTEXT_CLASS(): extract_links = self._create_extract_links_function(context) return self._CRAWLING_CONTEXT_CLASS( @@ -297,6 +300,7 @@ def _build_context( enqueue_links=self._create_enqueue_links_function(context, extract_links), block_requests=context.block_requests, ) + # Create the post-navigation context from `PreNavCrawlingContext` case self._PRE_NAV_CONTEXT_CLASS(): if response is None: raise ValueError('Response must be provided for post-navigation context.') @@ -317,6 +321,7 @@ def _build_context( goto_options=context.goto_options, response=response, ) + # Create the pre-navigation context from `BasicCrawlingContext` case BasicCrawlingContext(): if page is None: raise ValueError('Page must be provided for pre-navigation context.') diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index 534bd31156..d2483d8320 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -63,7 +63,7 @@ class StagehandCrawler( import asyncio from crawlee.crawlers import StagehandCrawler from crawlee.crawlers._stagehand import StagehandCrawlingContext - from crawlee.browsers._types import StagehandOptions + from crawlee.browsers import StagehandOptions crawler = StagehandCrawler( stagehand_options=StagehandOptions( diff --git a/uv.lock b/uv.lock index a4103d084d..f07e49d5c0 100644 --- a/uv.lock +++ b/uv.lock @@ -12,6 +12,12 @@ resolution-markers = [ exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. 
exclude-newer-span = "PT24H" +[options.exclude-newer-package] +apify-fingerprint-datapoints = false +crawlee = false +apify-shared = false +apify-client = false + [[package]] name = "aiomysql" version = "0.3.2" @@ -875,6 +881,7 @@ sql-sqlite = [ { name = "sqlalchemy", extra = ["asyncio"] }, ] stagehand = [ + { name = "playwright" }, { name = "stagehand" }, ] @@ -923,7 +930,7 @@ requires-dist = [ { name = "cachetools", specifier = ">=5.5.0" }, { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, - { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "redis", "stagehand"], marker = "extra == 'all'" }, + { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "stagehand", "redis"], marker = "extra == 'all'" }, { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, @@ -941,6 +948,7 @@ requires-dist = [ { name = "parsel", marker = "extra == 'parsel'", specifier = ">=1.10.0" }, { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" }, + { name = "playwright", marker = "extra == 'stagehand'", specifier = ">=1.27.0" }, { name = "protego", specifier = ">=0.5.0" }, { name = "psutil", specifier = ">=6.0.0" }, { name = "pydantic", specifier = ">=2.11.0" }, From bd459158f4a3e87c55f04d31f40e400741a2e26d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 3 May 2026 17:11:48 +0000 Subject: [PATCH 06/10] add tests --- .../browsers/_stagehand_browser_controller.py | 7 +- 
.../browsers/_stagehand_browser_plugin.py | 23 +- src/crawlee/browsers/_stagehand_types.py | 8 +- .../crawlers/_stagehand/_stagehand_crawler.py | 5 +- .../test_stagehand_browser_controller.py | 339 ++++++++++++++++++ .../browsers/test_stagehand_browser_plugin.py | 145 ++++++++ .../_stagehand/test_stagehand_crawler.py | 157 ++++++++ 7 files changed, 667 insertions(+), 17 deletions(-) create mode 100644 tests/unit/browsers/test_stagehand_browser_controller.py create mode 100644 tests/unit/browsers/test_stagehand_browser_plugin.py create mode 100644 tests/unit/crawlers/_stagehand/test_stagehand_crawler.py diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index 5906ea8b38..7052dd9585 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -179,9 +179,12 @@ async def _ensure_session( browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None, ) -> None: + if self._session is not None: return + async with self._session_init_lock: + # Double-check if the session was created while waiting for the lock. 
if self._session is not None: return @@ -231,7 +234,5 @@ async def _ensure_session( ) self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url) - self._browser_context = ( - self._browser.contexts[0] if self._browser.contexts else await self._browser.new_context() - ) + self._browser_context = self._browser.contexts[0] self._session = session diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py index cb8d81d855..3bfb72b09f 100644 --- a/src/crawlee/browsers/_stagehand_browser_plugin.py +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -71,9 +71,9 @@ def __init__( config = service_locator.get_configuration() self._max_open_pages_per_browser = max_open_pages_per_browser - self.stagehand_options = stagehand_options or StagehandOptions() + self._stagehand_options = stagehand_options or StagehandOptions() - is_local = self.stagehand_options.env == 'LOCAL' + is_local = self._stagehand_options.env == 'LOCAL' # browser_launch_options take priority over browser_new_context_options for shared keys. 
self._browser_launch_options: dict[str, Any] = { @@ -93,14 +93,14 @@ def __init__( self._stagehand_init_params: dict[str, Any] = { 'server': 'local' if is_local else 'remote', 'local_headless': self._browser_launch_options['headless'], - 'local_ready_timeout_s': self.stagehand_options.local_ready_timeout_s, + 'local_ready_timeout_s': self._stagehand_options.local_ready_timeout_s, } - if self.stagehand_options.model_api_key is not None: - self._stagehand_init_params['model_api_key'] = self.stagehand_options.model_api_key + if self._stagehand_options.model_api_key is not None: + self._stagehand_init_params['model_api_key'] = self._stagehand_options.model_api_key if not is_local: - self._stagehand_init_params['browserbase_api_key'] = self.stagehand_options.browserbase_api_key - self._stagehand_init_params['browserbase_project_id'] = self.stagehand_options.project_id + self._stagehand_init_params['browserbase_api_key'] = self._stagehand_options.browserbase_api_key + self._stagehand_init_params['browserbase_project_id'] = self._stagehand_options.project_id self._stagehand_client: AsyncStagehand | None = None self._playwright_context_manager = async_playwright() @@ -145,6 +145,11 @@ def browser_new_context_options(self) -> Mapping[str, Any]: def max_open_pages_per_browser(self) -> int: return self._max_open_pages_per_browser + @property + def stagehand_options(self) -> StagehandOptions: + """Return the Stagehand-specific configuration options.""" + return self._stagehand_options + @override async def __aenter__(self) -> StagehandBrowserPlugin: if self._active: @@ -153,7 +158,7 @@ async def __aenter__(self) -> StagehandBrowserPlugin: self._active = True self._playwright = await self._playwright_context_manager.__aenter__() - if self.stagehand_options.env == 'LOCAL': + if self._stagehand_options.env == 'LOCAL': if 'executable_path' not in self._browser_launch_options: chrome_path = self._playwright.chromium.executable_path self._browser_launch_options['executable_path'] = 
chrome_path @@ -195,6 +200,6 @@ async def new_browser(self) -> BrowserController: return StagehandBrowserController( playwright=self._playwright, stagehand_client=self._stagehand_client, - stagehand_options=self.stagehand_options, + stagehand_options=self._stagehand_options, max_open_pages_per_browser=self._max_open_pages_per_browser, ) diff --git a/src/crawlee/browsers/_stagehand_types.py b/src/crawlee/browsers/_stagehand_types.py index fbc6cb5972..0b30f2ad73 100644 --- a/src/crawlee/browsers/_stagehand_types.py +++ b/src/crawlee/browsers/_stagehand_types.py @@ -79,7 +79,7 @@ def __init__(self, page: Page, session: AsyncSession) -> None: self._session = session async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: - """Perform an action on the page using natural language. + """Perform an action on the page using natural language. Argument `page` is automatically set. Args: **kwargs: Parameters passed to ``AsyncSession.act()``. @@ -92,7 +92,7 @@ async def act(self, **kwargs: Unpack[SessionActParams]) -> SessionActResponse: return await self._session.act(page=self, **kwargs) async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserveResponse: - """Observe the page and get AI-suggested actions. + """Observe the page and get AI-suggested actions. Argument `page` is automatically set. Args: **kwargs: Parameters passed to ``AsyncSession.observe()``. @@ -104,7 +104,7 @@ async def observe(self, **kwargs: Unpack[SessionObserveParams]) -> SessionObserv return await self._session.observe(page=self, **kwargs) async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtractResponse: - """Extract structured data from the page using natural language. + """Extract structured data from the page using natural language. Argument `page` is automatically set. Args: **kwargs: Parameters passed to ``AsyncSession.extract()``. 
@@ -116,7 +116,7 @@ async def extract(self, **kwargs: Unpack[SessionExtractParams]) -> SessionExtrac return await self._session.extract(page=self, **kwargs) async def execute(self, **kwargs: Unpack[SessionExecuteParams]) -> SessionExecuteResponse: - """Execute arbitrary code on the page via natural language instructions. + """Execute arbitrary code on the page via natural language instructions. Argument `page` is automatically set. Args: **kwargs: Parameters passed to ``AsyncSession.execute()``. diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index d2483d8320..62edb0231f 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging from typing import TYPE_CHECKING, Any from crawlee._utils.docs import docs_group @@ -67,7 +68,7 @@ class StagehandCrawler( crawler = StagehandCrawler( stagehand_options=StagehandOptions( - api_key='sk-...', + model_api_key='sk-...', model='openai/gpt-4.1-mini', ), ) @@ -168,6 +169,8 @@ def __init__( ] ) + kwargs.setdefault('_logger', logging.getLogger(__name__)) + super().__init__( browser_pool=browser_pool, goto_options=goto_options, diff --git a/tests/unit/browsers/test_stagehand_browser_controller.py b/tests/unit/browsers/test_stagehand_browser_controller.py new file mode 100644 index 0000000000..dfd08ab614 --- /dev/null +++ b/tests/unit/browsers/test_stagehand_browser_controller.py @@ -0,0 +1,339 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from playwright.async_api import Browser, Playwright, async_playwright +from stagehand import AsyncSession, AsyncStagehand + +from crawlee.browsers import StagehandBrowserController, StagehandOptions, StagehandPage +from 
crawlee.proxy_configuration import ProxyInfo + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +async def playwright() -> AsyncGenerator[Playwright, None]: + async with async_playwright() as playwright: + yield playwright + + +@pytest.fixture +async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]: + browser = await playwright.chromium.launch() + # Stagehand's CDP setup always provides a pre-existing context; simulate it. + await browser.new_context() + yield browser + await browser.close() + + +@pytest.fixture +def session_mock() -> MagicMock: + session = MagicMock(spec=AsyncSession) + session.data = MagicMock() + session.data.cdp_url = 'ws://fake-cdp' + session.end = AsyncMock() + return session + + +@pytest.fixture +def stagehand_client_mock(session_mock: MagicMock) -> MagicMock: + client = MagicMock(spec=AsyncStagehand) + client.sessions.start = AsyncMock(return_value=session_mock) + return client + + +@pytest.fixture +async def controller( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> AsyncGenerator[StagehandBrowserController, None]: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(), + max_open_pages_per_browser=2, + ) + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + yield controller + await controller.close(force=True) + + +async def test_initial_state(playwright: Playwright, session_mock: MagicMock) -> None: + stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(), + ) + + assert controller.pages == [] + assert controller.pages_count == 0 + assert isinstance(controller.last_page_opened_at, datetime) + assert 
controller.idle_time < timedelta(seconds=1) + assert controller.has_free_capacity + assert controller.is_browser_connected # True before any session is started + assert controller.browser_type == 'chromium' + + +async def test_open_and_close_page(controller: StagehandBrowserController) -> None: + page = await controller.new_page() + + assert isinstance(page, StagehandPage) + assert page in controller.pages + assert controller.pages_count == 1 + assert controller.last_page_opened_at <= datetime.now(timezone.utc) + + await page.close() + + assert page not in controller.pages + assert controller.pages_count == 0 + + +async def test_max_open_pages_limit(controller: StagehandBrowserController) -> None: + page1 = await controller.new_page() + assert controller.pages_count == 1 + + page2 = await controller.new_page() + assert controller.pages_count == 2 + + with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'): + await controller.new_page() + + assert controller.pages_count == 2 + + await page1.close() + assert controller.pages_count == 1 + + page3 = await controller.new_page() + assert controller.pages_count == 2 + + await page2.close() + await page3.close() + + assert controller.pages == [] + assert controller.pages_count == 0 + + +async def test_idle_time(controller: StagehandBrowserController) -> None: + idle_time_before = controller.idle_time + await asyncio.sleep(1) # Simulate waiting + idle_time_after = controller.idle_time + assert idle_time_after > idle_time_before + + +async def test_close_browser_with_open_pages( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, + session_mock: MagicMock, +) -> None: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(), + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + _ = await controller.new_page() + + with 
pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'): + await controller.close() + + assert controller.pages_count == 1 + assert controller.is_browser_connected + + await controller.close(force=True) + + assert controller.pages_count == 0 + + assert session_mock.end.await_count == 1 + assert not controller.is_browser_connected + + +async def test_second_page_reuses_session( + controller: StagehandBrowserController, + stagehand_client_mock: MagicMock, +) -> None: + page1 = await controller.new_page() + page2 = await controller.new_page() + + assert stagehand_client_mock.sessions.start.await_count == 1 + assert page1.context == page2.context + assert page1 is not page2 # Different Page instances + + await page1.close() + await page2.close() + + +async def test_concurrent_session_init( + playwright: Playwright, + browser: Browser, + session_mock: MagicMock, +) -> None: + call_count = 0 + + async def delayed_start(**_kwargs: Any) -> Any: + nonlocal call_count + call_count += 1 + await asyncio.sleep(5) # Simulate delay in session start + return session_mock + + stagehand_client = MagicMock() + stagehand_client.sessions.start = delayed_start + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(), + max_open_pages_per_browser=5, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + pages = await asyncio.gather(controller.new_page(), controller.new_page()) + + # Only one session should be started despite concurrent calls to `new_page()`. 
+ assert call_count == 1 + + for page in pages: + await page.close() + await controller.close(force=True) + + +async def test_session_start_params_local( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> None: + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions( + env='LOCAL', + model='openai/gpt-4o', + verbose=1, + self_heal=False, + dom_settle_timeout_ms=500.0, + system_prompt='Test prompt', + ), + max_open_pages_per_browser=2, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + page = await controller.new_page() + await page.close() + await controller.close() + + call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs + assert call_kwargs['model_name'] == 'openai/gpt-4o' + assert call_kwargs['verbose'] == 1 + assert call_kwargs['self_heal'] is False + assert call_kwargs['dom_settle_timeout_ms'] == 500.0 + assert call_kwargs['system_prompt'] == 'Test prompt' + assert call_kwargs['browser'] == {'type': 'local', 'launch_options': {}} + + +async def test_session_start_params_browserbase( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> None: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(env='BROWSERBASE'), + max_open_pages_per_browser=2, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + page = await controller.new_page() + await page.close() + await controller.close() + + call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs + assert call_kwargs['browser'] == {'type': 'browserbase'} + assert 'browserbase_session_create_params' not in call_kwargs + + +async def test_proxy_set_local( + playwright: Playwright, + browser: Browser, + session_mock: MagicMock, +) -> None: + 
stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(env='LOCAL'), + max_open_pages_per_browser=2, + ) + + proxy = ProxyInfo( + url='http://proxy.example.com:8080', + scheme='http', + hostname='proxy.example.com', + port=8080, + username='user', + password='pass', + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + await controller.new_page(proxy_info=proxy) + await controller.close(force=True) + + call_kwargs = stagehand_client.sessions.start.call_args.kwargs + + browser_options = call_kwargs['browser'] + assert browser_options['type'] == 'local' + + proxy_options = browser_options['launch_options']['proxy'] + assert proxy_options['server'] == 'http://proxy.example.com:8080' + assert proxy_options['username'] == 'user' + assert proxy_options['password'] == 'pass' + + +async def test_proxy_set_browserbase( + playwright: Playwright, + browser: Browser, + session_mock: MagicMock, +) -> None: + stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(env='BROWSERBASE'), + max_open_pages_per_browser=2, + ) + + proxy = ProxyInfo( + url='http://proxy.example.com:8080', + scheme='http', + hostname='proxy.example.com', + port=8080, + username='user', + password='pass', + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + await controller.new_page(proxy_info=proxy) + await controller.close(force=True) + + call_kwargs = stagehand_client.sessions.start.call_args.kwargs + + browserbase_proxy_options = call_kwargs['browserbase_session_create_params'] + assert 
browserbase_proxy_options['proxies'][0]['type'] == 'external' + assert browserbase_proxy_options['proxies'][0]['server'] == 'http://proxy.example.com:8080' + assert browserbase_proxy_options['proxies'][0]['username'] == 'user' + assert browserbase_proxy_options['proxies'][0]['password'] == 'pass' diff --git a/tests/unit/browsers/test_stagehand_browser_plugin.py b/tests/unit/browsers/test_stagehand_browser_plugin.py new file mode 100644 index 0000000000..40cec7b472 --- /dev/null +++ b/tests/unit/browsers/test_stagehand_browser_plugin.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from stagehand import AsyncStagehand + +from crawlee.browsers import StagehandBrowserController, StagehandBrowserPlugin, StagehandOptions + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +_PATCH_MODULE = 'crawlee.browsers._stagehand_browser_plugin' + + +@pytest.fixture +def stagehand_client_mock() -> MagicMock: + client = MagicMock(spec=AsyncStagehand) + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=None) + return client + + +@pytest.fixture +async def plugin() -> AsyncGenerator[StagehandBrowserPlugin, None]: + async with StagehandBrowserPlugin() as plugin: + yield plugin + + +def test_initial_state() -> None: + plugin = StagehandBrowserPlugin(max_open_pages_per_browser=5) + + assert plugin.active is False + assert plugin.browser_type == 'chromium' + assert plugin.max_open_pages_per_browser == 5 + + # headless should be True by default + assert plugin.browser_launch_options['headless'] is True + + +def test_implicit_set_options() -> None: + plugin = StagehandBrowserPlugin( + browser_new_context_options={'viewport': {'width': 1280, 'height': 720}}, + browser_launch_options={'headless': False}, + user_data_dir='./test', + ) + + assert plugin.browser_launch_options['headless'] is False + assert 
plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720} + assert plugin.browser_launch_options['user_data_dir'] == './test' + + +def test_order_priority_of_implicit_options() -> None: + # `browser_launch_options` takes priority over `browser_new_context_options` for shared keys, + # while non-conflicting keys from both dicts are merged. + plugin = StagehandBrowserPlugin( + browser_new_context_options={'headless': True, 'viewport': {'width': 1280, 'height': 720}}, + browser_launch_options={'headless': False}, + ) + + assert plugin.browser_launch_options['headless'] is False + assert plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720} + + +def test_stagehand_options_defaults_when_not_provided() -> None: + plugin = StagehandBrowserPlugin() + + assert isinstance(plugin.stagehand_options, StagehandOptions) + assert plugin.stagehand_options == StagehandOptions() + + +async def test_stagehand_called_with_local_params(stagehand_client_mock: MagicMock) -> None: + with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock: + async with StagehandBrowserPlugin( + stagehand_options=StagehandOptions(env='LOCAL', local_ready_timeout_s=20.0, model_api_key='test_model_key') + ): + pass + + call_kwargs = stagehand_mock.call_args.kwargs + assert call_kwargs['server'] == 'local' + assert call_kwargs['local_ready_timeout_s'] == 20.0 + assert call_kwargs['model_api_key'] == 'test_model_key' + assert 'browserbase_api_key' not in call_kwargs + assert 'browserbase_project_id' not in call_kwargs + + # In local environment, the plugin should set `local_chrome_path` to the path of the Playwright Chromium executable. 
+ assert 'local_chrome_path' in call_kwargs + + +async def test_stagehand_called_with_browserbase_params(stagehand_client_mock: MagicMock) -> None: + with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock: + async with StagehandBrowserPlugin( + stagehand_options=StagehandOptions( + env='BROWSERBASE', + browserbase_api_key='test_key', + project_id='test_project_id', + model_api_key='test_model_key', + ) + ): + pass + + call_kwargs = stagehand_mock.call_args.kwargs + assert call_kwargs['server'] == 'remote' + assert call_kwargs['browserbase_api_key'] == 'test_key' + assert call_kwargs['browserbase_project_id'] == 'test_project_id' + assert call_kwargs['model_api_key'] == 'test_model_key' + + # In Browserbase environment, the plugin should NOT set `local_chrome_path` as it's not used. + assert 'local_chrome_path' not in call_kwargs + + +async def test_methods_raise_error_when_not_active() -> None: + plugin = StagehandBrowserPlugin() + + assert plugin.active is False + + with pytest.raises(RuntimeError, match=r'Plugin is not active'): + await plugin.new_browser() + + with pytest.raises(RuntimeError, match=r'Plugin is already active.'): + async with plugin, plugin: + pass + + async with plugin: + assert plugin.active is True + + +async def test_new_browser(plugin: StagehandBrowserPlugin) -> None: + browser_controller = await plugin.new_browser() + + assert isinstance(browser_controller, StagehandBrowserController) + + assert browser_controller.is_browser_connected + assert browser_controller.has_free_capacity + + await browser_controller.close() + + +async def test_multiple_new_browsers(plugin: StagehandBrowserPlugin) -> None: + browser_controller_1 = await plugin.new_browser() + browser_controller_2 = await plugin.new_browser() + + assert browser_controller_1 is not browser_controller_2 diff --git a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py new file 
mode 100644 index 0000000000..67e76dfb0b --- /dev/null +++ b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from playwright.async_api import async_playwright +from stagehand import AsyncSession, AsyncStagehand + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserPlugin, + StagehandBrowserPlugin, + StagehandOptions, + StagehandPage, +) +from crawlee.crawlers import ( + PlaywrightCrawlingContext, + StagehandCrawler, + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, +) + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from yarl import URL + + +_PLUGIN_MODULE = 'crawlee.browsers._stagehand_browser_plugin' + + +@pytest.fixture +def stagehand_session_mock() -> MagicMock: + session = MagicMock(spec=AsyncSession) + session.data = MagicMock() + session.data.cdp_url = 'ws://fake-cdp' + session.end = AsyncMock() + return session + + +@pytest.fixture +async def patched_crawler(stagehand_session_mock: MagicMock) -> AsyncGenerator[StagehandCrawler, None]: + """StagehandCrawler with real Playwright but Stagehand session mocked.""" + stagehand_client = MagicMock(spec=AsyncStagehand) + stagehand_client.sessions.start = AsyncMock(return_value=stagehand_session_mock) + stagehand_client.__aenter__ = AsyncMock(return_value=stagehand_client) + stagehand_client.__aexit__ = AsyncMock() + + async with async_playwright() as playwright: + browser = await playwright.chromium.launch() + # Stagehand's CDP always provides a pre-existing context; simulate it. 
+ await browser.new_context() + + mock_playwright_cm = AsyncMock() + mock_playwright_cm.__aenter__ = AsyncMock(return_value=playwright) + mock_playwright_cm.__aexit__ = AsyncMock() + + with ( + patch(f'{_PLUGIN_MODULE}.AsyncStagehand', return_value=stagehand_client), + patch(f'{_PLUGIN_MODULE}.async_playwright', return_value=mock_playwright_cm), + patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)), + ): + yield StagehandCrawler() + + +def test_init_raises_with_browser_pool_and_params() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin()]) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, stagehand_options=StagehandOptions()) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, headless=True) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, max_open_pages_per_browser=5) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, browser_launch_options={'headless': True}) + + +def test_init_raises_with_mixed_plugins_in_browser_pool() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin(), PlaywrightBrowserPlugin()]) + + with pytest.raises(ValueError, match=r'All BrowserPool plugins must be StagehandBrowserPlugin instances'): + StagehandCrawler(browser_pool=pool) + + +def test_init_with_valid_browser_pool() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin()]) + StagehandCrawler(browser_pool=pool) + + +async def test_context_type(patched_crawler: StagehandCrawler, server_url: URL) -> None: + handler_mock = AsyncMock() + pre_navigation_hook_mock = AsyncMock() + post_navigation_hook_mock = AsyncMock() + patched_crawler.pre_navigation_hook(pre_navigation_hook_mock) + patched_crawler.post_navigation_hook(post_navigation_hook_mock) + patched_crawler.router.default_handler(handler_mock) + + await 
patched_crawler.run([str(server_url)]) + + context = handler_mock.call_args[0][0] + assert isinstance(context, StagehandCrawlingContext) + assert isinstance(context, PlaywrightCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_pre_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None: + pre_nav_hook_mock = AsyncMock() + patched_crawler.pre_navigation_hook(pre_nav_hook_mock) + patched_crawler.router.default_handler(AsyncMock()) + + await patched_crawler.run([str(server_url)]) + + pre_nav_hook_mock.assert_called_once() + context = pre_nav_hook_mock.call_args[0][0] + assert isinstance(context, StagehandPreNavCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_post_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None: + post_nav_hook_mock = AsyncMock() + patched_crawler.post_navigation_hook(post_nav_hook_mock) + patched_crawler.router.default_handler(AsyncMock()) + + await patched_crawler.run([str(server_url)]) + + post_nav_hook_mock.assert_called_once() + context = post_nav_hook_mock.call_args[0][0] + assert isinstance(context, StagehandPostNavCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_stagehand_page_ai_methods_delegate_to_session( + patched_crawler: StagehandCrawler, + stagehand_session_mock: MagicMock, + server_url: URL, +) -> None: + @patched_crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + await context.page.act(instruction='click button') + await context.page.extract(instruction='get title') + await context.page.observe(instruction='find links') + await context.page.execute(instruction='run script') + + await patched_crawler.run([str(server_url)]) + + for method_name in ('act', 'extract', 'observe', 'execute'): + method_mock = getattr(stagehand_session_mock, method_name) + method_mock.assert_awaited_once() + assert 
isinstance(method_mock.call_args.kwargs['page'], StagehandPage) + + assert 'instruction' in method_mock.call_args.kwargs From 33503cb04c1501ea9759a2fddac51a3aa401bab4 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 3 May 2026 22:08:44 +0000 Subject: [PATCH 07/10] add docs and fingerprint headers --- docs/guides/architecture_overview.mdx | 35 +++- .../stagehand_crawler/basic_example.py | 47 +++++ .../stagehand_crawler/browserbase_example.py | 37 ++++ docs/guides/stagehand_crawler.mdx | 124 +++++++++++++ pyproject.toml | 2 + .../browsers/_stagehand_browser_controller.py | 174 ++++++++++++------ .../crawlers/_stagehand/_stagehand_crawler.py | 19 +- .../test_stagehand_browser_controller.py | 16 +- uv.lock | 4 + 9 files changed, 385 insertions(+), 73 deletions(-) create mode 100644 docs/guides/code_examples/stagehand_crawler/basic_example.py create mode 100644 docs/guides/code_examples/stagehand_crawler/browserbase_example.py create mode 100644 docs/guides/stagehand_crawler.mdx diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index 0f1b235b60..4fefc0354e 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -53,6 +53,8 @@ class PlaywrightCrawler class AdaptivePlaywrightCrawler +class StagehandCrawler + %% ======================== %% Inheritance arrows %% ======================== @@ -63,6 +65,7 @@ BasicCrawler --|> AdaptivePlaywrightCrawler AbstractHttpCrawler --|> HttpCrawler AbstractHttpCrawler --|> ParselCrawler AbstractHttpCrawler --|> BeautifulSoupCrawler +PlaywrightCrawler --|> StagehandCrawler ``` ### HTTP crawlers @@ -79,7 +82,19 @@ You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawl ### Browser crawlers -Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. 
Currently, the only browser crawler is `PlaywrightCrawler`, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about `PlaywrightCrawler`, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler). +Browser crawlers use a real browser to render pages, enabling scraping of sites that require +JavaScript. They manage browser instances, pages, and context lifecycles. Crawlee provides +two browser crawlers: + +- `PlaywrightCrawler` utilizes the + [Playwright](https://playwright.dev/) library and provides a high-level API for controlling + and navigating browsers. You can learn more about it in the + [Playwright crawler guide](./playwright-crawler). +- `StagehandCrawler` extends + `PlaywrightCrawler` with AI-powered browser automation via + [Stagehand](https://github.com/browserbase/stagehand). It adds natural-language methods + (`act`, `extract`, `observe`, `execute`) directly on the page object. You can learn more + about it in the [Stagehand crawler guide](./stagehand-crawler). 
### Adaptive crawler @@ -122,6 +137,12 @@ class AdaptivePlaywrightPreNavCrawlingContext class AdaptivePlaywrightCrawlingContext +class StagehandPreNavCrawlingContext + +class StagehandPostNavCrawlingContext + +class StagehandCrawlingContext + %% ======================== %% Inheritance arrows %% ======================== @@ -143,6 +164,12 @@ PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext + +PlaywrightPreNavCrawlingContext --|> StagehandPreNavCrawlingContext + +StagehandPreNavCrawlingContext --|> StagehandPostNavCrawlingContext + +StagehandPostNavCrawlingContext --|> StagehandCrawlingContext ``` They have a similar inheritance structure as the crawlers, with the base class being `BasicCrawlingContext`. The specific crawling contexts are: @@ -154,6 +181,12 @@ They have a similar inheritance structure as the crawlers, with the base class b - `PlaywrightCrawlingContext` for Playwright crawlers. - `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated. - `AdaptivePlaywrightCrawlingContext` for Adaptive Playwright crawlers. +- `StagehandPreNavCrawlingContext` + for Stagehand crawlers before the page is navigated. +- `StagehandPostNavCrawlingContext` + for Stagehand crawlers after the page is navigated. +- `StagehandCrawlingContext` + for Stagehand crawlers. 
## Storages diff --git a/docs/guides/code_examples/stagehand_crawler/basic_example.py b/docs/guides/code_examples/stagehand_crawler/basic_example.py new file mode 100644 index 0000000000..e187ee2d06 --- /dev/null +++ b/docs/guides/code_examples/stagehand_crawler/basic_example.py @@ -0,0 +1,47 @@ +import asyncio +from typing import cast + +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext + + +async def main() -> None: + crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + model_api_key='your-openai-api-key', + model='openai/gpt-4.1-mini', + ), + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Dismiss overlays or interact with the page using natural language. + await context.page.act(instruction='Click the accept cookies button if present') + + # Extract data from the page using AI. + extracted = await context.page.extract( + instruction='Get the page title and the main heading text', + schema={ + 'type': 'object', + 'properties': { + 'title': {'type': 'string'}, + 'heading': {'type': 'string'}, + }, + }, + ) + + extract_result = extracted.data.result + + if isinstance(extract_result, dict): + # Push extracted data to the dataset + # Use `cast()` to provide a more specific type hint for the extracted data. 
+ await context.push_data(cast('dict[str, str | None]', extract_result)) + + await crawler.run(['https://example.com']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/stagehand_crawler/browserbase_example.py b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py new file mode 100644 index 0000000000..ac465aba73 --- /dev/null +++ b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py @@ -0,0 +1,37 @@ +import asyncio +from typing import cast + +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext + + +async def main() -> None: + # Use Browserbase cloud browser instead of a local Chromium instance. + crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + env='BROWSERBASE', + browserbase_api_key='your-browserbase-api-key', + project_id='your-project-id', + model_api_key='your-openai-api-key', + model='openai/gpt-4.1-mini', + ), + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + extracted = await context.page.extract( + instruction='Get the main content of the page', + ) + + extract_result = extracted.data.result + + await context.push_data(cast('dict[str, str | None]', extract_result)) + + await crawler.run(['https://example.com']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/stagehand_crawler.mdx b/docs/guides/stagehand_crawler.mdx new file mode 100644 index 0000000000..b74c413995 --- /dev/null +++ b/docs/guides/stagehand_crawler.mdx @@ -0,0 +1,124 @@ +--- +id: stagehand-crawler +title: Stagehand crawler +description: Learn how to use StagehandCrawler for AI-powered browser automation and data extraction. 
+--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import BasicExample from '!!raw-loader!./code_examples/stagehand_crawler/basic_example.py'; +import BrowserbaseExample from '!!raw-loader!./code_examples/stagehand_crawler/browserbase_example.py'; + +A `StagehandCrawler` extends `PlaywrightCrawler` with AI-powered browser automation via [Stagehand](https://www.browserbase.com/stagehand). Instead of writing CSS selectors or XPath expressions, you describe what you want in plain English and the AI model takes care of the rest. + +Each page in the crawling context is a `StagehandPage` - a drop-in replacement for the standard Playwright `Page` that adds four AI methods: + +- `page.act(**kwargs)` - perform an action using a natural language instruction +- `page.extract(**kwargs)` - extract structured data from the page using AI +- `page.observe(**kwargs)` - get a list of AI-suggested actions available on the page +- `page.execute(**kwargs)` - run an autonomous multi-step agent on the page + +All standard Playwright methods remain available alongside these AI methods. + +## When to use StagehandCrawler + +Use `StagehandCrawler` when: + +- **Selectors are brittle or unknown** - the AI can locate elements by their visual role or label rather than a specific CSS class. +- **Interactions are complex** - multi-step forms, dynamic menus, or context-dependent flows that are hard to script. +- **Rapid prototyping** - you want to build a scraper quickly without spending time reverse-engineering the page structure. + +For straightforward scraping tasks where the page structure is stable and well-known, `PlaywrightCrawler` is more efficient, read more in that [guide](./playwright_crawler). 
+ +## Installation + +`StagehandCrawler` requires the `stagehand` optional dependency group: + +```bash +pip install 'crawlee[stagehand]' +``` + +or with uv: + +```bash +uv add 'crawlee[stagehand]' +``` + +## Basic usage + +The example below demonstrates the typical usage pattern: dismiss cookie banners with `act()` and extract structured data with `extract()`. + + + {BasicExample} + + +## StagehandOptions configuration + +Stagehand-specific settings are provided via `StagehandOptions`. Pass the instance to the `stagehand_options` argument of `StagehandCrawler`. + +## AI page operations + +### `act` - perform actions + +Use `act()` to interact with the page using a natural language instruction: + +```python +await context.page.act(instruction='Click the "Sign in" button') +``` + +### `extract` - structured data extraction + +Use `extract()` to pull structured data from the page. Pass a JSON Schema via schema to enforce the output shape: + +```python +data = await context.page.extract( + instruction='Extract the top comment on this page', + schema={ + 'type': 'object', + 'properties': { + 'comment_text': {'type': 'string'}, + 'author': {'type': 'string'}, + }, + 'required': ['comment_text'], + }, +) +``` + +### `observe` - inspect available actions + +Use `observe()` to get AI-suggested actions currently available on the page. Useful for debugging or building adaptive workflows: + +```python +actions = await context.page.observe( + instruction='What actions are available in the navigation menu?' +) +``` + +### `execute` - autonomous multi-step agent + +Use `execute()` for longer autonomous tasks that span multiple interactions: + +```python +result = await context.page.execute( + instruction='Search for "web scraping" and return the titles of the first five results', +) +``` + +## Browserbase integration + +By default, Stagehand launches a local Chromium browser. 
To use [Browserbase](https://www.browserbase.com/) - a managed cloud browser service - set `env='BROWSERBASE'` in `StagehandOptions` and supply the required credentials: + + + {BrowserbaseExample} + + +Browserbase credentials (`browserbase_api_key`, `project_id`) can also be provided via the `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` environment variables. + +## Browser configuration limitations + +Because Stagehand manages the browser session internally via CDP, only Chromium is supported. Browser settings are limited to the subset accepted by Stagehand's `BrowserLaunchOptions` - `headless`, `args`, `viewport`, `proxy`, `locale`, `executable_path`, and a few others. Features like fingerprint generation and incognito pages are not supported. + +## Conclusion + +This guide introduced `StagehandCrawler` and its AI page operations: `act()`, `extract()`, `observe()`, and `execute()`. You learned how to configure Stagehand via `StagehandOptions` and switch to Browserbase for cloud browser execution. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/pyproject.toml b/pyproject.toml index ed0de57607..4900e7fd07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,8 @@ sql_postgres = [ stagehand = [ "stagehand>=3.19.0", "playwright>=1.27.0", + "apify_fingerprint_datapoints>=0.0.2", + "browserforge>=1.2.3", ] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index 7052dd9585..7ad363d5e6 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -10,6 +10,8 @@ from crawlee._utils.docs import docs_group from crawlee.browsers._browser_controller import BrowserController from crawlee.browsers._stagehand_types import StagehandPage +from crawlee.fingerprint_suite import HeaderGenerator +from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type if TYPE_CHECKING: from collections.abc import Mapping @@ -36,6 +38,7 @@ class StagehandBrowserController(BrowserController): """ AUTOMATION_LIBRARY = 'stagehand' + _DEFAULT_HEADER_GENERATOR = HeaderGenerator() def __init__( self, @@ -44,6 +47,7 @@ def __init__( stagehand_client: AsyncStagehand, stagehand_options: StagehandOptions, max_open_pages_per_browser: int = 20, + header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, ) -> None: """Initialize a new instance. @@ -51,18 +55,21 @@ def __init__( playwright: Active Playwright instance used to connect to the browser via CDP. stagehand_client: Active Stagehand REST client used to start and end sessions. stagehand_options: Stagehand-specific configuration (model, env, self-heal, etc.). - browser_launch_options: Browser options built by the plugin. max_open_pages_per_browser: Maximum number of pages that can be open at the same time. 
+ header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for + requests made by the browser. By default, a predefined header generator is used. Set to `None` to + disable automatic header modifications. """ self._playwright = playwright self._stagehand_client = stagehand_client self._stagehand_options = stagehand_options self._max_open_pages_per_browser = max_open_pages_per_browser + self._header_generator = header_generator self._session: AsyncSession | None = None self._browser: Browser | None = None self._browser_context: BrowserContext | None = None - self._session_init_lock = Lock() + self._context_creation_lock = Lock() self._pages = list[StagehandPage]() self._total_opened_pages = 0 @@ -102,7 +109,7 @@ def has_free_capacity(self) -> bool: @property @override def is_browser_connected(self) -> bool: - # Session not yet started — controller is available for new pages. + # Session not yet started - controller is available for new pages. return self._browser is None or self._browser.is_connected() @property @@ -138,7 +145,13 @@ async def new_page( try: # Lazily start a Stagehand session on the first page request, passing proxy and context options # at that point. 
- await self._ensure_session(browser_new_context_options, proxy_info) + if not self._browser_context: + async with self._context_creation_lock: + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) if self._browser is None or self._session is None or self._browser_context is None: raise RuntimeError('Failed to initialize the browser session.') @@ -171,68 +184,107 @@ async def close(self, *, force: bool = False) -> None: if self._browser is not None and self._browser.is_connected(): await self._browser.close() + self._session = None + self._browser_context = None + def _on_page_close(self, page: StagehandPage) -> None: self._pages.remove(page) - async def _ensure_session( + def _get_extra_http_headers( + self, + extra_http_headers: Mapping[str, str] | None = None, + ) -> dict[str, str]: + if extra_http_headers: + return dict(extra_http_headers) + + if self._header_generator: + generated_headers = self._header_generator.get_specific_headers( + header_names={ + 'Accept', + 'Accept-Language', + 'User-Agent', + 'sec-ch-ua', + 'sec-ch-ua-mobile', + 'sec-ch-ua-platform', + }, + browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type), + ) + else: + generated_headers = {} + + return dict(generated_headers) + + def _build_session_start_params( + self, + browser_new_context_options: dict[str, Any], + proxy_info: ProxyInfo | None = None, + ) -> dict[str, Any]: + session_start_params: dict[str, Any] = { + 'model_name': self._stagehand_options.model, + 'verbose': self._stagehand_options.verbose, + 'self_heal': self._stagehand_options.self_heal, + } + launch_options = dict(browser_new_context_options) + + if self._stagehand_options.dom_settle_timeout_ms is not None: + session_start_params['dom_settle_timeout_ms'] = self._stagehand_options.dom_settle_timeout_ms + + if self._stagehand_options.system_prompt is not None: + 
session_start_params['system_prompt'] = self._stagehand_options.system_prompt + + if self._stagehand_options.env == 'LOCAL': + if proxy_info: + launch_options['proxy'] = { + 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + session_start_params['browser'] = {'type': 'local', 'launch_options': launch_options} + else: + session_start_params['browser'] = {'type': 'browserbase', 'launch_options': launch_options} + if proxy_info: + session_start_params['browserbase_session_create_params'] = { + 'proxies': [ + { + 'type': 'external', + 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + ] + } + + return session_start_params + + async def _create_browser_context( self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None, - ) -> None: + ) -> BrowserContext: + browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} - if self._session is not None: - return + extra_http_headers = self._get_extra_http_headers(browser_new_context_options.pop('extra_http_headers', None)) + + session_start_params = self._build_session_start_params( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + + session: AsyncSession = await self._stagehand_client.sessions.start(**session_start_params) + + cdp_url = session.data.cdp_url + if not cdp_url: + raise RuntimeError( + f'No cdp_url returned from Stagehand (env={self._stagehand_options.env!r}). ' + 'Cannot connect Playwright to the browser.' 
+ ) + + self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url) + + context = self._browser.contexts[0] + + await context.set_extra_http_headers(extra_http_headers) + + self._session = session - async with self._session_init_lock: - # Double-check if the session was created while waiting for the lock. - if self._session is not None: - return - - browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} - - # Parameters for sessions.start() — AI model settings from stagehand_options. - session_start_params: dict[str, Any] = { - 'model_name': self._stagehand_options.model, - 'verbose': self._stagehand_options.verbose, - 'self_heal': self._stagehand_options.self_heal, - } - if self._stagehand_options.dom_settle_timeout_ms is not None: - session_start_params['dom_settle_timeout_ms'] = self._stagehand_options.dom_settle_timeout_ms - - if self._stagehand_options.system_prompt is not None: - session_start_params['system_prompt'] = self._stagehand_options.system_prompt - - if self._stagehand_options.env == 'LOCAL': - if proxy_info: - browser_new_context_options['proxy'] = { - 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', - 'username': proxy_info.username or '', - 'password': proxy_info.password or '', - } - session_start_params['browser'] = {'type': 'local', 'launch_options': browser_new_context_options} - else: - session_start_params['browser'] = {'type': 'browserbase'} - if proxy_info: - session_start_params['browserbase_session_create_params'] = { - 'proxies': [ - { - 'type': 'external', - 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', - 'username': proxy_info.username or '', - 'password': proxy_info.password or '', - } - ] - } - - session: AsyncSession = await self._stagehand_client.sessions.start(**session_start_params) - - cdp_url = session.data.cdp_url - if not cdp_url: - raise RuntimeError( - f'No cdp_url returned from Stagehand 
(env={self._stagehand_options.env!r}). ' - 'Cannot connect Playwright to the browser.' - ) - - self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url) - self._browser_context = self._browser.contexts[0] - self._session = session + return context diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index 62edb0231f..76b5e8863d 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -38,11 +38,11 @@ class StagehandCrawler( `StagehandCrawler` builds on top of `PlaywrightCrawler`, inheriting all of its features. It uses `StagehandBrowserPlugin` to manage Stagehand sessions. Stagehand creates and manages - the browser instance — either locally via a bundled Chromium binary, or remotely via Browserbase - cloud — and Playwright connects to it via the Chrome DevTools Protocol (CDP). + the browser instance - either locally via a bundled Chromium binary, or remotely via Browserbase + cloud - and Playwright connects to it via the Chrome DevTools Protocol (CDP). Because Stagehand relies on CDP, only Chromium is supported. Not all Playwright browser and - context configuration options are available — browser settings are limited to the subset accepted + context configuration options are available - browser settings are limited to the subset accepted by Stagehand's ``BrowserLaunchOptions`` (such as ``headless``, ``args``, ``viewport``, ``proxy``, ``locale``, and ``executable_path``). Features like fingerprint generation and incognito pages are not supported. 
@@ -50,10 +50,10 @@ class StagehandCrawler( Each page in the crawling context is a `StagehandPage`, which extends the standard Playwright `Page` with the following AI methods: - - `page.act(**kwargs)` — perform an action on the page using natural language - - `page.extract(**kwargs)` — extract structured data from the page with AI - - `page.observe(**kwargs)` — get AI-suggested actions available on the page - - `page.execute(**kwargs)` — run an autonomous multi-step agent + - `page.act(**kwargs)` - perform an action on the page using natural language + - `page.extract(**kwargs)` - extract structured data from the page with AI + - `page.observe(**kwargs)` - get AI-suggested actions available on the page + - `page.execute(**kwargs)` - run an autonomous multi-step agent Stagehand configuration (model, API key, environment) is provided via `stagehand_options`. By default, the crawler runs locally using the `openai/gpt-4.1-mini` model. @@ -62,8 +62,7 @@ class StagehandCrawler( ```python import asyncio - from crawlee.crawlers import StagehandCrawler - from crawlee.crawlers._stagehand import StagehandCrawlingContext + from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext from crawlee.browsers import StagehandOptions crawler = StagehandCrawler( @@ -127,7 +126,7 @@ def __init__( effect on the first page; subsequent pages reuse the existing session context. Cannot be specified if `browser_pool` is provided. goto_options: Additional options passed to Stagehand's `Page.goto()`. The `timeout` - option is not supported — use `navigation_timeout` instead. + option is not supported - use `navigation_timeout` instead. navigation_timeout: Timeout for the navigation phase (from opening the page to calling the request handler). Defaults to one minute. max_open_pages_per_browser: Maximum number of pages open per browser instance. 
diff --git a/tests/unit/browsers/test_stagehand_browser_controller.py b/tests/unit/browsers/test_stagehand_browser_controller.py index dfd08ab614..fda9bf6a81 100644 --- a/tests/unit/browsers/test_stagehand_browser_controller.py +++ b/tests/unit/browsers/test_stagehand_browser_controller.py @@ -15,6 +15,8 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator + from yarl import URL + @pytest.fixture async def playwright() -> AsyncGenerator[Playwright, None]: @@ -259,7 +261,7 @@ async def test_session_start_params_browserbase( await controller.close() call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs - assert call_kwargs['browser'] == {'type': 'browserbase'} + assert call_kwargs['browser'] == {'type': 'browserbase', 'launch_options': {}} assert 'browserbase_session_create_params' not in call_kwargs @@ -337,3 +339,15 @@ async def test_proxy_set_browserbase( assert browserbase_proxy_options['proxies'][0]['server'] == 'http://proxy.example.com:8080' assert browserbase_proxy_options['proxies'][0]['username'] == 'user' assert browserbase_proxy_options['proxies'][0]['password'] == 'pass' + + +async def test_fingerprint_headers_set_on_new_page(controller: StagehandBrowserController, server_url: URL) -> None: + page = await controller.new_page() + + response = await page.goto(str(server_url / 'headers')) + + assert response is not None + + response_json = await response.json() + + assert 'Headless' not in response_json['user-agent'] diff --git a/uv.lock b/uv.lock index f07e49d5c0..3c215b99a8 100644 --- a/uv.lock +++ b/uv.lock @@ -881,6 +881,8 @@ sql-sqlite = [ { name = "sqlalchemy", extra = ["asyncio"] }, ] stagehand = [ + { name = "apify-fingerprint-datapoints" }, + { name = "browserforge" }, { name = "playwright" }, { name = "stagehand" }, ] @@ -921,12 +923,14 @@ requires-dist = [ { name = "apify-fingerprint-datapoints", marker = "extra == 'adaptive-crawler'", specifier = ">=0.0.3" }, { name = "apify-fingerprint-datapoints", marker = "extra == 
'httpx'", specifier = ">=0.0.2" }, { name = "apify-fingerprint-datapoints", marker = "extra == 'playwright'", specifier = ">=0.0.2" }, + { name = "apify-fingerprint-datapoints", marker = "extra == 'stagehand'", specifier = ">=0.0.2" }, { name = "async-timeout", specifier = ">=5.0.1" }, { name = "asyncpg", marker = "extra == 'sql-postgres'", specifier = ">=0.24.0" }, { name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'beautifulsoup'", specifier = ">=4.12.0" }, { name = "browserforge", marker = "extra == 'adaptive-crawler'", specifier = ">=1.2.4" }, { name = "browserforge", marker = "extra == 'httpx'", specifier = ">=1.2.3" }, { name = "browserforge", marker = "extra == 'playwright'", specifier = ">=1.2.3" }, + { name = "browserforge", marker = "extra == 'stagehand'", specifier = ">=1.2.3" }, { name = "cachetools", specifier = ">=5.5.0" }, { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, From 4115ae2a9192c7957ac103a941bc64cd9cf78325 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 3 May 2026 22:46:52 +0000 Subject: [PATCH 08/10] fix docs --- docs/guides/stagehand_crawler.mdx | 2 +- src/crawlee/browsers/_stagehand_types.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/guides/stagehand_crawler.mdx b/docs/guides/stagehand_crawler.mdx index b74c413995..8a645a6a06 100644 --- a/docs/guides/stagehand_crawler.mdx +++ b/docs/guides/stagehand_crawler.mdx @@ -29,7 +29,7 @@ Use `StagehandCrawler` when: - **Interactions are complex** - multi-step forms, dynamic menus, or context-dependent flows that are hard to script. - **Rapid prototyping** - you want to build a scraper quickly without spending time reverse-engineering the page structure. -For straightforward scraping tasks where the page structure is stable and well-known, `PlaywrightCrawler` is more efficient, read more in that [guide](./playwright_crawler). 
+For straightforward scraping tasks where the page structure is stable and well-known, `PlaywrightCrawler` is more efficient, read more in that [guide](./playwright-crawler). ## Installation diff --git a/src/crawlee/browsers/_stagehand_types.py b/src/crawlee/browsers/_stagehand_types.py index 0b30f2ad73..0a614b1f89 100644 --- a/src/crawlee/browsers/_stagehand_types.py +++ b/src/crawlee/browsers/_stagehand_types.py @@ -5,6 +5,8 @@ from playwright.async_api import Page +from crawlee._utils.docs import docs_group + if TYPE_CHECKING: from stagehand import AsyncSession from stagehand.types.session_act_params import SessionActParams @@ -19,6 +21,7 @@ @dataclass +@docs_group('Browser management') class StagehandOptions: """Configuration options for Stagehand AI-powered browser automation. @@ -66,6 +69,7 @@ class StagehandOptions: """Maximum time to wait for the local Stagehand browser to be ready, in seconds.""" +@docs_group('Browser management') class StagehandPage(Page): """A Playwright `Page` enhanced with Stagehand AI methods. From 476aa4fb200cea815a50a8d57e4631ed43b15922 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 4 May 2026 00:51:59 +0000 Subject: [PATCH 09/10] fixes --- docs/guides/stagehand_crawler.mdx | 7 ++++- .../browsers/_stagehand_browser_controller.py | 28 ++++++++++--------- .../crawlers/_stagehand/_stagehand_crawler.py | 13 +++------ 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/docs/guides/stagehand_crawler.mdx b/docs/guides/stagehand_crawler.mdx index 8a645a6a06..b8dbd2b7b5 100644 --- a/docs/guides/stagehand_crawler.mdx +++ b/docs/guides/stagehand_crawler.mdx @@ -117,7 +117,12 @@ Browserbase credentials (`browserbase_api_key`, `project_id`) can also be provid ## Browser configuration limitations -Because Stagehand manages the browser session internally via CDP, only Chromium is supported. 
Browser settings are limited to the subset accepted by Stagehand's `BrowserLaunchOptions` - `headless`, `args`, `viewport`, `proxy`, `locale`, `executable_path`, and a few others. Features like fingerprint generation and incognito pages are not supported. +Because Stagehand manages the browser session internally via CDP, only Chromium is supported. +Browser settings are limited to the subset accepted by Stagehand's `BrowserLaunchOptions` - +`headless`, `args`, `viewport`, `proxy`, `locale`, `executable_path`, and a few others. +Features like full browser fingerprinting (canvas, WebGL, screen properties) and incognito +pages are not supported. Fingerprint-consistent HTTP headers (`User-Agent`, `Accept`, `sec-ch-ua`) +are still injected automatically. ## Conclusion diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py index 7ad363d5e6..950c3b0823 100644 --- a/src/crawlee/browsers/_stagehand_browser_controller.py +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -126,8 +126,8 @@ async def new_page( """Create a new page in the Stagehand-managed browser. On the first call, starts the Stagehand session with the provided options. On subsequent - calls, ``browser_new_context_options`` are ignored with a warning because the browser - context cannot be reconfigured once the session is running. + calls, ``browser_new_context_options`` and ``proxy_info`` are ignored because the session + context cannot be reconfigured once it is running. 
Args: browser_new_context_options: Options merged on top of the plugin's launch options @@ -173,19 +173,21 @@ async def close(self, *, force: bool = False) -> None: if self.pages_count > 0 and not force: raise ValueError('Cannot close the browser while there are open pages.') - if self._session is None: + if self._session is None and self._browser is None: return try: - await self._session.end() - except Exception: - logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) - - if self._browser is not None and self._browser.is_connected(): - await self._browser.close() - - self._session = None - self._browser_context = None + if self._session is not None: + try: + await self._session.end() + except Exception: + logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) + + if self._browser is not None and self._browser.is_connected(): + await self._browser.close() + finally: + self._session = None + self._browser_context = None def _on_page_close(self, page: StagehandPage) -> None: self._pages.remove(page) @@ -194,7 +196,7 @@ def _get_extra_http_headers( self, extra_http_headers: Mapping[str, str] | None = None, ) -> dict[str, str]: - if extra_http_headers: + if extra_http_headers is not None: return dict(extra_http_headers) if self._header_generator: diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py index 76b5e8863d..20cce39475 100644 --- a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -44,8 +44,9 @@ class StagehandCrawler( Because Stagehand relies on CDP, only Chromium is supported. Not all Playwright browser and context configuration options are available - browser settings are limited to the subset accepted by Stagehand's ``BrowserLaunchOptions`` (such as ``headless``, ``args``, ``viewport``, ``proxy``, - ``locale``, and ``executable_path``). 
Features like fingerprint generation and incognito pages - are not supported. + ``locale``, and ``executable_path``). Full browser fingerprinting (canvas, WebGL, screen + properties) and incognito pages are not supported; fingerprint-consistent HTTP headers + (``User-Agent``, ``Accept``, ``sec-ch-ua``) are still injected automatically. Each page in the crawling context is a `StagehandPage`, which extends the standard Playwright `Page` with the following AI methods: @@ -102,7 +103,6 @@ def __init__( browser_new_context_options: dict[str, Any] | None = None, goto_options: GotoOptions | None = None, navigation_timeout: timedelta | None = None, - max_open_pages_per_browser: int | None = None, **kwargs: Unpack[BasicCrawlerOptions[StagehandCrawlingContext, StatisticsState]], ) -> None: """Initialize a new instance. @@ -129,8 +129,6 @@ def __init__( option is not supported - use `navigation_timeout` instead. navigation_timeout: Timeout for the navigation phase (from opening the page to calling the request handler). Defaults to one minute. - max_open_pages_per_browser: Maximum number of pages open per browser instance. - Cannot be specified if `browser_pool` is provided. kwargs: Additional keyword arguments forwarded to `BasicCrawler`. """ if browser_pool is not None: @@ -143,13 +141,11 @@ def __init__( headless, browser_launch_options, browser_new_context_options, - max_open_pages_per_browser, ) ): raise ValueError( 'Cannot specify `stagehand_options`, `user_data_dir`, `headless`, ' - '`browser_launch_options`, `browser_new_context_options` or ' - '`max_open_pages_per_browser` when `browser_pool` is provided.' + '`browser_launch_options`, `browser_new_context_options` when `browser_pool` is provided.' 
) else: launch_options = dict(browser_launch_options or {}) @@ -163,7 +159,6 @@ def __init__( user_data_dir=user_data_dir, browser_launch_options=launch_options or None, browser_new_context_options=browser_new_context_options, - max_open_pages_per_browser=max_open_pages_per_browser or 20, ) ] ) From 96458083f9f6d368f7ddedb536726f6112452521 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 4 May 2026 01:03:42 +0000 Subject: [PATCH 10/10] fix test --- tests/unit/crawlers/_stagehand/test_stagehand_crawler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py index 67e76dfb0b..a036e8a0fc 100644 --- a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py +++ b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py @@ -74,9 +74,6 @@ def test_init_raises_with_browser_pool_and_params() -> None: with pytest.raises(ValueError, match=r'Cannot specify'): StagehandCrawler(browser_pool=pool, headless=True) - with pytest.raises(ValueError, match=r'Cannot specify'): - StagehandCrawler(browser_pool=pool, max_open_pages_per_browser=5) - with pytest.raises(ValueError, match=r'Cannot specify'): StagehandCrawler(browser_pool=pool, browser_launch_options={'headless': True})