diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx
index 0f1b235b60..4fefc0354e 100644
--- a/docs/guides/architecture_overview.mdx
+++ b/docs/guides/architecture_overview.mdx
@@ -53,6 +53,8 @@ class PlaywrightCrawler
class AdaptivePlaywrightCrawler
+class StagehandCrawler
+
%% ========================
%% Inheritance arrows
%% ========================
@@ -63,6 +65,7 @@ BasicCrawler --|> AdaptivePlaywrightCrawler
AbstractHttpCrawler --|> HttpCrawler
AbstractHttpCrawler --|> ParselCrawler
AbstractHttpCrawler --|> BeautifulSoupCrawler
+PlaywrightCrawler --|> StagehandCrawler
```
### HTTP crawlers
@@ -79,7 +82,19 @@ You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawl
### Browser crawlers
-Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is `PlaywrightCrawler`, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about `PlaywrightCrawler`, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler).
+Browser crawlers use a real browser to render pages, enabling scraping of sites that require
+JavaScript. They manage browser instances, pages, and context lifecycles. Crawlee provides
+two browser crawlers:
+
+- `PlaywrightCrawler` utilizes the
+ [Playwright](https://playwright.dev/) library and provides a high-level API for controlling
+ and navigating browsers. You can learn more about it in the
+ [Playwright crawler guide](./playwright-crawler).
+- `StagehandCrawler` extends
+ `PlaywrightCrawler` with AI-powered browser automation via
+ [Stagehand](https://github.com/browserbase/stagehand). It adds natural-language methods
+ (`act`, `extract`, `observe`, `execute`) directly on the page object. You can learn more
+ about it in the [Stagehand crawler guide](./stagehand-crawler).
### Adaptive crawler
@@ -122,6 +137,12 @@ class AdaptivePlaywrightPreNavCrawlingContext
class AdaptivePlaywrightCrawlingContext
+class StagehandPreNavCrawlingContext
+
+class StagehandPostNavCrawlingContext
+
+class StagehandCrawlingContext
+
%% ========================
%% Inheritance arrows
%% ========================
@@ -143,6 +164,12 @@ PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext
BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext
ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext
+
+PlaywrightPreNavCrawlingContext --|> StagehandPreNavCrawlingContext
+
+StagehandPreNavCrawlingContext --|> StagehandPostNavCrawlingContext
+
+StagehandPostNavCrawlingContext --|> StagehandCrawlingContext
```
They have a similar inheritance structure as the crawlers, with the base class being `BasicCrawlingContext`. The specific crawling contexts are:
@@ -154,6 +181,12 @@ They have a similar inheritance structure as the crawlers, with the base class b
- `PlaywrightCrawlingContext` for Playwright crawlers.
- `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated.
- `AdaptivePlaywrightCrawlingContext` for Adaptive Playwright crawlers.
+- `StagehandPreNavCrawlingContext`
+ for Stagehand crawlers before the page is navigated.
+- `StagehandPostNavCrawlingContext`
+ for Stagehand crawlers after the page is navigated.
+- `StagehandCrawlingContext`
+ for Stagehand crawlers.
## Storages
diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py b/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py
deleted file mode 100644
index 67b76f1f47..0000000000
--- a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Any, cast
-
-from stagehand.context import StagehandContext
-from typing_extensions import override
-
-from crawlee.browsers import (
- PlaywrightBrowserController,
- PlaywrightBrowserPlugin,
- PlaywrightPersistentBrowser,
-)
-
-from .support_classes import CrawleeStagehandPage
-
-if TYPE_CHECKING:
- from collections.abc import Mapping
-
- from playwright.async_api import Page
- from stagehand import Stagehand
-
- from crawlee.proxy_configuration import ProxyInfo
-
-
-class StagehandBrowserController(PlaywrightBrowserController):
- @override
- def __init__(
- self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any
- ) -> None:
- # Initialize with browser context instead of browser instance
- super().__init__(browser, **kwargs)
-
- self._stagehand = stagehand
- self._stagehand_context: StagehandContext | None = None
-
- @override
- async def new_page(
- self,
- browser_new_context_options: Mapping[str, Any] | None = None,
- proxy_info: ProxyInfo | None = None,
- ) -> Page:
- # Initialize browser context if not already done
- if not self._browser_context:
- self._browser_context = await self._create_browser_context(
- browser_new_context_options=browser_new_context_options,
- proxy_info=proxy_info,
- )
-
- # Initialize Stagehand context if not already done
- if not self._stagehand_context:
- self._stagehand_context = await StagehandContext.init(
- self._browser_context, self._stagehand
- )
-
- # Create a new page using Stagehand context
- page = await self._stagehand_context.new_page()
-
- pw_page = page._page # noqa: SLF001
-
- # Handle page close event
- pw_page.on(event='close', f=self._on_page_close)
-
- # Update internal state
- self._pages.append(pw_page)
- self._last_page_opened_at = datetime.now(timezone.utc)
-
- self._total_opened_pages += 1
-
- # Wrap StagehandPage to provide Playwright Page interface
- return cast('Page', CrawleeStagehandPage(page))
-
-
-class StagehandPlugin(PlaywrightBrowserPlugin):
- """Browser plugin that integrates Stagehand with Crawlee's browser management."""
-
- @override
- def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None:
- super().__init__(**kwargs)
-
- self._stagehand = stagehand
-
- @override
- async def new_browser(self) -> StagehandBrowserController:
- if not self._playwright:
- raise RuntimeError('Playwright browser plugin is not initialized.')
-
- browser = PlaywrightPersistentBrowser(
- # Stagehand can run only on a Chromium-based browser.
- self._playwright.chromium,
- self._user_data_dir,
- self._browser_launch_options,
- )
-
- # Return custom controller with Stagehand
- return StagehandBrowserController(
- browser=browser,
- stagehand=self._stagehand,
- header_generator=None,
- fingerprint_generator=None,
- )
diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py b/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py
deleted file mode 100644
index 6cf8cc2689..0000000000
--- a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import os
-from typing import cast
-
-from stagehand import StagehandConfig, StagehandPage
-
-from crawlee import ConcurrencySettings
-from crawlee.browsers import BrowserPool
-from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
-
-from .browser_classes import StagehandPlugin
-from .support_classes import CrawleeStagehand
-
-
-async def main() -> None:
- # Configure local Stagehand with Gemini model
- config = StagehandConfig(
- env='LOCAL',
- model_name='google/gemini-2.5-flash-preview-05-20',
- model_api_key=os.getenv('GEMINI_API_KEY'),
- )
-
- # Create Stagehand instance
- stagehand = CrawleeStagehand(config)
-
- # Create crawler with custom browser pool using Stagehand
- crawler = PlaywrightCrawler(
- # Limit the crawl to max requests. Remove or increase it for crawling all links.
- max_requests_per_crawl=10,
- # Custom browser pool. Gives users full control over browsers used by the crawler.
- concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10),
- browser_pool=BrowserPool(
- plugins=[
- StagehandPlugin(stagehand, browser_launch_options={'headless': True})
- ],
- ),
- )
-
- # Define the default request handler, which will be called for every request.
- @crawler.router.default_handler
- async def request_handler(context: PlaywrightCrawlingContext) -> None:
- context.log.info(f'Processing {context.request.url} ...')
-
- # Cast to StagehandPage for proper type hints in IDE
- page = cast('StagehandPage', context.page)
-
- # Use regular Playwright method
- playwright_title = await page.title()
- context.log.info(f'Playwright page title: {playwright_title}')
-
- # highlight-start
- # Use AI-powered extraction with natural language
- gemini_title = await page.extract('Extract page title')
- context.log.info(f'Gemini page title: {gemini_title}')
- # highlight-end
-
- await context.enqueue_links()
-
- # Run the crawler with the initial list of URLs.
- await crawler.run(['https://crawlee.dev/'])
-
-
-if __name__ == '__main__':
- asyncio.run(main())
diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py
deleted file mode 100644
index cccb62e989..0000000000
--- a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Any
-
-from stagehand import Stagehand, StagehandPage
-
-if TYPE_CHECKING:
- from types import TracebackType
-
-
-class CrawleeStagehandPage:
- """StagehandPage wrapper for Crawlee."""
-
- def __init__(self, page: StagehandPage) -> None:
- self._page = page
-
- async def goto(
- self,
- url: str,
- *,
- referer: str | None = None,
- timeout: int | None = None,
- wait_until: str | None = None,
- ) -> Any:
- """Navigate to the specified URL."""
- # Override goto to return navigation result that `PlaywrightCrawler` expects
- return await self._page._page.goto( # noqa: SLF001
- url,
- referer=referer,
- timeout=timeout,
- wait_until=wait_until,
- )
-
- def __getattr__(self, name: str) -> Any:
- """Delegate all other methods to the underlying StagehandPage."""
- return getattr(self._page, name)
-
- async def __aenter__(self) -> CrawleeStagehandPage:
- """Enter the context manager."""
- return self
-
- async def __aexit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- exc_traceback: TracebackType | None,
- ) -> None:
- await self._page.close()
-
-
-class CrawleeStagehand(Stagehand):
- """Stagehand wrapper for Crawlee to disable the launch of Playwright."""
-
- async def init(self) -> None:
- # Skip Stagehand's own Playwright initialization
- # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle
- self._initialized = True
diff --git a/docs/guides/code_examples/stagehand_crawler/basic_example.py b/docs/guides/code_examples/stagehand_crawler/basic_example.py
new file mode 100644
index 0000000000..f6cbc10736
--- /dev/null
+++ b/docs/guides/code_examples/stagehand_crawler/basic_example.py
@@ -0,0 +1,47 @@
+import asyncio
+from typing import cast
+
+from crawlee.browsers import StagehandOptions
+from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext
+
+
+async def main() -> None:
+ crawler = StagehandCrawler(
+ stagehand_options=StagehandOptions(
+ model_api_key='your-openai-api-key',
+ model='openai/gpt-4.1-mini',
+ ),
+ max_requests_per_crawl=5,
+ )
+
+ @crawler.router.default_handler
+ async def handler(context: StagehandCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Dismiss overlays or interact with the page using natural language.
+ await context.page.act(input='Click the accept cookies button if present')
+
+ # Extract data from the page using AI.
+ extracted = await context.page.extract(
+ instruction='Get the page title and the main heading text',
+ schema={
+ 'type': 'object',
+ 'properties': {
+ 'title': {'type': 'string'},
+ 'heading': {'type': 'string'},
+ },
+ },
+ )
+
+ extract_result = extracted.data.result
+
+ if isinstance(extract_result, dict):
+ # Push extracted data to the dataset
+ # Use `cast()` to provide a more specific type hint for the extracted data.
+ await context.push_data(cast('dict[str, str | None]', extract_result))
+
+ await crawler.run(['https://example.com'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/code_examples/stagehand_crawler/browserbase_example.py b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py
new file mode 100644
index 0000000000..ac465aba73
--- /dev/null
+++ b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py
@@ -0,0 +1,37 @@
+import asyncio
+from typing import cast
+
+from crawlee.browsers import StagehandOptions
+from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext
+
+
+async def main() -> None:
+ # Use Browserbase cloud browser instead of a local Chromium instance.
+ crawler = StagehandCrawler(
+ stagehand_options=StagehandOptions(
+ env='BROWSERBASE',
+ browserbase_api_key='your-browserbase-api-key',
+ project_id='your-project-id',
+ model_api_key='your-openai-api-key',
+ model='openai/gpt-4.1-mini',
+ ),
+ max_requests_per_crawl=5,
+ )
+
+ @crawler.router.default_handler
+ async def handler(context: StagehandCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ extracted = await context.page.extract(
+ instruction='Get the main content of the page',
+ )
+
+ extract_result = extracted.data.result
+
+ await context.push_data(cast('dict[str, str | None]', extract_result))
+
+ await crawler.run(['https://example.com'])
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/playwright_crawler_stagehand.mdx b/docs/guides/playwright_crawler_stagehand.mdx
deleted file mode 100644
index 59a34b4cd2..0000000000
--- a/docs/guides/playwright_crawler_stagehand.mdx
+++ /dev/null
@@ -1,66 +0,0 @@
----
-id: playwright-crawler-stagehand
-title: Playwright with Stagehand
-description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler.
----
-
-import ApiLink from '@site/src/components/ApiLink';
-import CodeBlock from '@theme/CodeBlock';
-
-import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py';
-import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py';
-import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py';
-
-[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic.
-
-Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider.
-
-:::info
-
-This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions.
-
-:::
-
-## Get Gemini API key
-
-You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key.
-
-## Create support classes for Stagehand
-
-To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle.
-
-Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance.
-
-Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`.
-
-
- {SupportClasses}
-
-
-## Create browser integration classes
-
-You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`.
-
-Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances.
-
-Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand.
-
-
- {BrowserClasses}
-
-
-## Create a crawler
-
-Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands:
-
-
- {StagehandRun}
-
-
-The integration works through several key components:
-- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle
-- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances
-- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities
-- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations
-
-In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors.
diff --git a/docs/guides/stagehand_crawler.mdx b/docs/guides/stagehand_crawler.mdx
new file mode 100644
index 0000000000..ca609684f9
--- /dev/null
+++ b/docs/guides/stagehand_crawler.mdx
@@ -0,0 +1,131 @@
+---
+id: stagehand-crawler
+title: Stagehand crawler
+description: Learn how to use StagehandCrawler for AI-powered browser automation and data extraction.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import BasicExample from '!!raw-loader!./code_examples/stagehand_crawler/basic_example.py';
+import BrowserbaseExample from '!!raw-loader!./code_examples/stagehand_crawler/browserbase_example.py';
+
+A `StagehandCrawler` extends `PlaywrightCrawler` with AI-powered browser automation via [Stagehand](https://www.browserbase.com/stagehand). Instead of writing CSS selectors or XPath expressions, you describe what you want in plain English and the AI model takes care of the rest.
+
+Each page in the crawling context is a `StagehandPage` - a drop-in replacement for the standard Playwright `Page` that adds four AI methods:
+
+- `page.act(**kwargs)` - perform an action using a natural language instruction
+- `page.extract(**kwargs)` - extract structured data from the page using AI
+- `page.observe(**kwargs)` - get a list of AI-suggested actions available on the page
+- `page.execute(**kwargs)` - run an autonomous multi-step agent on the page
+
+All standard Playwright methods remain available alongside these AI methods.
+
+## When to use StagehandCrawler
+
+Use `StagehandCrawler` when:
+
+- **Selectors are brittle or unknown** - the AI can locate elements by their visual role or label rather than a specific CSS class.
+- **Interactions are complex** - multi-step forms, dynamic menus, or context-dependent flows that are hard to script.
+- **Rapid prototyping** - you want to build a scraper quickly without spending time reverse-engineering the page structure.
+
+For straightforward scraping tasks where the page structure is stable and well-known, `PlaywrightCrawler` is more efficient; read more in the [Playwright crawler guide](./playwright-crawler).
+
+## Installation
+
+`StagehandCrawler` requires the `stagehand` optional dependency group:
+
+```bash
+pip install 'crawlee[stagehand]'
+```
+
+or with uv:
+
+```bash
+uv add 'crawlee[stagehand]'
+```
+
+## Basic usage
+
+The example below demonstrates the typical usage pattern: dismiss cookie banners with `act()` and extract structured data with `extract()`.
+
+
+ {BasicExample}
+
+
+## StagehandOptions configuration
+
+Stagehand-specific settings are provided via `StagehandOptions`. Pass the instance to the `stagehand_options` argument of `StagehandCrawler`.
+
+## AI page operations
+
+### `act` - perform actions
+
+Use `act()` to interact with the page using a natural language instruction:
+
+```python
+await context.page.act(input='Click the "Sign in" button')
+```
+
+### `extract` - structured data extraction
+
+Use `extract()` to pull structured data from the page. Pass a JSON Schema via the `schema` argument to enforce the output shape:
+
+```python
+data = await context.page.extract(
+ instruction='Extract the top comment on this page',
+ schema={
+ 'type': 'object',
+ 'properties': {
+ 'comment_text': {'type': 'string'},
+ 'author': {'type': 'string'},
+ },
+ 'required': ['comment_text'],
+ },
+)
+```
+
+### `observe` - inspect available actions
+
+Use `observe()` to get AI-suggested actions currently available on the page. Useful for debugging or building adaptive workflows:
+
+```python
+actions = await context.page.observe(
+ instruction='What actions are available in the navigation menu?'
+)
+```
+
+### `execute` - autonomous multi-step agent
+
+Use `execute()` for longer autonomous tasks that span multiple interactions:
+
+```python
+result = await context.page.execute(
+ agent_config={},
+ execute_options={
+ 'instruction': 'Search for "web scraping" and return the titles of the first five results',
+ })
+```
+
+## Browserbase integration
+
+By default, Stagehand launches a local Chromium browser. To use [Browserbase](https://www.browserbase.com/) - a managed cloud browser service - set `env='BROWSERBASE'` in `StagehandOptions` and supply the required credentials:
+
+
+ {BrowserbaseExample}
+
+
+Browserbase credentials (`browserbase_api_key`, `project_id`) can also be provided via the `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` environment variables.
+
+## Browser configuration limitations
+
+Because Stagehand manages the browser session internally via CDP, only Chromium is supported.
+Browser settings are limited to the subset accepted by Stagehand's `BrowserLaunchOptions` -
+`headless`, `args`, `viewport`, `proxy`, `locale`, `executable_path`, and a few others.
+Features like full browser fingerprinting (canvas, WebGL, screen properties) and incognito
+pages are not supported. Fingerprint-consistent HTTP headers (`User-Agent`, `Accept`, `sec-ch-ua`)
+are still injected automatically.
+
+## Conclusion
+
+This guide introduced `StagehandCrawler` and its AI page operations: `act()`, `extract()`, `observe()`, and `execute()`. You learned how to configure Stagehand via `StagehandOptions` and switch to Browserbase for cloud browser execution. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
diff --git a/pyproject.toml b/pyproject.toml
index 699d68dffe..006769bcdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
]
[project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"]
+all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,stagehand,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
@@ -77,6 +77,12 @@ sql_postgres = [
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
"asyncpg>=0.24.0"
]
+stagehand = [
+ "stagehand>=3.19.5",
+ "playwright>=1.27.0",
+ "apify_fingerprint_datapoints>=0.0.2",
+ "browserforge>=1.2.3",
+]
sql_sqlite = [
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
"aiosqlite>=0.21.0",
diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py
index 0ea1101df4..cef3446690 100644
--- a/src/crawlee/browsers/__init__.py
+++ b/src/crawlee/browsers/__init__.py
@@ -17,6 +17,13 @@
with _try_import(__name__, 'PlaywrightPersistentBrowser'):
from ._playwright_browser import PlaywrightPersistentBrowser
+with _try_import(__name__, 'StagehandBrowserController'):
+ from ._stagehand_browser_controller import StagehandBrowserController
+with _try_import(__name__, 'StagehandBrowserPlugin'):
+ from ._stagehand_browser_plugin import StagehandBrowserPlugin
+with _try_import(__name__, 'StagehandOptions', 'StagehandPage'):
+ from ._stagehand_types import StagehandOptions, StagehandPage
+
__all__ = [
'BrowserPool',
@@ -25,4 +32,8 @@
'PlaywrightBrowserController',
'PlaywrightBrowserPlugin',
'PlaywrightPersistentBrowser',
+ 'StagehandBrowserController',
+ 'StagehandBrowserPlugin',
+ 'StagehandOptions',
+ 'StagehandPage',
]
diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py
new file mode 100644
index 0000000000..950c3b0823
--- /dev/null
+++ b/src/crawlee/browsers/_stagehand_browser_controller.py
@@ -0,0 +1,292 @@
+from __future__ import annotations
+
+from asyncio import Lock
+from datetime import datetime, timedelta, timezone
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from crawlee._utils.docs import docs_group
+from crawlee.browsers._browser_controller import BrowserController
+from crawlee.browsers._stagehand_types import StagehandPage
+from crawlee.fingerprint_suite import HeaderGenerator
+from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from playwright.async_api import Browser, BrowserContext, Playwright
+ from stagehand import AsyncSession, AsyncStagehand
+
+ from crawlee.browsers._stagehand_types import StagehandOptions
+ from crawlee.browsers._types import BrowserType
+ from crawlee.proxy_configuration import ProxyInfo
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Browser management')
+class StagehandBrowserController(BrowserController):
+ """Controller for managing a Stagehand-controlled browser instance.
+
+ It creates and connects to the browser lazily on the first ``new_page`` call: Stagehand
+ starts a session, and Playwright then connects to it via CDP. All pages share a single
+ browser context, as Stagehand creates the browser and its context together during session
+    initialization.
+ """
+
+ AUTOMATION_LIBRARY = 'stagehand'
+ _DEFAULT_HEADER_GENERATOR = HeaderGenerator()
+
+ def __init__(
+ self,
+ *,
+ playwright: Playwright,
+ stagehand_client: AsyncStagehand,
+ stagehand_options: StagehandOptions,
+ max_open_pages_per_browser: int = 20,
+ header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
+ ) -> None:
+ """Initialize a new instance.
+
+ Args:
+ playwright: Active Playwright instance used to connect to the browser via CDP.
+ stagehand_client: Active Stagehand REST client used to start and end sessions.
+ stagehand_options: Stagehand-specific configuration (model, env, self-heal, etc.).
+ max_open_pages_per_browser: Maximum number of pages that can be open at the same time.
+ header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
+ requests made by the browser. By default, a predefined header generator is used. Set to `None` to
+ disable automatic header modifications.
+ """
+ self._playwright = playwright
+ self._stagehand_client = stagehand_client
+ self._stagehand_options = stagehand_options
+ self._max_open_pages_per_browser = max_open_pages_per_browser
+ self._header_generator = header_generator
+
+ self._session: AsyncSession | None = None
+ self._browser: Browser | None = None
+ self._browser_context: BrowserContext | None = None
+ self._context_creation_lock = Lock()
+
+ self._pages = list[StagehandPage]()
+ self._total_opened_pages = 0
+ self._opening_pages_count = 0
+ self._last_page_opened_at = datetime.now(timezone.utc)
+
+ @property
+ @override
+ def pages(self) -> list[StagehandPage]:
+ return self._pages
+
+ @property
+ @override
+ def total_opened_pages(self) -> int:
+ return self._total_opened_pages
+
+ @property
+ @override
+ def pages_count(self) -> int:
+ return len(self._pages)
+
+ @property
+ @override
+ def last_page_opened_at(self) -> datetime:
+ return self._last_page_opened_at
+
+ @property
+ @override
+ def idle_time(self) -> timedelta:
+ return datetime.now(timezone.utc) - self._last_page_opened_at
+
+ @property
+ @override
+ def has_free_capacity(self) -> bool:
+ return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser
+
+ @property
+ @override
+ def is_browser_connected(self) -> bool:
+ # Session not yet started - controller is available for new pages.
+ return self._browser is None or self._browser.is_connected()
+
+ @property
+ @override
+ def browser_type(self) -> BrowserType:
+ return 'chromium'
+
+ @override
+ async def new_page(
+ self,
+ browser_new_context_options: Mapping[str, Any] | None = None,
+ proxy_info: ProxyInfo | None = None,
+ ) -> StagehandPage:
+ """Create a new page in the Stagehand-managed browser.
+
+ On the first call, starts the Stagehand session with the provided options. On subsequent
+ calls, ``browser_new_context_options`` and ``proxy_info`` are ignored because the session
+ context cannot be reconfigured once it is running.
+
+ Args:
+ browser_new_context_options: Options merged on top of the plugin's launch options
+ when creating the first session. Ignored if the session already exists.
+ proxy_info: Proxy injected into the session on first creation.
+
+ Raises:
+ ValueError: If the browser has reached the maximum number of open pages.
+            RuntimeError: If the Stagehand session fails to initialize.
+ """
+ if not self.has_free_capacity:
+ raise ValueError('Cannot open more pages in this browser.')
+
+ self._opening_pages_count += 1
+ try:
+ # Lazily start a Stagehand session on the first page request, passing proxy and context options
+ # at that point.
+ if not self._browser_context:
+ async with self._context_creation_lock:
+ if not self._browser_context:
+ self._browser_context = await self._create_browser_context(
+ browser_new_context_options=browser_new_context_options,
+ proxy_info=proxy_info,
+ )
+
+ if self._browser is None or self._session is None or self._browser_context is None:
+ raise RuntimeError('Failed to initialize the browser session.')
+
+ raw_page = await self._browser_context.new_page()
+ page = StagehandPage(raw_page, self._session)
+ raw_page.on('close', lambda _: self._on_page_close(page))
+
+ self._pages.append(page)
+ self._last_page_opened_at = datetime.now(timezone.utc)
+ self._total_opened_pages += 1
+ finally:
+ self._opening_pages_count -= 1
+
+ return page
+
+ @override
+ async def close(self, *, force: bool = False) -> None:
+ if self.pages_count > 0 and not force:
+ raise ValueError('Cannot close the browser while there are open pages.')
+
+ if self._session is None and self._browser is None:
+ return
+
+ try:
+ if self._session is not None:
+ try:
+ await self._session.end()
+ except Exception:
+ logger.warning('Failed to end Stagehand session gracefully.', exc_info=True)
+
+ if self._browser is not None and self._browser.is_connected():
+ await self._browser.close()
+ finally:
+ self._session = None
+ self._browser_context = None
+
+ def _on_page_close(self, page: StagehandPage) -> None:
+ self._pages.remove(page)
+
+ def _get_extra_http_headers(
+ self,
+ extra_http_headers: Mapping[str, str] | None = None,
+ ) -> dict[str, str]:
+ if extra_http_headers is not None:
+ return dict(extra_http_headers)
+
+ if self._header_generator:
+ generated_headers = self._header_generator.get_specific_headers(
+ header_names={
+ 'Accept',
+ 'Accept-Language',
+ 'User-Agent',
+ 'sec-ch-ua',
+ 'sec-ch-ua-mobile',
+ 'sec-ch-ua-platform',
+ },
+ browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
+ )
+ else:
+ generated_headers = {}
+
+ return dict(generated_headers)
+
+ def _build_session_start_params(
+ self,
+ browser_new_context_options: dict[str, Any],
+ proxy_info: ProxyInfo | None = None,
+ ) -> dict[str, Any]:
+ session_start_params: dict[str, Any] = {
+ 'model_name': self._stagehand_options.model,
+ 'verbose': self._stagehand_options.verbose,
+ 'self_heal': self._stagehand_options.self_heal,
+ }
+ launch_options = dict(browser_new_context_options)
+
+ if self._stagehand_options.dom_settle_timeout_ms is not None:
+ session_start_params['dom_settle_timeout_ms'] = self._stagehand_options.dom_settle_timeout_ms
+
+ if self._stagehand_options.system_prompt is not None:
+ session_start_params['system_prompt'] = self._stagehand_options.system_prompt
+
+ if self._stagehand_options.env == 'LOCAL':
+ if proxy_info:
+ launch_options['proxy'] = {
+ 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
+ 'username': proxy_info.username or '',
+ 'password': proxy_info.password or '',
+ }
+ session_start_params['browser'] = {'type': 'local', 'launch_options': launch_options}
+ else:
+ session_start_params['browser'] = {'type': 'browserbase', 'launch_options': launch_options}
+ if proxy_info:
+ session_start_params['browserbase_session_create_params'] = {
+ 'proxies': [
+ {
+ 'type': 'external',
+ 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
+ 'username': proxy_info.username or '',
+ 'password': proxy_info.password or '',
+ }
+ ]
+ }
+
+ return session_start_params
+
+ async def _create_browser_context(
+ self,
+ browser_new_context_options: Mapping[str, Any] | None = None,
+ proxy_info: ProxyInfo | None = None,
+ ) -> BrowserContext:
+ browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
+
+ extra_http_headers = self._get_extra_http_headers(browser_new_context_options.pop('extra_http_headers', None))
+
+ session_start_params = self._build_session_start_params(
+ browser_new_context_options=browser_new_context_options,
+ proxy_info=proxy_info,
+ )
+
+ session: AsyncSession = await self._stagehand_client.sessions.start(**session_start_params)
+
+ cdp_url = session.data.cdp_url
+ if not cdp_url:
+ raise RuntimeError(
+ f'No cdp_url returned from Stagehand (env={self._stagehand_options.env!r}). '
+ 'Cannot connect Playwright to the browser.'
+ )
+
+ self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url)
+
+ context = self._browser.contexts[0]
+
+ await context.set_extra_http_headers(extra_http_headers)
+
+ self._session = session
+
+ return context
diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py
new file mode 100644
index 0000000000..3bfb72b09f
--- /dev/null
+++ b/src/crawlee/browsers/_stagehand_browser_plugin.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from playwright.async_api import Playwright, async_playwright
+from stagehand import AsyncStagehand
+from typing_extensions import override
+
+from crawlee import service_locator
+from crawlee._utils.context import ensure_context
+from crawlee._utils.docs import docs_group
+
+from ._browser_plugin import BrowserPlugin
+from ._stagehand_browser_controller import StagehandBrowserController
+from ._stagehand_types import StagehandOptions
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+ from pathlib import Path
+ from types import TracebackType
+
+ from ._browser_controller import BrowserController
+ from ._types import BrowserType
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Browser management')
+class StagehandBrowserPlugin(BrowserPlugin):
+ """A plugin for managing Stagehand AI-powered browser automation.
+
+ It acts as a factory for creating `StagehandBrowserController` instances and manages the
+ lifecycle of the shared `AsyncStagehand` REST client and the Playwright context. Depending
+ on the configured environment, the browser runs locally via a bundled Chromium binary
+ (``env='LOCAL'``) or in the Browserbase cloud (``env='BROWSERBASE'``). Playwright connects
+ to the running browser via CDP, so only Chromium is supported.
+
+ Not all Playwright browser and context options are supported — only those accepted by
+ Stagehand's ``BrowserLaunchOptions``. Because Stagehand creates the browser and its context
+ together in a single ``sessions.start()`` call, both ``browser_launch_options`` and
+ ``browser_new_context_options`` are merged into one set of options applied at session start.
+ """
+
+ AUTOMATION_LIBRARY = 'stagehand'
+
+ def __init__(
+ self,
+ *,
+ user_data_dir: str | Path | None = None,
+ stagehand_options: StagehandOptions | None = None,
+ browser_launch_options: dict[str, Any] | None = None,
+ browser_new_context_options: dict[str, Any] | None = None,
+ max_open_pages_per_browser: int = 20,
+ ) -> None:
+ """Initialize a new instance.
+
+ Args:
+ user_data_dir: Path to a user data directory, which stores browser session data like
+ cookies and local storage.
+ stagehand_options: Stagehand-specific configuration (model, API key, env, etc.).
+ browser_launch_options: Keyword arguments passed to Stagehand's ``BrowserLaunchOptions``
+ on session start. Supported keys are a subset of Playwright's ``browser_type.launch``
+ options. These take priority over ``browser_new_context_options`` for shared keys.
+ browser_new_context_options: Additional options merged with ``browser_launch_options``
+ at lower priority. Subject to the same ``BrowserLaunchOptions`` constraints.
+ max_open_pages_per_browser: The maximum number of pages that can be open in a single
+ browser instance. Once reached, a new browser instance will be launched.
+ """
+ config = service_locator.get_configuration()
+
+ self._max_open_pages_per_browser = max_open_pages_per_browser
+ self._stagehand_options = stagehand_options or StagehandOptions()
+
+ is_local = self._stagehand_options.env == 'LOCAL'
+
+ # browser_launch_options take priority over browser_new_context_options for shared keys.
+ self._browser_launch_options: dict[str, Any] = {
+ 'headless': config.headless,
+ 'chromium_sandbox': not config.disable_browser_sandbox,
+ **(browser_new_context_options or {}),
+ **(browser_launch_options or {}),
+ }
+
+ if config.default_browser_path:
+ self._browser_launch_options.setdefault('executable_path', config.default_browser_path)
+
+ if user_data_dir is not None:
+ self._browser_launch_options['user_data_dir'] = str(user_data_dir)
+
+ # Parameters for AsyncStagehand.
+ self._stagehand_init_params: dict[str, Any] = {
+ 'server': 'local' if is_local else 'remote',
+ 'local_headless': self._browser_launch_options['headless'],
+ 'local_ready_timeout_s': self._stagehand_options.local_ready_timeout_s,
+ }
+ if self._stagehand_options.model_api_key is not None:
+ self._stagehand_init_params['model_api_key'] = self._stagehand_options.model_api_key
+
+ if not is_local:
+ self._stagehand_init_params['browserbase_api_key'] = self._stagehand_options.browserbase_api_key
+ self._stagehand_init_params['browserbase_project_id'] = self._stagehand_options.project_id
+
+ self._stagehand_client: AsyncStagehand | None = None
+ self._playwright_context_manager = async_playwright()
+ self._playwright: Playwright | None = None
+ self._active = False
+
+ @property
+ @override
+ def active(self) -> bool:
+ return self._active
+
+ @property
+ @override
+ def browser_type(self) -> BrowserType:
+ return 'chromium'
+
+ @property
+ @override
+ def browser_launch_options(self) -> Mapping[str, Any]:
+ """Return the options passed to Stagehand's ``BrowserLaunchOptions`` on session start.
+
+ These are a subset of Playwright's ``browser_type.launch`` options — only keys recognised
+ by Stagehand's ``BrowserLaunchOptions`` take effect.
+ """
+ return self._browser_launch_options
+
+ @property
+ @override
+ def browser_new_context_options(self) -> Mapping[str, Any]:
+ """Return the browser context options passed to Stagehand's ``BrowserLaunchOptions``.
+
+ Stagehand creates the browser and its context together in a single ``sessions.start()``
+ call, so context-level options such as ``viewport`` and ``locale`` are part of
+ ``BrowserLaunchOptions`` and share the same dictionary as ``browser_launch_options``.
+ Pre-navigation hooks that modify these options before the first page will take effect,
+ because session creation is deferred until the first ``new_page`` call.
+ """
+ return self._browser_launch_options
+
+ @property
+ @override
+ def max_open_pages_per_browser(self) -> int:
+ return self._max_open_pages_per_browser
+
+ @property
+ def stagehand_options(self) -> StagehandOptions:
+ """Return the Stagehand-specific configuration options."""
+ return self._stagehand_options
+
+ @override
+ async def __aenter__(self) -> StagehandBrowserPlugin:
+ if self._active:
+ raise RuntimeError(f'The {self.__class__.__name__} is already active.')
+
+ self._active = True
+ self._playwright = await self._playwright_context_manager.__aenter__()
+
+ if self._stagehand_options.env == 'LOCAL':
+ if 'executable_path' not in self._browser_launch_options:
+ chrome_path = self._playwright.chromium.executable_path
+ self._browser_launch_options['executable_path'] = chrome_path
+ logger.debug(f'Resolved Chromium path from Playwright: {chrome_path}')
+
+ self._stagehand_init_params['local_chrome_path'] = self._browser_launch_options['executable_path']
+
+ client = AsyncStagehand(**self._stagehand_init_params)
+ await client.__aenter__()
+ self._stagehand_client = client
+
+ return self
+
+ @override
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ if not self._active:
+ raise RuntimeError(f'The {self.__class__.__name__} is not active.')
+
+ if self._stagehand_client is not None:
+ await self._stagehand_client.__aexit__(exc_type, exc_value, exc_traceback)
+ self._stagehand_client = None
+
+ await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)
+ self._playwright_context_manager = async_playwright()
+ self._playwright = None
+ self._active = False
+
+ @override
+ @ensure_context
+ async def new_browser(self) -> BrowserController:
+ if not self._playwright or not self._stagehand_client:
+ raise RuntimeError(f'{self.__class__.__name__} is not initialized.')
+
+ return StagehandBrowserController(
+ playwright=self._playwright,
+ stagehand_client=self._stagehand_client,
+ stagehand_options=self._stagehand_options,
+ max_open_pages_per_browser=self._max_open_pages_per_browser,
+ )
diff --git a/src/crawlee/browsers/_stagehand_types.py b/src/crawlee/browsers/_stagehand_types.py
new file mode 100644
index 0000000000..7df6f5fd64
--- /dev/null
+++ b/src/crawlee/browsers/_stagehand_types.py
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal
+
+from playwright.async_api import Page
+
+from crawlee._utils.docs import docs_group
+
+if TYPE_CHECKING:
+ from stagehand import AsyncSession
+ from stagehand.types.session_act_params import SessionActParamsNonStreaming
+ from stagehand.types.session_act_response import SessionActResponse
+ from stagehand.types.session_execute_params import SessionExecuteParamsNonStreaming
+ from stagehand.types.session_execute_response import SessionExecuteResponse
+ from stagehand.types.session_extract_params import SessionExtractParamsNonStreaming
+ from stagehand.types.session_extract_response import SessionExtractResponse
+ from stagehand.types.session_observe_params import SessionObserveParamsNonStreaming
+ from stagehand.types.session_observe_response import SessionObserveResponse
+ from typing_extensions import Unpack
+
+
+@dataclass
+@docs_group('Browser management')
+class StagehandOptions:
+ """Configuration options for Stagehand AI-powered browser automation.
+
+ Controls the execution environment (local or Browserbase cloud), the AI model used
+ for page operations, and session-level behaviour such as self-healing and DOM settling.
+ """
+
+ env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL'
+ """Execution environment.
+ 'LOCAL' - Stagehand runs a local Chromium browser.
+ 'BROWSERBASE' - Stagehand uses a Browserbase cloud browser session.
+ """
+
+ browserbase_api_key: str | None = None
+ """API key for authenticating with Browserbase when `env='BROWSERBASE'`. If not provided, read from
+ the `BROWSERBASE_API_KEY` environment variable."""
+
+ model_api_key: str | None = None
+    """API key for the AI model provider (e.g. OpenAI, Anthropic). Must be provided explicitly - unlike
+    Browserbase credentials, it is not read from an environment variable."""
+
+ project_id: str | None = None
+ """Browserbase project ID, required when `env='BROWSERBASE'`. If not provided, read from
+ the `BROWSERBASE_PROJECT_ID` environment variable."""
+
+ model: str = 'openai/gpt-4.1-mini'
+ """The AI model to use for page operations."""
+
+ verbose: Literal[0, 1, 2] = 0
+ """Verbosity level for logging Stagehand session activity.
+ 0 - quiet
+ 1 - normal
+ 2 - debug"""
+
+ self_heal: bool = True
+ """When `True`, Stagehand automatically retries failed actions."""
+
+ dom_settle_timeout_ms: float | None = None
+ """Maximum time to wait for the DOM to settle before performing an action, in milliseconds."""
+
+ system_prompt: str | None = None
+ """Optional system prompt to guide the AI's behavior in sessions."""
+
+ local_ready_timeout_s: float = 10.0
+ """Maximum time to wait for the local Stagehand browser to be ready, in seconds."""
+
+
+@docs_group('Browser management')
+class StagehandPage(Page):
+ """A Playwright `Page` enhanced with Stagehand AI methods.
+
+ Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright
+ methods transparently while adding `act()`, `extract()`, `observe()`, and `execute()`
+ AI operations bound to the current page.
+ """
+
+ def __init__(self, page: Page, session: AsyncSession) -> None:
+ super().__init__(page._impl_obj) # noqa: SLF001
+ self._session = session
+
+ async def act(self, **kwargs: Unpack[SessionActParamsNonStreaming]) -> SessionActResponse:
+ """Perform an action on the page using natural language. Argument `page` is automatically set.
+
+ Args:
+ **kwargs: Parameters passed to ``AsyncSession.act()``.
+ The most common is ``input`` — a natural language instruction string or
+ ``ActionParam`` object, e.g. ``input='click the login button'``.
+
+ Returns:
+ The action result from Stagehand.
+ """
+ return await self._session.act(page=self, **kwargs)
+
+ async def observe(self, **kwargs: Unpack[SessionObserveParamsNonStreaming]) -> SessionObserveResponse:
+ """Observe the page and get AI-suggested actions. Argument `page` is automatically set.
+
+ Args:
+ **kwargs: Parameters passed to ``AsyncSession.observe()``.
+ Optionally pass ``instruction`` to narrow the observation scope.
+
+ Returns:
+ Observation result with suggested actions.
+ """
+ return await self._session.observe(page=self, **kwargs)
+
+ async def extract(self, **kwargs: Unpack[SessionExtractParamsNonStreaming]) -> SessionExtractResponse:
+ """Extract structured data from the page using natural language. Argument `page` is automatically set.
+
+ Args:
+ **kwargs: Parameters passed to ``AsyncSession.extract()``.
+ Common parameters: ``instruction`` and ``schema`` (JSON Schema dict).
+
+ Returns:
+ Extracted data matching the requested schema.
+ """
+ return await self._session.extract(page=self, **kwargs)
+
+ async def execute(self, **kwargs: Unpack[SessionExecuteParamsNonStreaming]) -> SessionExecuteResponse:
+ """Run an autonomous multi-step AI agent on the page. Argument `page` is automatically set.
+
+ Args:
+ **kwargs: Parameters passed to ``AsyncSession.execute()``.
+ Required parameters:
+
+ - ``agent_config`` — agent behaviour settings (pass ``{}`` for defaults).
+ Supports keys: ``model``, ``mode`` (``'dom'``/``'hybrid'``/``'cua'``),
+ ``system_prompt``, and others.
+ - ``execute_options`` — execution options dict with a required
+ ``instruction`` key: a natural language description of the multi-step
+ task to perform, e.g.
+ ``execute_options={'instruction': 'find the login form and sign in'}``.
+
+ Returns:
+ The result of the agent execution.
+ """
+ return await self._session.execute(page=self, **kwargs)
diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py
index 6445ad5d64..ac97581bb0 100644
--- a/src/crawlee/crawlers/__init__.py
+++ b/src/crawlee/crawlers/__init__.py
@@ -51,6 +51,20 @@
RenderingTypePredictor,
)
+with _try_import(
+ __name__,
+ 'StagehandCrawler',
+ 'StagehandCrawlingContext',
+ 'StagehandPostNavCrawlingContext',
+ 'StagehandPreNavCrawlingContext',
+):
+ from ._stagehand import (
+ StagehandCrawler,
+ StagehandCrawlingContext,
+ StagehandPostNavCrawlingContext,
+ StagehandPreNavCrawlingContext,
+ )
+
__all__ = [
'AbstractHttpCrawler',
@@ -81,4 +95,8 @@
'RenderingType',
'RenderingTypePrediction',
'RenderingTypePredictor',
+ 'StagehandCrawler',
+ 'StagehandCrawlingContext',
+ 'StagehandPostNavCrawlingContext',
+ 'StagehandPreNavCrawlingContext',
]
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 5222181a6f..93cda854b9 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -5,7 +5,7 @@
import warnings
from datetime import timedelta
from functools import partial
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
import playwright.async_api
from more_itertools import partition
@@ -35,14 +35,11 @@
from ._types import BlockRequestsFunction, GotoOptions
from ._utils import block_requests, infinite_scroll
-TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
-TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
-
if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping
from pathlib import Path
- from playwright.async_api import Page, Route
+ from playwright.async_api import Page, Response, Route
from playwright.async_api import Request as PlaywrightRequest
from typing_extensions import Unpack
@@ -57,8 +54,21 @@
from crawlee.browsers._types import BrowserType
+TPreNavContext = TypeVar(
+ 'TPreNavContext', bound=PlaywrightPreNavCrawlingContext, default=PlaywrightPreNavCrawlingContext
+)
+TPostNavContext = TypeVar(
+ 'TPostNavContext', bound=PlaywrightPostNavCrawlingContext, default=PlaywrightPostNavCrawlingContext
+)
+TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext, default=PlaywrightCrawlingContext)
+TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
+
+
@docs_group('Crawlers')
-class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
+class PlaywrightCrawler(
+ BasicCrawler[TCrawlingContext, StatisticsState],
+ Generic[TPreNavContext, TPostNavContext, TCrawlingContext],
+):
"""A web crawler that leverages the `Playwright` browser automation library.
The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features.
@@ -98,6 +108,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
```
"""
+ # Define the context classes used in context factory.
+ _PRE_NAV_CONTEXT_CLASS = PlaywrightPreNavCrawlingContext
+ _POST_NAV_CONTEXT_CLASS = PlaywrightPostNavCrawlingContext
+ _CRAWLING_CONTEXT_CLASS = PlaywrightCrawlingContext
+
def __init__(
self,
*,
@@ -111,7 +126,7 @@ def __init__(
headless: bool | None = None,
use_incognito_pages: bool | None = None,
navigation_timeout: timedelta | None = None,
- **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
+ **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
"""Initialize a new instance.
@@ -203,8 +218,8 @@ def __init__(
)
kwargs['_additional_context_managers'] = [self._browser_pool]
kwargs.setdefault('_logger', logging.getLogger(__name__))
- self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []
- self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = []
+ self._pre_navigation_hooks: list[Callable[[TPreNavContext], Awaitable[None]]] = []
+ self._post_navigation_hooks: list[Callable[[TPostNavContext], Awaitable[None]]] = []
kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
@@ -229,31 +244,117 @@ async def _manage_shared_navigation_timeout(
finally:
self._shared_navigation_timeouts.pop(request_id, None)
+ @overload
+ def _build_context(
+ self,
+ context: BasicCrawlingContext,
+ *,
+ page: Page,
+ response: None = None,
+ ) -> TPreNavContext: ...
+
+ @overload
+ def _build_context(
+ self,
+ context: TPreNavContext,
+ *,
+ response: Response,
+ ) -> TPostNavContext: ...
+
+ @overload
+ def _build_context(
+ self,
+ context: TPostNavContext,
+ ) -> TCrawlingContext: ...
+
+ def _build_context(
+ self,
+ context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext,
+ *,
+ page: Page | None = None,
+ response: Response | None = None,
+ ) -> PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext | PlaywrightCrawlingContext:
+ """Build the crawling context by adding Playwright-specific properties."""
+ # Order is important, as all context subclasses each other,
+ # so the most specific context should be checked first.
+ match context:
+ # Create the final crawling context from `PostNavCrawlingContext`
+ case self._POST_NAV_CONTEXT_CLASS():
+ extract_links = self._create_extract_links_function(context)
+ return self._CRAWLING_CONTEXT_CLASS(
+ request=context.request,
+ session=context.session,
+ add_requests=context.add_requests,
+ send_request=context.send_request,
+ push_data=context.push_data,
+ use_state=context.use_state,
+ proxy_info=context.proxy_info,
+ get_key_value_store=context.get_key_value_store,
+ log=context.log,
+ register_deferred_cleanup=context.register_deferred_cleanup,
+ page=context.page,
+ goto_options=context.goto_options,
+ response=context.response,
+ infinite_scroll=lambda: infinite_scroll(context.page),
+ extract_links=extract_links,
+ enqueue_links=self._create_enqueue_links_function(context, extract_links),
+ block_requests=context.block_requests,
+ )
+ # Create the post-navigation context from `PreNavCrawlingContext`
+ case self._PRE_NAV_CONTEXT_CLASS():
+ if response is None:
+ raise ValueError('Response must be provided for post-navigation context.')
+
+ return self._POST_NAV_CONTEXT_CLASS(
+ request=context.request,
+ session=context.session,
+ add_requests=context.add_requests,
+ send_request=context.send_request,
+ push_data=context.push_data,
+ use_state=context.use_state,
+ proxy_info=context.proxy_info,
+ get_key_value_store=context.get_key_value_store,
+ log=context.log,
+ register_deferred_cleanup=context.register_deferred_cleanup,
+ page=context.page,
+ block_requests=context.block_requests,
+ goto_options=context.goto_options,
+ response=response,
+ )
+ # Create the pre-navigation context from `BasicCrawlingContext`
+ case BasicCrawlingContext():
+ if page is None:
+ raise ValueError('Page must be provided for pre-navigation context.')
+
+ return self._PRE_NAV_CONTEXT_CLASS(
+ request=context.request,
+ session=context.session,
+ add_requests=context.add_requests,
+ send_request=context.send_request,
+ push_data=context.push_data,
+ use_state=context.use_state,
+ proxy_info=context.proxy_info,
+ get_key_value_store=context.get_key_value_store,
+ log=context.log,
+ register_deferred_cleanup=context.register_deferred_cleanup,
+ page=page,
+ block_requests=cast('BlockRequestsFunction', partial(block_requests, page=page)),
+ goto_options=GotoOptions(**self._goto_options),
+ )
+ case _:
+ raise ValueError('Invalid context type')
+
async def _open_page(
self,
context: BasicCrawlingContext,
- ) -> AsyncGenerator[PlaywrightPreNavCrawlingContext, None]:
+ ) -> AsyncGenerator[TPreNavContext, None]:
if self._browser_pool is None:
raise ValueError('Browser pool is not initialized.')
# Create a new browser page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
- pre_navigation_context = PlaywrightPreNavCrawlingContext(
- request=context.request,
- session=context.session,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- use_state=context.use_state,
- proxy_info=context.proxy_info,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- register_deferred_cleanup=context.register_deferred_cleanup,
- page=crawlee_page.page,
- block_requests=cast('BlockRequestsFunction', partial(block_requests, page=crawlee_page.page)),
- goto_options=GotoOptions(**self._goto_options),
- )
+ pre_navigation_context = self._build_context(context, page=crawlee_page.page)
request_id = id(pre_navigation_context.request)
@@ -290,8 +391,8 @@ async def route_handler(route: Route, _: PlaywrightRequest) -> None:
async def _navigate(
self,
- context: PlaywrightPreNavCrawlingContext,
- ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]:
+ context: TPreNavContext,
+ ) -> AsyncGenerator[TPostNavContext, Exception | None]:
"""Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library.
Args:
@@ -352,24 +453,9 @@ async def _navigate(
# Set the loaded URL to the actual URL after redirection.
context.request.loaded_url = context.page.url
- yield PlaywrightPostNavCrawlingContext(
- request=context.request,
- session=context.session,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- use_state=context.use_state,
- proxy_info=context.proxy_info,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- register_deferred_cleanup=context.register_deferred_cleanup,
- page=context.page,
- block_requests=context.block_requests,
- goto_options=context.goto_options,
- response=response,
- )
+ yield self._build_context(context, response=response)
- def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
+ def _create_extract_links_function(self, context: TPostNavContext) -> ExtractLinksFunction:
"""Create a callback function for extracting links from context.
Args:
@@ -451,9 +537,7 @@ async def extract_links(
return extract_links
- async def _handle_status_code_response(
- self, context: PlaywrightPostNavCrawlingContext
- ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
+ async def _handle_status_code_response(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]:
"""Validate the HTTP status code and raise appropriate exceptions if needed.
Args:
@@ -475,8 +559,8 @@ async def _handle_status_code_response(
async def _handle_blocked_request_by_content(
self,
- context: PlaywrightPostNavCrawlingContext,
- ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
+ context: TPostNavContext,
+ ) -> AsyncGenerator[TPostNavContext, None]:
"""Try to detect if the request is blocked based on the response content.
Args:
@@ -502,9 +586,7 @@ async def _handle_blocked_request_by_content(
yield context
- async def _execute_post_navigation_hooks(
- self, context: PlaywrightPostNavCrawlingContext
- ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
+ async def _execute_post_navigation_hooks(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]:
request_id = id(context.request)
for hook in self._post_navigation_hooks:
@@ -513,36 +595,14 @@ async def _execute_post_navigation_hooks(
yield context
- async def _create_crawling_context(
- self, context: PlaywrightPostNavCrawlingContext
- ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
- extract_links = self._create_extract_links_function(context)
-
- yield PlaywrightCrawlingContext(
- request=context.request,
- session=context.session,
- add_requests=context.add_requests,
- send_request=context.send_request,
- push_data=context.push_data,
- use_state=context.use_state,
- proxy_info=context.proxy_info,
- get_key_value_store=context.get_key_value_store,
- log=context.log,
- register_deferred_cleanup=context.register_deferred_cleanup,
- page=context.page,
- goto_options=context.goto_options,
- response=context.response,
- infinite_scroll=lambda: infinite_scroll(context.page),
- extract_links=extract_links,
- enqueue_links=self._create_enqueue_links_function(context, extract_links),
- block_requests=cast('BlockRequestsFunction', partial(block_requests, page=context.page)),
- )
+ async def _create_crawling_context(self, context: TPostNavContext) -> AsyncGenerator[TCrawlingContext, None]:
+ yield self._build_context(context)
if context.session:
pw_cookies = await self._get_cookies(context.page)
context.session.cookies.set_cookies_from_playwright_format(pw_cookies)
- def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:
+ def pre_navigation_hook(self, hook: Callable[[TPreNavContext], Awaitable[None]]) -> None:
"""Register a hook to be called before each navigation.
Args:
@@ -550,7 +610,7 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
"""
self._pre_navigation_hooks.append(hook)
- def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
+ def post_navigation_hook(self, hook: Callable[[TPostNavContext], Awaitable[None]]) -> None:
"""Register a hook to be called after each navigation.
Args:
diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py
new file mode 100644
index 0000000000..d64e39549b
--- /dev/null
+++ b/src/crawlee/crawlers/_stagehand/__init__.py
@@ -0,0 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'StagehandCrawler'):
+ from ._stagehand_crawler import StagehandCrawler
+with _try_import(
+ __name__, 'StagehandCrawlingContext', 'StagehandPostNavCrawlingContext', 'StagehandPreNavCrawlingContext'
+):
+ from ._stagehand_crawling_context import (
+ StagehandCrawlingContext,
+ StagehandPostNavCrawlingContext,
+ StagehandPreNavCrawlingContext,
+ )
+
+
+__all__ = [
+ 'StagehandCrawler',
+ 'StagehandCrawlingContext',
+ 'StagehandPostNavCrawlingContext',
+ 'StagehandPreNavCrawlingContext',
+]
diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py
new file mode 100644
index 0000000000..e155eed2ec
--- /dev/null
+++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from crawlee._utils.docs import docs_group
+from crawlee.browsers import BrowserPool
+from crawlee.browsers._stagehand_browser_plugin import StagehandBrowserPlugin
+from crawlee.crawlers import PlaywrightCrawler
+
+from ._stagehand_crawling_context import (
+ StagehandCrawlingContext,
+ StagehandPostNavCrawlingContext,
+ StagehandPreNavCrawlingContext,
+)
+
+if TYPE_CHECKING:
+ from datetime import timedelta
+ from pathlib import Path
+
+ from typing_extensions import Unpack
+
+ from crawlee.browsers import StagehandOptions
+ from crawlee.crawlers._basic import BasicCrawlerOptions
+ from crawlee.crawlers._playwright._types import GotoOptions
+ from crawlee.statistics import StatisticsState
+
+
+@docs_group('Crawlers')
+class StagehandCrawler(
+ PlaywrightCrawler[
+ StagehandPreNavCrawlingContext,
+ StagehandPostNavCrawlingContext,
+ StagehandCrawlingContext,
+ ]
+):
+ """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee.
+
+ `StagehandCrawler` builds on top of `PlaywrightCrawler`, inheriting all of its features.
+ It uses `StagehandBrowserPlugin` to manage Stagehand sessions. Stagehand creates and manages
+ the browser instance - either locally via a bundled Chromium binary, or remotely via Browserbase
+ cloud - and Playwright connects to it via the Chrome DevTools Protocol (CDP).
+
+ Because Stagehand relies on CDP, only Chromium is supported. Not all Playwright browser and
+ context configuration options are available - browser settings are limited to the subset accepted
+ by Stagehand's ``BrowserLaunchOptions`` (such as ``headless``, ``args``, ``viewport``, ``proxy``,
+ ``locale``, and ``executable_path``). Full browser fingerprinting (canvas, WebGL, screen
+ properties) and incognito pages are not supported; fingerprint-consistent HTTP headers
+ (``User-Agent``, ``Accept``, ``sec-ch-ua``) are still injected automatically.
+
+ Each page in the crawling context is a `StagehandPage`, which extends the standard Playwright
+ `Page` with the following AI methods:
+
+ - `page.act(**kwargs)` - perform an action on the page using natural language
+ - `page.extract(**kwargs)` - extract structured data from the page with AI
+ - `page.observe(**kwargs)` - get AI-suggested actions available on the page
+ - `page.execute(**kwargs)` - run an autonomous multi-step agent
+
+ Stagehand configuration (model, API key, environment) is provided via `stagehand_options`.
+ By default, the crawler runs locally using the `openai/gpt-4.1-mini` model.
+
+ ### Usage
+
+ ```python
+ import asyncio
+ from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext
+ from crawlee.browsers import StagehandOptions
+
+ crawler = StagehandCrawler(
+ stagehand_options=StagehandOptions(
+ model_api_key='sk-...',
+ model='openai/gpt-4.1-mini',
+ ),
+ )
+
+ @crawler.router.default_handler
+ async def handler(context: StagehandCrawlingContext) -> None:
+ context.log.info(f'Processing {context.request.url} ...')
+
+ # Use standard Playwright methods alongside AI methods.
+ await context.page.act(input='Click the accept cookies button if present')
+
+ data = await context.page.extract(instruction='Get the article title and author')
+
+ await context.push_data(data)
+
+ asyncio.run(crawler.run(['https://example.com']))
+ ```
+ """
+
+ _PRE_NAV_CONTEXT_CLASS = StagehandPreNavCrawlingContext
+ _POST_NAV_CONTEXT_CLASS = StagehandPostNavCrawlingContext
+ _CRAWLING_CONTEXT_CLASS = StagehandCrawlingContext
+
+ def __init__(
+ self,
+ *,
+ stagehand_options: StagehandOptions | None = None,
+ browser_pool: BrowserPool | None = None,
+ user_data_dir: str | Path | None = None,
+ headless: bool | None = None,
+ browser_launch_options: dict[str, Any] | None = None,
+ browser_new_context_options: dict[str, Any] | None = None,
+ goto_options: GotoOptions | None = None,
+ navigation_timeout: timedelta | None = None,
+ **kwargs: Unpack[BasicCrawlerOptions[StagehandCrawlingContext, StatisticsState]],
+ ) -> None:
+ """Initialize a new instance.
+
+ Args:
+ stagehand_options: Stagehand-specific configuration (model, API key, env, etc.).
+ Cannot be specified if `browser_pool` is provided.
+ browser_pool: A pre-configured `BrowserPool`. All plugins must be instances of
+ `StagehandBrowserPlugin`. If omitted, a pool is created automatically from the
+ other browser arguments.
+ user_data_dir: Path to a user data directory, which stores browser session data like
+ cookies and local storage. Cannot be specified if `browser_pool` is provided.
+ headless: Whether to run the browser in headless mode. Defaults to the value from
+ Crawlee's global `Configuration`. Cannot be specified if `browser_pool` is provided.
+ browser_launch_options: Keyword arguments for browser launch passed to Stagehand's
+ `BrowserLaunchOptions` (a subset of Playwright's launch options). Supported keys
+ include `args`, `executable_path`, `proxy`, `viewport`, `locale`, and others.
+ Cannot be specified if `browser_pool` is provided.
+ browser_new_context_options: Keyword arguments for browser context creation, merged
+ with `browser_launch_options`. Options that map to `BrowserLaunchOptions` take
+ effect on the first page; subsequent pages reuse the existing session context.
+ Cannot be specified if `browser_pool` is provided.
+ goto_options: Additional options passed to Stagehand's `Page.goto()`. The `timeout`
+ option is not supported - use `navigation_timeout` instead.
+ navigation_timeout: Timeout for the navigation phase (from opening the page to calling
+ the request handler). Defaults to one minute.
+ kwargs: Additional keyword arguments forwarded to `BasicCrawler`.
+ """
+ if browser_pool is not None:
+ self._validate_browser_pool(browser_pool)
+ if any(
+ param is not None
+ for param in (
+ stagehand_options,
+ user_data_dir,
+ headless,
+ browser_launch_options,
+ browser_new_context_options,
+ )
+ ):
+ raise ValueError(
+ 'Cannot specify `stagehand_options`, `user_data_dir`, `headless`, '
+ '`browser_launch_options`, `browser_new_context_options` when `browser_pool` is provided.'
+ )
+ else:
+ launch_options = dict(browser_launch_options or {})
+ if headless is not None:
+ launch_options['headless'] = headless
+
+ browser_pool = BrowserPool(
+ plugins=[
+ StagehandBrowserPlugin(
+ stagehand_options=stagehand_options,
+ user_data_dir=user_data_dir,
+ browser_launch_options=launch_options or None,
+ browser_new_context_options=browser_new_context_options,
+ )
+ ]
+ )
+
+ kwargs.setdefault('_logger', logging.getLogger(__name__))
+
+ super().__init__(
+ browser_pool=browser_pool,
+ goto_options=goto_options,
+ navigation_timeout=navigation_timeout,
+ **kwargs,
+ )
+
+ @staticmethod
+ def _validate_browser_pool(pool: BrowserPool) -> None:
+ invalid = [p for p in pool.plugins if not isinstance(p, StagehandBrowserPlugin)]
+ if invalid:
+ raise ValueError(
+ f'All BrowserPool plugins must be StagehandBrowserPlugin instances. Invalid plugins: {invalid}'
+ )
diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py
new file mode 100644
index 0000000000..645fdbd357
--- /dev/null
+++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py
@@ -0,0 +1,29 @@
+from dataclasses import dataclass
+
+from crawlee._utils.docs import docs_group
+from crawlee.browsers import StagehandPage
+from crawlee.crawlers import (
+ PlaywrightCrawlingContext,
+ PlaywrightPostNavCrawlingContext,
+ PlaywrightPreNavCrawlingContext,
+)
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class StagehandPreNavCrawlingContext(PlaywrightPreNavCrawlingContext):
+ """The pre navigation crawling context used by the `StagehandCrawler`."""
+
+ page: StagehandPage
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class StagehandPostNavCrawlingContext(PlaywrightPostNavCrawlingContext, StagehandPreNavCrawlingContext):
+ """The post navigation crawling context used by the `StagehandCrawler`."""
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class StagehandCrawlingContext(PlaywrightCrawlingContext, StagehandPostNavCrawlingContext):
+ """The crawling context used by the `StagehandCrawler`."""
diff --git a/tests/unit/browsers/test_stagehand_browser_controller.py b/tests/unit/browsers/test_stagehand_browser_controller.py
new file mode 100644
index 0000000000..fda9bf6a81
--- /dev/null
+++ b/tests/unit/browsers/test_stagehand_browser_controller.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import asyncio
+from datetime import datetime, timedelta, timezone
+from typing import TYPE_CHECKING, Any
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from playwright.async_api import Browser, Playwright, async_playwright
+from stagehand import AsyncSession, AsyncStagehand
+
+from crawlee.browsers import StagehandBrowserController, StagehandOptions, StagehandPage
+from crawlee.proxy_configuration import ProxyInfo
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+ from yarl import URL
+
+
+@pytest.fixture
+async def playwright() -> AsyncGenerator[Playwright, None]:
+ async with async_playwright() as playwright:
+ yield playwright
+
+
+@pytest.fixture
+async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
+ browser = await playwright.chromium.launch()
+ # Stagehand's CDP setup always provides a pre-existing context; simulate it.
+ await browser.new_context()
+ yield browser
+ await browser.close()
+
+
+@pytest.fixture
+def session_mock() -> MagicMock:
+ session = MagicMock(spec=AsyncSession)
+ session.data = MagicMock()
+ session.data.cdp_url = 'ws://fake-cdp'
+ session.end = AsyncMock()
+ return session
+
+
+@pytest.fixture
+def stagehand_client_mock(session_mock: MagicMock) -> MagicMock:
+ client = MagicMock(spec=AsyncStagehand)
+ client.sessions.start = AsyncMock(return_value=session_mock)
+ return client
+
+
+@pytest.fixture
+async def controller(
+ playwright: Playwright,
+ browser: Browser,
+ stagehand_client_mock: MagicMock,
+) -> AsyncGenerator[StagehandBrowserController, None]:
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client_mock,
+ stagehand_options=StagehandOptions(),
+ max_open_pages_per_browser=2,
+ )
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ yield controller
+ await controller.close(force=True)
+
+
+async def test_initial_state(playwright: Playwright, session_mock: MagicMock) -> None:
+ stagehand_client = MagicMock()
+ stagehand_client.sessions.start = AsyncMock(return_value=session_mock)
+
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client,
+ stagehand_options=StagehandOptions(),
+ )
+
+ assert controller.pages == []
+ assert controller.pages_count == 0
+ assert isinstance(controller.last_page_opened_at, datetime)
+ assert controller.idle_time < timedelta(seconds=1)
+ assert controller.has_free_capacity
+ assert controller.is_browser_connected # True before any session is started
+ assert controller.browser_type == 'chromium'
+
+
+async def test_open_and_close_page(controller: StagehandBrowserController) -> None:
+ page = await controller.new_page()
+
+ assert isinstance(page, StagehandPage)
+ assert page in controller.pages
+ assert controller.pages_count == 1
+ assert controller.last_page_opened_at <= datetime.now(timezone.utc)
+
+ await page.close()
+
+ assert page not in controller.pages
+ assert controller.pages_count == 0
+
+
+async def test_max_open_pages_limit(controller: StagehandBrowserController) -> None:
+ page1 = await controller.new_page()
+ assert controller.pages_count == 1
+
+ page2 = await controller.new_page()
+ assert controller.pages_count == 2
+
+ with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'):
+ await controller.new_page()
+
+ assert controller.pages_count == 2
+
+ await page1.close()
+ assert controller.pages_count == 1
+
+ page3 = await controller.new_page()
+ assert controller.pages_count == 2
+
+ await page2.close()
+ await page3.close()
+
+ assert controller.pages == []
+ assert controller.pages_count == 0
+
+
+async def test_idle_time(controller: StagehandBrowserController) -> None:
+ idle_time_before = controller.idle_time
+ await asyncio.sleep(1) # Simulate waiting
+ idle_time_after = controller.idle_time
+ assert idle_time_after > idle_time_before
+
+
+async def test_close_browser_with_open_pages(
+ playwright: Playwright,
+ browser: Browser,
+ stagehand_client_mock: MagicMock,
+ session_mock: MagicMock,
+) -> None:
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client_mock,
+ stagehand_options=StagehandOptions(),
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ _ = await controller.new_page()
+
+ with pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'):
+ await controller.close()
+
+ assert controller.pages_count == 1
+ assert controller.is_browser_connected
+
+ await controller.close(force=True)
+
+ assert controller.pages_count == 0
+
+ assert session_mock.end.await_count == 1
+ assert not controller.is_browser_connected
+
+
+async def test_second_page_reuses_session(
+ controller: StagehandBrowserController,
+ stagehand_client_mock: MagicMock,
+) -> None:
+ page1 = await controller.new_page()
+ page2 = await controller.new_page()
+
+ assert stagehand_client_mock.sessions.start.await_count == 1
+ assert page1.context == page2.context
+ assert page1 is not page2 # Different Page instances
+
+ await page1.close()
+ await page2.close()
+
+
+async def test_concurrent_session_init(
+ playwright: Playwright,
+ browser: Browser,
+ session_mock: MagicMock,
+) -> None:
+ call_count = 0
+
+ async def delayed_start(**_kwargs: Any) -> Any:
+ nonlocal call_count
+ call_count += 1
+ await asyncio.sleep(5) # Simulate delay in session start
+ return session_mock
+
+ stagehand_client = MagicMock()
+ stagehand_client.sessions.start = delayed_start
+
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client,
+ stagehand_options=StagehandOptions(),
+ max_open_pages_per_browser=5,
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ pages = await asyncio.gather(controller.new_page(), controller.new_page())
+
+ # Only one session should be started despite concurrent calls to `new_page()`.
+ assert call_count == 1
+
+ for page in pages:
+ await page.close()
+ await controller.close(force=True)
+
+
+async def test_session_start_params_local(
+ playwright: Playwright,
+ browser: Browser,
+ stagehand_client_mock: MagicMock,
+) -> None:
+
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client_mock,
+ stagehand_options=StagehandOptions(
+ env='LOCAL',
+ model='openai/gpt-4o',
+ verbose=1,
+ self_heal=False,
+ dom_settle_timeout_ms=500.0,
+ system_prompt='Test prompt',
+ ),
+ max_open_pages_per_browser=2,
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ page = await controller.new_page()
+ await page.close()
+ await controller.close()
+
+ call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs
+ assert call_kwargs['model_name'] == 'openai/gpt-4o'
+ assert call_kwargs['verbose'] == 1
+ assert call_kwargs['self_heal'] is False
+ assert call_kwargs['dom_settle_timeout_ms'] == 500.0
+ assert call_kwargs['system_prompt'] == 'Test prompt'
+ assert call_kwargs['browser'] == {'type': 'local', 'launch_options': {}}
+
+
+async def test_session_start_params_browserbase(
+ playwright: Playwright,
+ browser: Browser,
+ stagehand_client_mock: MagicMock,
+) -> None:
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client_mock,
+ stagehand_options=StagehandOptions(env='BROWSERBASE'),
+ max_open_pages_per_browser=2,
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ page = await controller.new_page()
+ await page.close()
+ await controller.close()
+
+ call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs
+ assert call_kwargs['browser'] == {'type': 'browserbase', 'launch_options': {}}
+ assert 'browserbase_session_create_params' not in call_kwargs
+
+
+async def test_proxy_set_local(
+ playwright: Playwright,
+ browser: Browser,
+ session_mock: MagicMock,
+) -> None:
+ stagehand_client = MagicMock()
+ stagehand_client.sessions.start = AsyncMock(return_value=session_mock)
+
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client,
+ stagehand_options=StagehandOptions(env='LOCAL'),
+ max_open_pages_per_browser=2,
+ )
+
+ proxy = ProxyInfo(
+ url='http://proxy.example.com:8080',
+ scheme='http',
+ hostname='proxy.example.com',
+ port=8080,
+ username='user',
+ password='pass',
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ await controller.new_page(proxy_info=proxy)
+ await controller.close(force=True)
+
+ call_kwargs = stagehand_client.sessions.start.call_args.kwargs
+
+ browser_options = call_kwargs['browser']
+ assert browser_options['type'] == 'local'
+
+ proxy_options = browser_options['launch_options']['proxy']
+ assert proxy_options['server'] == 'http://proxy.example.com:8080'
+ assert proxy_options['username'] == 'user'
+ assert proxy_options['password'] == 'pass'
+
+
+async def test_proxy_set_browserbase(
+ playwright: Playwright,
+ browser: Browser,
+ session_mock: MagicMock,
+) -> None:
+ stagehand_client = MagicMock()
+ stagehand_client.sessions.start = AsyncMock(return_value=session_mock)
+
+ controller = StagehandBrowserController(
+ playwright=playwright,
+ stagehand_client=stagehand_client,
+ stagehand_options=StagehandOptions(env='BROWSERBASE'),
+ max_open_pages_per_browser=2,
+ )
+
+ proxy = ProxyInfo(
+ url='http://proxy.example.com:8080',
+ scheme='http',
+ hostname='proxy.example.com',
+ port=8080,
+ username='user',
+ password='pass',
+ )
+
+ with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)):
+ await controller.new_page(proxy_info=proxy)
+ await controller.close(force=True)
+
+ call_kwargs = stagehand_client.sessions.start.call_args.kwargs
+
+ browserbase_proxy_options = call_kwargs['browserbase_session_create_params']
+ assert browserbase_proxy_options['proxies'][0]['type'] == 'external'
+ assert browserbase_proxy_options['proxies'][0]['server'] == 'http://proxy.example.com:8080'
+ assert browserbase_proxy_options['proxies'][0]['username'] == 'user'
+ assert browserbase_proxy_options['proxies'][0]['password'] == 'pass'
+
+
+async def test_fingerprint_headers_set_on_new_page(controller: StagehandBrowserController, server_url: URL) -> None:
+ page = await controller.new_page()
+
+ response = await page.goto(str(server_url / 'headers'))
+
+ assert response is not None
+
+ response_json = await response.json()
+
+ assert 'Headless' not in response_json['user-agent']
diff --git a/tests/unit/browsers/test_stagehand_browser_plugin.py b/tests/unit/browsers/test_stagehand_browser_plugin.py
new file mode 100644
index 0000000000..40cec7b472
--- /dev/null
+++ b/tests/unit/browsers/test_stagehand_browser_plugin.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from stagehand import AsyncStagehand
+
+from crawlee.browsers import StagehandBrowserController, StagehandBrowserPlugin, StagehandOptions
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+
+_PATCH_MODULE = 'crawlee.browsers._stagehand_browser_plugin'
+
+
+@pytest.fixture
+def stagehand_client_mock() -> MagicMock:
+ client = MagicMock(spec=AsyncStagehand)
+ client.__aenter__ = AsyncMock(return_value=client)
+ client.__aexit__ = AsyncMock(return_value=None)
+ return client
+
+
+@pytest.fixture
+async def plugin() -> AsyncGenerator[StagehandBrowserPlugin, None]:
+ async with StagehandBrowserPlugin() as plugin:
+ yield plugin
+
+
+def test_initial_state() -> None:
+ plugin = StagehandBrowserPlugin(max_open_pages_per_browser=5)
+
+ assert plugin.active is False
+ assert plugin.browser_type == 'chromium'
+ assert plugin.max_open_pages_per_browser == 5
+
+    # `headless` defaults to True when not explicitly provided
+ assert plugin.browser_launch_options['headless'] is True
+
+
+def test_implicit_set_options() -> None:
+ plugin = StagehandBrowserPlugin(
+ browser_new_context_options={'viewport': {'width': 1280, 'height': 720}},
+ browser_launch_options={'headless': False},
+ user_data_dir='./test',
+ )
+
+ assert plugin.browser_launch_options['headless'] is False
+ assert plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720}
+ assert plugin.browser_launch_options['user_data_dir'] == './test'
+
+
+def test_order_priority_of_implicit_options() -> None:
+ # `browser_launch_options` takes priority over `browser_new_context_options` for shared keys,
+ # while non-conflicting keys from both dicts are merged.
+ plugin = StagehandBrowserPlugin(
+ browser_new_context_options={'headless': True, 'viewport': {'width': 1280, 'height': 720}},
+ browser_launch_options={'headless': False},
+ )
+
+ assert plugin.browser_launch_options['headless'] is False
+ assert plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720}
+
+
+def test_stagehand_options_defaults_when_not_provided() -> None:
+ plugin = StagehandBrowserPlugin()
+
+ assert isinstance(plugin.stagehand_options, StagehandOptions)
+ assert plugin.stagehand_options == StagehandOptions()
+
+
+async def test_stagehand_called_with_local_params(stagehand_client_mock: MagicMock) -> None:
+ with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock:
+ async with StagehandBrowserPlugin(
+ stagehand_options=StagehandOptions(env='LOCAL', local_ready_timeout_s=20.0, model_api_key='test_model_key')
+ ):
+ pass
+
+ call_kwargs = stagehand_mock.call_args.kwargs
+ assert call_kwargs['server'] == 'local'
+ assert call_kwargs['local_ready_timeout_s'] == 20.0
+ assert call_kwargs['model_api_key'] == 'test_model_key'
+ assert 'browserbase_api_key' not in call_kwargs
+ assert 'browserbase_project_id' not in call_kwargs
+
+ # In local environment, the plugin should set `local_chrome_path` to the path of the Playwright Chromium executable.
+ assert 'local_chrome_path' in call_kwargs
+
+
+async def test_stagehand_called_with_browserbase_params(stagehand_client_mock: MagicMock) -> None:
+ with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock:
+ async with StagehandBrowserPlugin(
+ stagehand_options=StagehandOptions(
+ env='BROWSERBASE',
+ browserbase_api_key='test_key',
+ project_id='test_project_id',
+ model_api_key='test_model_key',
+ )
+ ):
+ pass
+
+ call_kwargs = stagehand_mock.call_args.kwargs
+ assert call_kwargs['server'] == 'remote'
+ assert call_kwargs['browserbase_api_key'] == 'test_key'
+ assert call_kwargs['browserbase_project_id'] == 'test_project_id'
+ assert call_kwargs['model_api_key'] == 'test_model_key'
+
+ # In Browserbase environment, the plugin should NOT set `local_chrome_path` as it's not used.
+ assert 'local_chrome_path' not in call_kwargs
+
+
+async def test_methods_raise_error_when_not_active() -> None:
+ plugin = StagehandBrowserPlugin()
+
+ assert plugin.active is False
+
+ with pytest.raises(RuntimeError, match=r'Plugin is not active'):
+ await plugin.new_browser()
+
+ with pytest.raises(RuntimeError, match=r'Plugin is already active.'):
+ async with plugin, plugin:
+ pass
+
+ async with plugin:
+ assert plugin.active is True
+
+
+async def test_new_browser(plugin: StagehandBrowserPlugin) -> None:
+ browser_controller = await plugin.new_browser()
+
+ assert isinstance(browser_controller, StagehandBrowserController)
+
+ assert browser_controller.is_browser_connected
+ assert browser_controller.has_free_capacity
+
+ await browser_controller.close()
+
+
+async def test_multiple_new_browsers(plugin: StagehandBrowserPlugin) -> None:
+ browser_controller_1 = await plugin.new_browser()
+ browser_controller_2 = await plugin.new_browser()
+
+ assert browser_controller_1 is not browser_controller_2
diff --git a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py
new file mode 100644
index 0000000000..d1455db5b5
--- /dev/null
+++ b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from playwright.async_api import async_playwright
+from stagehand import AsyncSession, AsyncStagehand
+
+from crawlee.browsers import (
+ BrowserPool,
+ PlaywrightBrowserPlugin,
+ StagehandBrowserPlugin,
+ StagehandOptions,
+ StagehandPage,
+)
+from crawlee.crawlers import (
+ PlaywrightCrawlingContext,
+ StagehandCrawler,
+ StagehandCrawlingContext,
+ StagehandPostNavCrawlingContext,
+ StagehandPreNavCrawlingContext,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+ from yarl import URL
+
+
+_PLUGIN_MODULE = 'crawlee.browsers._stagehand_browser_plugin'
+
+
+@pytest.fixture
+def stagehand_session_mock() -> MagicMock:
+ session = MagicMock(spec=AsyncSession)
+ session.data = MagicMock()
+ session.data.cdp_url = 'ws://fake-cdp'
+ session.end = AsyncMock()
+ return session
+
+
+@pytest.fixture
+async def patched_crawler(stagehand_session_mock: MagicMock) -> AsyncGenerator[StagehandCrawler, None]:
+ """StagehandCrawler with real Playwright but Stagehand session mocked."""
+ stagehand_client = MagicMock(spec=AsyncStagehand)
+ stagehand_client.sessions.start = AsyncMock(return_value=stagehand_session_mock)
+ stagehand_client.__aenter__ = AsyncMock(return_value=stagehand_client)
+ stagehand_client.__aexit__ = AsyncMock()
+
+ async with async_playwright() as playwright:
+ browser = await playwright.chromium.launch()
+        # Stagehand's CDP setup always provides a pre-existing context; simulate it.
+ await browser.new_context()
+
+ mock_playwright_cm = AsyncMock()
+ mock_playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
+ mock_playwright_cm.__aexit__ = AsyncMock()
+
+ with (
+ patch(f'{_PLUGIN_MODULE}.AsyncStagehand', return_value=stagehand_client),
+ patch(f'{_PLUGIN_MODULE}.async_playwright', return_value=mock_playwright_cm),
+ patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)),
+ ):
+ yield StagehandCrawler()
+
+
+def test_init_raises_with_browser_pool_and_params() -> None:
+ pool = BrowserPool(plugins=[StagehandBrowserPlugin()])
+
+ with pytest.raises(ValueError, match=r'Cannot specify'):
+ StagehandCrawler(browser_pool=pool, stagehand_options=StagehandOptions())
+
+ with pytest.raises(ValueError, match=r'Cannot specify'):
+ StagehandCrawler(browser_pool=pool, headless=True)
+
+ with pytest.raises(ValueError, match=r'Cannot specify'):
+ StagehandCrawler(browser_pool=pool, browser_launch_options={'headless': True})
+
+
+def test_init_raises_with_mixed_plugins_in_browser_pool() -> None:
+ pool = BrowserPool(plugins=[StagehandBrowserPlugin(), PlaywrightBrowserPlugin()])
+
+ with pytest.raises(ValueError, match=r'All BrowserPool plugins must be StagehandBrowserPlugin instances'):
+ StagehandCrawler(browser_pool=pool)
+
+
+def test_init_with_valid_browser_pool() -> None:
+ pool = BrowserPool(plugins=[StagehandBrowserPlugin()])
+ StagehandCrawler(browser_pool=pool)
+
+
+async def test_context_type(patched_crawler: StagehandCrawler, server_url: URL) -> None:
+ handler_mock = AsyncMock()
+ pre_navigation_hook_mock = AsyncMock()
+ post_navigation_hook_mock = AsyncMock()
+ patched_crawler.pre_navigation_hook(pre_navigation_hook_mock)
+ patched_crawler.post_navigation_hook(post_navigation_hook_mock)
+ patched_crawler.router.default_handler(handler_mock)
+
+ await patched_crawler.run([str(server_url)])
+
+ context = handler_mock.call_args[0][0]
+ assert isinstance(context, StagehandCrawlingContext)
+ assert isinstance(context, PlaywrightCrawlingContext)
+ assert isinstance(context.page, StagehandPage)
+
+
+async def test_pre_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None:
+ pre_nav_hook_mock = AsyncMock()
+ patched_crawler.pre_navigation_hook(pre_nav_hook_mock)
+ patched_crawler.router.default_handler(AsyncMock())
+
+ await patched_crawler.run([str(server_url)])
+
+ pre_nav_hook_mock.assert_called_once()
+ context = pre_nav_hook_mock.call_args[0][0]
+ assert isinstance(context, StagehandPreNavCrawlingContext)
+ assert isinstance(context.page, StagehandPage)
+
+
+async def test_post_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None:
+ post_nav_hook_mock = AsyncMock()
+ patched_crawler.post_navigation_hook(post_nav_hook_mock)
+ patched_crawler.router.default_handler(AsyncMock())
+
+ await patched_crawler.run([str(server_url)])
+
+ post_nav_hook_mock.assert_called_once()
+ context = post_nav_hook_mock.call_args[0][0]
+ assert isinstance(context, StagehandPostNavCrawlingContext)
+ assert isinstance(context.page, StagehandPage)
+
+
+async def test_stagehand_page_ai_methods_delegate_to_session(
+ patched_crawler: StagehandCrawler,
+ stagehand_session_mock: MagicMock,
+ server_url: URL,
+) -> None:
+ @patched_crawler.router.default_handler
+ async def handler(context: StagehandCrawlingContext) -> None:
+ await context.page.act(input='click button')
+ await context.page.extract(instruction='get title')
+ await context.page.observe(instruction='find links')
+ await context.page.execute(agent_config={}, execute_options={'instruction': 'run script'})
+
+ await patched_crawler.run([str(server_url)])
+
+ for method_name, argument in (
+ ('act', 'input'),
+ ('extract', 'instruction'),
+ ('observe', 'instruction'),
+ ('execute', 'execute_options'),
+ ):
+ method_mock = getattr(stagehand_session_mock, method_name)
+ method_mock.assert_awaited_once()
+ assert isinstance(method_mock.call_args.kwargs['page'], StagehandPage)
+
+ assert argument in method_mock.call_args.kwargs
diff --git a/uv.lock b/uv.lock
index a9e945e853..4c8d26d318 100644
--- a/uv.lock
+++ b/uv.lock
@@ -825,6 +825,7 @@ all = [
{ name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "sqlalchemy", extra = ["asyncio"] },
+ { name = "stagehand" },
{ name = "typer" },
{ name = "wrapt" },
]
@@ -879,6 +880,12 @@ sql-sqlite = [
{ name = "aiosqlite" },
{ name = "sqlalchemy", extra = ["asyncio"] },
]
+stagehand = [
+ { name = "apify-fingerprint-datapoints" },
+ { name = "browserforge" },
+ { name = "playwright" },
+ { name = "stagehand" },
+]
[package.dev-dependencies]
dev = [
@@ -916,16 +923,18 @@ requires-dist = [
{ name = "apify-fingerprint-datapoints", marker = "extra == 'adaptive-crawler'", specifier = ">=0.0.3" },
{ name = "apify-fingerprint-datapoints", marker = "extra == 'httpx'", specifier = ">=0.0.2" },
{ name = "apify-fingerprint-datapoints", marker = "extra == 'playwright'", specifier = ">=0.0.2" },
+ { name = "apify-fingerprint-datapoints", marker = "extra == 'stagehand'", specifier = ">=0.0.2" },
{ name = "async-timeout", specifier = ">=5.0.1" },
{ name = "asyncpg", marker = "extra == 'sql-postgres'", specifier = ">=0.24.0" },
{ name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'beautifulsoup'", specifier = ">=4.12.0" },
{ name = "browserforge", marker = "extra == 'adaptive-crawler'", specifier = ">=1.2.4" },
{ name = "browserforge", marker = "extra == 'httpx'", specifier = ">=1.2.3" },
{ name = "browserforge", marker = "extra == 'playwright'", specifier = ">=1.2.3" },
+ { name = "browserforge", marker = "extra == 'stagehand'", specifier = ">=1.2.3" },
{ name = "cachetools", specifier = ">=5.5.0" },
{ name = "colorama", specifier = ">=0.4.0" },
{ name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" },
- { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "redis"], marker = "extra == 'all'" },
+ { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "stagehand", "redis"], marker = "extra == 'all'" },
{ name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" },
{ name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" },
{ name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" },
@@ -943,6 +952,7 @@ requires-dist = [
{ name = "parsel", marker = "extra == 'parsel'", specifier = ">=1.10.0" },
{ name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" },
{ name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" },
+ { name = "playwright", marker = "extra == 'stagehand'", specifier = ">=1.27.0" },
{ name = "protego", specifier = ">=0.5.0" },
{ name = "psutil", specifier = ">=6.0.0" },
{ name = "pydantic", specifier = ">=2.11.0" },
@@ -954,13 +964,14 @@ requires-dist = [
{ name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-mysql'", specifier = ">=2.0.0,<3.0.0" },
{ name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-postgres'", specifier = ">=2.0.0,<3.0.0" },
{ name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" },
+ { name = "stagehand", marker = "extra == 'stagehand'", specifier = ">=3.19.5" },
{ name = "tldextract", specifier = ">=5.1.0" },
{ name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" },
{ name = "typing-extensions", specifier = ">=4.1.0" },
{ name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" },
{ name = "yarl", specifier = ">=1.18.0" },
]
-provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "sql-sqlite", "sql-mysql", "redis"]
+provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stagehand", "sql-sqlite", "sql-mysql", "redis"]
[package.metadata.requires-dev]
dev = [
@@ -1154,6 +1165,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" },
]
+[[package]]
+name = "distro"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
+]
+
[[package]]
name = "docspec"
version = "2.2.1"
@@ -3712,6 +3732,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
+]
+
[[package]]
name = "sortedcontainers"
version = "2.4.0"
@@ -3795,6 +3824,26 @@ asyncio = [
{ name = "greenlet" },
]
+[[package]]
+name = "stagehand"
+version = "3.19.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "anyio" },
+ { name = "distro" },
+ { name = "httpx" },
+ { name = "pydantic" },
+ { name = "sniffio" },
+ { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/f8/ccd2bb2758a4eaf0af3846e097ff206e0aa76c8d3b5aa2bded77fb47825e/stagehand-3.19.5.tar.gz", hash = "sha256:3cb8279ac82051e584b34d26e87dc764f0ccad766a01625198ca578eb35f0b6c", size = 281033, upload-time = "2026-04-03T20:21:09.792Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/d1/6f/a47bad258bfafc193ebb8e0e8c440e8028c9ab28b54a333b46aa3c0cff53/stagehand-3.19.5-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:14f39a4f8d30d77c089166185c705f66aade25432b903a663a937b3747439c26", size = 34495874, upload-time = "2026-04-03T20:21:07.366Z" },
+ { url = "https://files.pythonhosted.org/packages/72/f7/e39868903121f1a80ae6eda088383362cd2d3a578c04493a2f83c1aac1da/stagehand-3.19.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80ed0d732cb9c3e952ad851e071dad5775a9ea88d2787c006289d61097fd2609", size = 33193535, upload-time = "2026-04-03T20:21:18.536Z" },
+ { url = "https://files.pythonhosted.org/packages/c8/0b/35cb92bb53e9539c0147892dbd0a227b43bf0d8adcd0a8e867dc5f2bf7fd/stagehand-3.19.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:aa947a5f6241f5953ac238cd9b0ab72e0cb87f559f97e5ee875f83dbc0c351d1", size = 37273148, upload-time = "2026-04-03T20:21:11.939Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/c7/dccf63cba1941b5710dc9968218e2883a937cf6534d644bb0c5222d3f40a/stagehand-3.19.5-py3-none-win_amd64.whl", hash = "sha256:e37bf630b99b4a9b7d95f151c56b296940db88b3049b68f0abb56f9e31cc6095", size = 30758357, upload-time = "2026-04-03T20:21:15.121Z" },
+]
+
[[package]]
name = "text-unidecode"
version = "1.3"