diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx index 0f1b235b60..4fefc0354e 100644 --- a/docs/guides/architecture_overview.mdx +++ b/docs/guides/architecture_overview.mdx @@ -53,6 +53,8 @@ class PlaywrightCrawler class AdaptivePlaywrightCrawler +class StagehandCrawler + %% ======================== %% Inheritance arrows %% ======================== @@ -63,6 +65,7 @@ BasicCrawler --|> AdaptivePlaywrightCrawler AbstractHttpCrawler --|> HttpCrawler AbstractHttpCrawler --|> ParselCrawler AbstractHttpCrawler --|> BeautifulSoupCrawler +PlaywrightCrawler --|> StagehandCrawler ``` ### HTTP crawlers @@ -79,7 +82,19 @@ You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawl ### Browser crawlers -Browser crawlers use a real browser to render pages, enabling scraping of sites that require JavaScript. They manage browser instances, pages, and context lifecycles. Currently, the only browser crawler is `PlaywrightCrawler`, which utilizes the [Playwright](https://playwright.dev/) library. Playwright provides a high-level API for controlling and navigating browsers. You can learn more about `PlaywrightCrawler`, its features, and how it internally manages browser instances in the [Playwright crawler guide](./playwright-crawler). +Browser crawlers use a real browser to render pages, enabling scraping of sites that require +JavaScript. They manage browser instances, pages, and context lifecycles. Crawlee provides +two browser crawlers: + +- `PlaywrightCrawler` utilizes the + [Playwright](https://playwright.dev/) library and provides a high-level API for controlling + and navigating browsers. You can learn more about it in the + [Playwright crawler guide](./playwright-crawler). +- `StagehandCrawler` extends + `PlaywrightCrawler` with AI-powered browser automation via + [Stagehand](https://github.com/browserbase/stagehand). 
It adds natural-language methods + (`act`, `extract`, `observe`, `execute`) directly on the page object. You can learn more + about it in the [Stagehand crawler guide](./stagehand-crawler). ### Adaptive crawler @@ -122,6 +137,12 @@ class AdaptivePlaywrightPreNavCrawlingContext class AdaptivePlaywrightCrawlingContext +class StagehandPreNavCrawlingContext + +class StagehandPostNavCrawlingContext + +class StagehandCrawlingContext + %% ======================== %% Inheritance arrows %% ======================== @@ -143,6 +164,12 @@ PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext BasicCrawlingContext --|> AdaptivePlaywrightPreNavCrawlingContext ParsedHttpCrawlingContext --|> AdaptivePlaywrightCrawlingContext + +PlaywrightPreNavCrawlingContext --|> StagehandPreNavCrawlingContext + +StagehandPreNavCrawlingContext --|> StagehandPostNavCrawlingContext + +StagehandPostNavCrawlingContext --|> StagehandCrawlingContext ``` They have a similar inheritance structure as the crawlers, with the base class being `BasicCrawlingContext`. The specific crawling contexts are: @@ -154,6 +181,12 @@ They have a similar inheritance structure as the crawlers, with the base class b - `PlaywrightCrawlingContext` for Playwright crawlers. - `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated. - `AdaptivePlaywrightCrawlingContext` for Adaptive Playwright crawlers. +- `StagehandPreNavCrawlingContext` + for Stagehand crawlers before the page is navigated. +- `StagehandPostNavCrawlingContext` + for Stagehand crawlers after the page is navigated. +- `StagehandCrawlingContext` + for Stagehand crawlers. 
## Storages diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py b/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py deleted file mode 100644 index 67b76f1f47..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, cast - -from stagehand.context import StagehandContext -from typing_extensions import override - -from crawlee.browsers import ( - PlaywrightBrowserController, - PlaywrightBrowserPlugin, - PlaywrightPersistentBrowser, -) - -from .support_classes import CrawleeStagehandPage - -if TYPE_CHECKING: - from collections.abc import Mapping - - from playwright.async_api import Page - from stagehand import Stagehand - - from crawlee.proxy_configuration import ProxyInfo - - -class StagehandBrowserController(PlaywrightBrowserController): - @override - def __init__( - self, browser: PlaywrightPersistentBrowser, stagehand: Stagehand, **kwargs: Any - ) -> None: - # Initialize with browser context instead of browser instance - super().__init__(browser, **kwargs) - - self._stagehand = stagehand - self._stagehand_context: StagehandContext | None = None - - @override - async def new_page( - self, - browser_new_context_options: Mapping[str, Any] | None = None, - proxy_info: ProxyInfo | None = None, - ) -> Page: - # Initialize browser context if not already done - if not self._browser_context: - self._browser_context = await self._create_browser_context( - browser_new_context_options=browser_new_context_options, - proxy_info=proxy_info, - ) - - # Initialize Stagehand context if not already done - if not self._stagehand_context: - 
self._stagehand_context = await StagehandContext.init( - self._browser_context, self._stagehand - ) - - # Create a new page using Stagehand context - page = await self._stagehand_context.new_page() - - pw_page = page._page # noqa: SLF001 - - # Handle page close event - pw_page.on(event='close', f=self._on_page_close) - - # Update internal state - self._pages.append(pw_page) - self._last_page_opened_at = datetime.now(timezone.utc) - - self._total_opened_pages += 1 - - # Wrap StagehandPage to provide Playwright Page interface - return cast('Page', CrawleeStagehandPage(page)) - - -class StagehandPlugin(PlaywrightBrowserPlugin): - """Browser plugin that integrates Stagehand with Crawlee's browser management.""" - - @override - def __init__(self, stagehand: Stagehand, **kwargs: Any) -> None: - super().__init__(**kwargs) - - self._stagehand = stagehand - - @override - async def new_browser(self) -> StagehandBrowserController: - if not self._playwright: - raise RuntimeError('Playwright browser plugin is not initialized.') - - browser = PlaywrightPersistentBrowser( - # Stagehand can run only on a Chromium-based browser. 
- self._playwright.chromium, - self._user_data_dir, - self._browser_launch_options, - ) - - # Return custom controller with Stagehand - return StagehandBrowserController( - browser=browser, - stagehand=self._stagehand, - header_generator=None, - fingerprint_generator=None, - ) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py b/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py deleted file mode 100644 index 6cf8cc2689..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from typing import cast - -from stagehand import StagehandConfig, StagehandPage - -from crawlee import ConcurrencySettings -from crawlee.browsers import BrowserPool -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -from .browser_classes import StagehandPlugin -from .support_classes import CrawleeStagehand - - -async def main() -> None: - # Configure local Stagehand with Gemini model - config = StagehandConfig( - env='LOCAL', - model_name='google/gemini-2.5-flash-preview-05-20', - model_api_key=os.getenv('GEMINI_API_KEY'), - ) - - # Create Stagehand instance - stagehand = CrawleeStagehand(config) - - # Create crawler with custom browser pool using Stagehand - crawler = PlaywrightCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=10, - # Custom browser pool. Gives users full control over browsers used by the crawler. - concurrency_settings=ConcurrencySettings(max_tasks_per_minute=10), - browser_pool=BrowserPool( - plugins=[ - StagehandPlugin(stagehand, browser_launch_options={'headless': True}) - ], - ), - ) - - # Define the default request handler, which will be called for every request. 
- @crawler.router.default_handler - async def request_handler(context: PlaywrightCrawlingContext) -> None: - context.log.info(f'Processing {context.request.url} ...') - - # Cast to StagehandPage for proper type hints in IDE - page = cast('StagehandPage', context.page) - - # Use regular Playwright method - playwright_title = await page.title() - context.log.info(f'Playwright page title: {playwright_title}') - - # highlight-start - # Use AI-powered extraction with natural language - gemini_title = await page.extract('Extract page title') - context.log.info(f'Gemini page title: {gemini_title}') - # highlight-end - - await context.enqueue_links() - - # Run the crawler with the initial list of URLs. - await crawler.run(['https://crawlee.dev/']) - - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py b/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py deleted file mode 100644 index cccb62e989..0000000000 --- a/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from stagehand import Stagehand, StagehandPage - -if TYPE_CHECKING: - from types import TracebackType - - -class CrawleeStagehandPage: - """StagehandPage wrapper for Crawlee.""" - - def __init__(self, page: StagehandPage) -> None: - self._page = page - - async def goto( - self, - url: str, - *, - referer: str | None = None, - timeout: int | None = None, - wait_until: str | None = None, - ) -> Any: - """Navigate to the specified URL.""" - # Override goto to return navigation result that `PlaywrightCrawler` expects - return await self._page._page.goto( # noqa: SLF001 - url, - referer=referer, - timeout=timeout, - wait_until=wait_until, - ) - - def __getattr__(self, name: str) -> Any: - """Delegate all other methods to the underlying StagehandPage.""" - return getattr(self._page, 
name) - - async def __aenter__(self) -> CrawleeStagehandPage: - """Enter the context manager.""" - return self - - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - exc_traceback: TracebackType | None, - ) -> None: - await self._page.close() - - -class CrawleeStagehand(Stagehand): - """Stagehand wrapper for Crawlee to disable the launch of Playwright.""" - - async def init(self) -> None: - # Skip Stagehand's own Playwright initialization - # Let Crawlee's PlaywrightBrowserPlugin manage the browser lifecycle - self._initialized = True diff --git a/docs/guides/code_examples/stagehand_crawler/basic_example.py b/docs/guides/code_examples/stagehand_crawler/basic_example.py new file mode 100644 index 0000000000..f6cbc10736 --- /dev/null +++ b/docs/guides/code_examples/stagehand_crawler/basic_example.py @@ -0,0 +1,47 @@ +import asyncio +from typing import cast + +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext + + +async def main() -> None: + crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + model_api_key='your-openai-api-key', + model='openai/gpt-4.1-mini', + ), + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Dismiss overlays or interact with the page using natural language. + await context.page.act(input='Click the accept cookies button if present') + + # Extract data from the page using AI. 
+ extracted = await context.page.extract( + instruction='Get the page title and the main heading text', + schema={ + 'type': 'object', + 'properties': { + 'title': {'type': 'string'}, + 'heading': {'type': 'string'}, + }, + }, + ) + + extract_result = extracted.data.result + + if isinstance(extract_result, dict): + # Push extracted data to the dataset + # Use `cast()` to provide a more specific type hint for the extracted data. + await context.push_data(cast('dict[str, str | None]', extract_result)) + + await crawler.run(['https://example.com']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/stagehand_crawler/browserbase_example.py b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py new file mode 100644 index 0000000000..ac465aba73 --- /dev/null +++ b/docs/guides/code_examples/stagehand_crawler/browserbase_example.py @@ -0,0 +1,37 @@ +import asyncio +from typing import cast + +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext + + +async def main() -> None: + # Use Browserbase cloud browser instead of a local Chromium instance. 
+ crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + env='BROWSERBASE', + browserbase_api_key='your-browserbase-api-key', + project_id='your-project-id', + model_api_key='your-openai-api-key', + model='openai/gpt-4.1-mini', + ), + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + extracted = await context.page.extract( + instruction='Get the main content of the page', + ) + + extract_result = extracted.data.result + + await context.push_data(cast('dict[str, str | None]', extract_result)) + + await crawler.run(['https://example.com']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/playwright_crawler_stagehand.mdx b/docs/guides/playwright_crawler_stagehand.mdx deleted file mode 100644 index 59a34b4cd2..0000000000 --- a/docs/guides/playwright_crawler_stagehand.mdx +++ /dev/null @@ -1,66 +0,0 @@ ---- -id: playwright-crawler-stagehand -title: Playwright with Stagehand -description: How to integrate Stagehand AI-powered automation with PlaywrightCrawler. ---- - -import ApiLink from '@site/src/components/ApiLink'; -import CodeBlock from '@theme/CodeBlock'; - -import SupportClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/support_classes.py'; -import BrowserClasses from '!!raw-loader!./code_examples/playwright_crawler_stagehand/browser_classes.py'; -import StagehandRun from '!!raw-loader!./code_examples/playwright_crawler_stagehand/stagehand_run.py'; - -[Stagehand](https://docs.stagehand.dev/) is a framework that combines [Playwright](https://playwright.dev/python/) with AI-driven natural language understanding and decision-making capabilities. With Stagehand, you can use natural language instructions to interact with web pages instead of writing complex selectors and automation logic. 
- -Stagehand supports multiple AI models through [`LiteLLM`](https://docs.litellm.ai/docs/). This guide demonstrates how to integrate Stagehand with `PlaywrightCrawler` using [Gemini](https://ai.google.dev/gemini-api/docs) as the AI model provider. - -:::info - -This guide is based on stagehand-python v0.4.0 with local configuration settings and may not be compatible with newer versions. - -::: - -## Get Gemini API key - -You need to register with [Google AI Studio](https://aistudio.google.com/) and navigate to [Get API key](https://aistudio.google.com/app/apikey) to obtain your API key. - -## Create support classes for Stagehand - -To integrate Stagehand with Crawlee, you need to create wrapper classes that allow `PlaywrightBrowserPlugin` to manage the Playwright lifecycle. - -Create `CrawleeStagehand` - a custom Stagehand subclass that overrides the `init` method to prevent Stagehand from launching its own Playwright instance. - -Create `CrawleeStagehandPage` - a wrapper class for `StagehandPage` that implements the [Playwright Page](https://playwright.dev/python/docs/next/api/class-page) behavior expected by `PlaywrightCrawler`. - - - {SupportClasses} - - -## Create browser integration classes - -You need to create a custom browser plugin and controller that properly initialize Stagehand and obtain browser pages from `StagehandContext`. - -Create `StagehandPlugin` - a subclass of `PlaywrightBrowserPlugin` that holds the Stagehand instance and creates `PlaywrightPersistentBrowser` instances. - -Create `StagehandBrowserController` - a subclass of `PlaywrightBrowserController` that lazily initializes `StagehandContext` and creates new pages with AI capabilities on demand. 
- - - {BrowserClasses} - - -## Create a crawler - -Now you can create a `PlaywrightCrawler` that uses Stagehand's AI capabilities to interact with web pages using natural language commands: - - - {StagehandRun} - - -The integration works through several key components: -- `CrawleeStagehand` prevents Stagehand from launching its own Playwright instance, allowing Crawlee to manage the browser lifecycle -- `StagehandPlugin` extends the Playwright browser plugin to create Stagehand-enabled browser instances -- `StagehandBrowserController` uses `StagehandContext` to create pages with AI capabilities -- `CrawleeStagehandPage` provides interface compatibility between Stagehand pages and Crawlee's expectations - -In the request handler, you can use natural language commands like `page.extract('Extract title page')` to perform intelligent data extraction without writing complex selectors. diff --git a/docs/guides/stagehand_crawler.mdx b/docs/guides/stagehand_crawler.mdx new file mode 100644 index 0000000000..ca609684f9 --- /dev/null +++ b/docs/guides/stagehand_crawler.mdx @@ -0,0 +1,131 @@ +--- +id: stagehand-crawler +title: Stagehand crawler +description: Learn how to use StagehandCrawler for AI-powered browser automation and data extraction. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import BasicExample from '!!raw-loader!./code_examples/stagehand_crawler/basic_example.py'; +import BrowserbaseExample from '!!raw-loader!./code_examples/stagehand_crawler/browserbase_example.py'; + +A `StagehandCrawler` extends `PlaywrightCrawler` with AI-powered browser automation via [Stagehand](https://www.browserbase.com/stagehand). Instead of writing CSS selectors or XPath expressions, you describe what you want in plain English and the AI model takes care of the rest. 
+ +Each page in the crawling context is a `StagehandPage` - a drop-in replacement for the standard Playwright `Page` that adds four AI methods: + +- `page.act(**kwargs)` - perform an action using a natural language instruction +- `page.extract(**kwargs)` - extract structured data from the page using AI +- `page.observe(**kwargs)` - get a list of AI-suggested actions available on the page +- `page.execute(**kwargs)` - run an autonomous multi-step agent on the page + +All standard Playwright methods remain available alongside these AI methods. + +## When to use StagehandCrawler + +Use `StagehandCrawler` when: + +- **Selectors are brittle or unknown** - the AI can locate elements by their visual role or label rather than a specific CSS class. +- **Interactions are complex** - multi-step forms, dynamic menus, or context-dependent flows that are hard to script. +- **Rapid prototyping** - you want to build a scraper quickly without spending time reverse-engineering the page structure. + +For straightforward scraping tasks where the page structure is stable and well-known, `PlaywrightCrawler` is more efficient; read more in the [Playwright crawler guide](./playwright-crawler). + +## Installation + +`StagehandCrawler` requires the `stagehand` optional dependency group: + +```bash +pip install 'crawlee[stagehand]' +``` + +or with uv: + +```bash +uv add 'crawlee[stagehand]' +``` + +## Basic usage + +The example below demonstrates the typical usage pattern: dismiss cookie banners with `act()` and extract structured data with `extract()`. + + + {BasicExample} + + +## StagehandOptions configuration + +Stagehand-specific settings are provided via `StagehandOptions`. Pass the instance to the `stagehand_options` argument of `StagehandCrawler`. 
+ +## AI page operations + +### `act` - perform actions + +Use `act()` to interact with the page using a natural language instruction: + +```python +await context.page.act(input='Click the "Sign in" button') +``` + +### `extract` - structured data extraction + +Use `extract()` to pull structured data from the page. Pass a JSON Schema via the `schema` argument to enforce the output shape: + +```python +data = await context.page.extract( + instruction='Extract the top comment on this page', + schema={ + 'type': 'object', + 'properties': { + 'comment_text': {'type': 'string'}, + 'author': {'type': 'string'}, + }, + 'required': ['comment_text'], + }, +) +``` + +### `observe` - inspect available actions + +Use `observe()` to get AI-suggested actions currently available on the page. This is useful for debugging or building adaptive workflows: + +```python +actions = await context.page.observe( + instruction='What actions are available in the navigation menu?' +) +``` + +### `execute` - autonomous multi-step agent + +Use `execute()` for longer autonomous tasks that span multiple interactions: + +```python +result = await context.page.execute( + agent_config={}, + execute_options={ + 'instruction': 'Search for "web scraping" and return the titles of the first five results', + }) +``` + +## Browserbase integration + +By default, Stagehand launches a local Chromium browser. To use [Browserbase](https://www.browserbase.com/) - a managed cloud browser service - set `env='BROWSERBASE'` in `StagehandOptions` and supply the required credentials: + + + {BrowserbaseExample} + + +Browserbase credentials (`browserbase_api_key`, `project_id`) can also be provided via the `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` environment variables. + +## Browser configuration limitations + +Because Stagehand manages the browser session internally via CDP, only Chromium is supported. 
+Browser settings are limited to the subset accepted by Stagehand's `BrowserLaunchOptions` - +`headless`, `args`, `viewport`, `proxy`, `locale`, `executable_path`, and a few others. +Features like full browser fingerprinting (canvas, WebGL, screen properties) and incognito +pages are not supported. Fingerprint-consistent HTTP headers (`User-Agent`, `Accept`, `sec-ch-ua`) +are still injected automatically. + +## Conclusion + +This guide introduced `StagehandCrawler` and its AI page operations: `act()`, `extract()`, `observe()`, and `execute()`. You learned how to configure Stagehand via `StagehandOptions` and switch to Browserbase for cloud browser execution. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/pyproject.toml b/pyproject.toml index 699d68dffe..006769bcdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ ] [project.optional-dependencies] -all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"] +all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,stagehand,redis]"] adaptive-crawler = [ "jaro-winkler>=2.0.3", "playwright>=1.27.0", @@ -77,6 +77,12 @@ sql_postgres = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "asyncpg>=0.24.0" ] +stagehand = [ + "stagehand>=3.19.5", + "playwright>=1.27.0", + "apify_fingerprint_datapoints>=0.0.2", + "browserforge>=1.2.3", +] sql_sqlite = [ "sqlalchemy[asyncio]>=2.0.0,<3.0.0", "aiosqlite>=0.21.0", diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index 0ea1101df4..cef3446690 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -17,6 +17,13 @@ with _try_import(__name__, 'PlaywrightPersistentBrowser'): from 
._playwright_browser import PlaywrightPersistentBrowser +with _try_import(__name__, 'StagehandBrowserController'): + from ._stagehand_browser_controller import StagehandBrowserController +with _try_import(__name__, 'StagehandBrowserPlugin'): + from ._stagehand_browser_plugin import StagehandBrowserPlugin +with _try_import(__name__, 'StagehandOptions', 'StagehandPage'): + from ._stagehand_types import StagehandOptions, StagehandPage + __all__ = [ 'BrowserPool', @@ -25,4 +32,8 @@ 'PlaywrightBrowserController', 'PlaywrightBrowserPlugin', 'PlaywrightPersistentBrowser', + 'StagehandBrowserController', + 'StagehandBrowserPlugin', + 'StagehandOptions', + 'StagehandPage', ] diff --git a/src/crawlee/browsers/_stagehand_browser_controller.py b/src/crawlee/browsers/_stagehand_browser_controller.py new file mode 100644 index 0000000000..950c3b0823 --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_controller.py @@ -0,0 +1,292 @@ +from __future__ import annotations + +from asyncio import Lock +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.browsers._browser_controller import BrowserController +from crawlee.browsers._stagehand_types import StagehandPage +from crawlee.fingerprint_suite import HeaderGenerator +from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type + +if TYPE_CHECKING: + from collections.abc import Mapping + + from playwright.async_api import Browser, BrowserContext, Playwright + from stagehand import AsyncSession, AsyncStagehand + + from crawlee.browsers._stagehand_types import StagehandOptions + from crawlee.browsers._types import BrowserType + from crawlee.proxy_configuration import ProxyInfo + + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserController(BrowserController): + 
"""Controller for managing a Stagehand-controlled browser instance. + + It creates and connects to the browser lazily on the first ``new_page`` call: Stagehand + starts a session, and Playwright then connects to it via CDP. All pages share a single + browser context, as Stagehand creates the browser and its context together during session + initialisation. + """ + + AUTOMATION_LIBRARY = 'stagehand' + _DEFAULT_HEADER_GENERATOR = HeaderGenerator() + + def __init__( + self, + *, + playwright: Playwright, + stagehand_client: AsyncStagehand, + stagehand_options: StagehandOptions, + max_open_pages_per_browser: int = 20, + header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, + ) -> None: + """Initialize a new instance. + + Args: + playwright: Active Playwright instance used to connect to the browser via CDP. + stagehand_client: Active Stagehand REST client used to start and end sessions. + stagehand_options: Stagehand-specific configuration (model, env, self-heal, etc.). + max_open_pages_per_browser: Maximum number of pages that can be open at the same time. + header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for + requests made by the browser. By default, a predefined header generator is used. Set to `None` to + disable automatic header modifications. 
+ """ + self._playwright = playwright + self._stagehand_client = stagehand_client + self._stagehand_options = stagehand_options + self._max_open_pages_per_browser = max_open_pages_per_browser + self._header_generator = header_generator + + self._session: AsyncSession | None = None + self._browser: Browser | None = None + self._browser_context: BrowserContext | None = None + self._context_creation_lock = Lock() + + self._pages = list[StagehandPage]() + self._total_opened_pages = 0 + self._opening_pages_count = 0 + self._last_page_opened_at = datetime.now(timezone.utc) + + @property + @override + def pages(self) -> list[StagehandPage]: + return self._pages + + @property + @override + def total_opened_pages(self) -> int: + return self._total_opened_pages + + @property + @override + def pages_count(self) -> int: + return len(self._pages) + + @property + @override + def last_page_opened_at(self) -> datetime: + return self._last_page_opened_at + + @property + @override + def idle_time(self) -> timedelta: + return datetime.now(timezone.utc) - self._last_page_opened_at + + @property + @override + def has_free_capacity(self) -> bool: + return (self.pages_count + self._opening_pages_count) < self._max_open_pages_per_browser + + @property + @override + def is_browser_connected(self) -> bool: + # Session not yet started - controller is available for new pages. + return self._browser is None or self._browser.is_connected() + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + @override + async def new_page( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> StagehandPage: + """Create a new page in the Stagehand-managed browser. + + On the first call, starts the Stagehand session with the provided options. On subsequent + calls, ``browser_new_context_options`` and ``proxy_info`` are ignored because the session + context cannot be reconfigured once it is running. 
+ + Args: + browser_new_context_options: Options merged on top of the plugin's launch options + when creating the first session. Ignored if the session already exists. + proxy_info: Proxy injected into the session on first creation. + + Raises: + ValueError: If the browser has reached the maximum number of open pages. + RuntimeError: If the Stagehand session fails to initialise. + """ + if not self.has_free_capacity: + raise ValueError('Cannot open more pages in this browser.') + + self._opening_pages_count += 1 + try: + # Lazily start a Stagehand session on the first page request, passing proxy and context options + # at that point. + if not self._browser_context: + async with self._context_creation_lock: + if not self._browser_context: + self._browser_context = await self._create_browser_context( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + + if self._browser is None or self._session is None or self._browser_context is None: + raise RuntimeError('Failed to initialize the browser session.') + + raw_page = await self._browser_context.new_page() + page = StagehandPage(raw_page, self._session) + raw_page.on('close', lambda _: self._on_page_close(page)) + + self._pages.append(page) + self._last_page_opened_at = datetime.now(timezone.utc) + self._total_opened_pages += 1 + finally: + self._opening_pages_count -= 1 + + return page + + @override + async def close(self, *, force: bool = False) -> None: + if self.pages_count > 0 and not force: + raise ValueError('Cannot close the browser while there are open pages.') + + if self._session is None and self._browser is None: + return + + try: + if self._session is not None: + try: + await self._session.end() + except Exception: + logger.warning('Failed to end Stagehand session gracefully.', exc_info=True) + + if self._browser is not None and self._browser.is_connected(): + await self._browser.close() + finally: + self._session = None + self._browser_context = None + + def 
_on_page_close(self, page: StagehandPage) -> None: + self._pages.remove(page) + + def _get_extra_http_headers( + self, + extra_http_headers: Mapping[str, str] | None = None, + ) -> dict[str, str]: + if extra_http_headers is not None: + return dict(extra_http_headers) + + if self._header_generator: + generated_headers = self._header_generator.get_specific_headers( + header_names={ + 'Accept', + 'Accept-Language', + 'User-Agent', + 'sec-ch-ua', + 'sec-ch-ua-mobile', + 'sec-ch-ua-platform', + }, + browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type), + ) + else: + generated_headers = {} + + return dict(generated_headers) + + def _build_session_start_params( + self, + browser_new_context_options: dict[str, Any], + proxy_info: ProxyInfo | None = None, + ) -> dict[str, Any]: + session_start_params: dict[str, Any] = { + 'model_name': self._stagehand_options.model, + 'verbose': self._stagehand_options.verbose, + 'self_heal': self._stagehand_options.self_heal, + } + launch_options = dict(browser_new_context_options) + + if self._stagehand_options.dom_settle_timeout_ms is not None: + session_start_params['dom_settle_timeout_ms'] = self._stagehand_options.dom_settle_timeout_ms + + if self._stagehand_options.system_prompt is not None: + session_start_params['system_prompt'] = self._stagehand_options.system_prompt + + if self._stagehand_options.env == 'LOCAL': + if proxy_info: + launch_options['proxy'] = { + 'server': f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + session_start_params['browser'] = {'type': 'local', 'launch_options': launch_options} + else: + session_start_params['browser'] = {'type': 'browserbase', 'launch_options': launch_options} + if proxy_info: + session_start_params['browserbase_session_create_params'] = { + 'proxies': [ + { + 'type': 'external', + 'server': 
f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + 'username': proxy_info.username or '', + 'password': proxy_info.password or '', + } + ] + } + + return session_start_params + + async def _create_browser_context( + self, + browser_new_context_options: Mapping[str, Any] | None = None, + proxy_info: ProxyInfo | None = None, + ) -> BrowserContext: + browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} + + extra_http_headers = self._get_extra_http_headers(browser_new_context_options.pop('extra_http_headers', None)) + + session_start_params = self._build_session_start_params( + browser_new_context_options=browser_new_context_options, + proxy_info=proxy_info, + ) + + session: AsyncSession = await self._stagehand_client.sessions.start(**session_start_params) + + cdp_url = session.data.cdp_url + if not cdp_url: + raise RuntimeError( + f'No cdp_url returned from Stagehand (env={self._stagehand_options.env!r}). ' + 'Cannot connect Playwright to the browser.' 
+ ) + + self._browser = await self._playwright.chromium.connect_over_cdp(cdp_url) + + context = self._browser.contexts[0] + + await context.set_extra_http_headers(extra_http_headers) + + self._session = session + + return context diff --git a/src/crawlee/browsers/_stagehand_browser_plugin.py b/src/crawlee/browsers/_stagehand_browser_plugin.py new file mode 100644 index 0000000000..3bfb72b09f --- /dev/null +++ b/src/crawlee/browsers/_stagehand_browser_plugin.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from playwright.async_api import Playwright, async_playwright +from stagehand import AsyncStagehand +from typing_extensions import override + +from crawlee import service_locator +from crawlee._utils.context import ensure_context +from crawlee._utils.docs import docs_group + +from ._browser_plugin import BrowserPlugin +from ._stagehand_browser_controller import StagehandBrowserController +from ._stagehand_types import StagehandOptions + +if TYPE_CHECKING: + from collections.abc import Mapping + from pathlib import Path + from types import TracebackType + + from ._browser_controller import BrowserController + from ._types import BrowserType + + +logger = getLogger(__name__) + + +@docs_group('Browser management') +class StagehandBrowserPlugin(BrowserPlugin): + """A plugin for managing Stagehand AI-powered browser automation. + + It acts as a factory for creating `StagehandBrowserController` instances and manages the + lifecycle of the shared `AsyncStagehand` REST client and the Playwright context. Depending + on the configured environment, the browser runs locally via a bundled Chromium binary + (``env='LOCAL'``) or in the Browserbase cloud (``env='BROWSERBASE'``). Playwright connects + to the running browser via CDP, so only Chromium is supported. + + Not all Playwright browser and context options are supported — only those accepted by + Stagehand's ``BrowserLaunchOptions``. 
Because Stagehand creates the browser and its context + together in a single ``sessions.start()`` call, both ``browser_launch_options`` and + ``browser_new_context_options`` are merged into one set of options applied at session start. + """ + + AUTOMATION_LIBRARY = 'stagehand' + + def __init__( + self, + *, + user_data_dir: str | Path | None = None, + stagehand_options: StagehandOptions | None = None, + browser_launch_options: dict[str, Any] | None = None, + browser_new_context_options: dict[str, Any] | None = None, + max_open_pages_per_browser: int = 20, + ) -> None: + """Initialize a new instance. + + Args: + user_data_dir: Path to a user data directory, which stores browser session data like + cookies and local storage. + stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). + browser_launch_options: Keyword arguments passed to Stagehand's ``BrowserLaunchOptions`` + on session start. Supported keys are a subset of Playwright's ``browser_type.launch`` + options. These take priority over ``browser_new_context_options`` for shared keys. + browser_new_context_options: Additional options merged with ``browser_launch_options`` + at lower priority. Subject to the same ``BrowserLaunchOptions`` constraints. + max_open_pages_per_browser: The maximum number of pages that can be open in a single + browser instance. Once reached, a new browser instance will be launched. + """ + config = service_locator.get_configuration() + + self._max_open_pages_per_browser = max_open_pages_per_browser + self._stagehand_options = stagehand_options or StagehandOptions() + + is_local = self._stagehand_options.env == 'LOCAL' + + # browser_launch_options take priority over browser_new_context_options for shared keys. 
+ self._browser_launch_options: dict[str, Any] = { + 'headless': config.headless, + 'chromium_sandbox': not config.disable_browser_sandbox, + **(browser_new_context_options or {}), + **(browser_launch_options or {}), + } + + if config.default_browser_path: + self._browser_launch_options.setdefault('executable_path', config.default_browser_path) + + if user_data_dir is not None: + self._browser_launch_options['user_data_dir'] = str(user_data_dir) + + # Parameters for AsyncStagehand. + self._stagehand_init_params: dict[str, Any] = { + 'server': 'local' if is_local else 'remote', + 'local_headless': self._browser_launch_options['headless'], + 'local_ready_timeout_s': self._stagehand_options.local_ready_timeout_s, + } + if self._stagehand_options.model_api_key is not None: + self._stagehand_init_params['model_api_key'] = self._stagehand_options.model_api_key + + if not is_local: + self._stagehand_init_params['browserbase_api_key'] = self._stagehand_options.browserbase_api_key + self._stagehand_init_params['browserbase_project_id'] = self._stagehand_options.project_id + + self._stagehand_client: AsyncStagehand | None = None + self._playwright_context_manager = async_playwright() + self._playwright: Playwright | None = None + self._active = False + + @property + @override + def active(self) -> bool: + return self._active + + @property + @override + def browser_type(self) -> BrowserType: + return 'chromium' + + @property + @override + def browser_launch_options(self) -> Mapping[str, Any]: + """Return the options passed to Stagehand's ``BrowserLaunchOptions`` on session start. + + These are a subset of Playwright's ``browser_type.launch`` options — only keys recognised + by Stagehand's ``BrowserLaunchOptions`` take effect. + """ + return self._browser_launch_options + + @property + @override + def browser_new_context_options(self) -> Mapping[str, Any]: + """Return the browser context options passed to Stagehand's ``BrowserLaunchOptions``. 
+ + Stagehand creates the browser and its context together in a single ``sessions.start()`` + call, so context-level options such as ``viewport`` and ``locale`` are part of + ``BrowserLaunchOptions`` and share the same dictionary as ``browser_launch_options``. + Pre-navigation hooks that modify these options before the first page will take effect, + because session creation is deferred until the first ``new_page`` call. + """ + return self._browser_launch_options + + @property + @override + def max_open_pages_per_browser(self) -> int: + return self._max_open_pages_per_browser + + @property + def stagehand_options(self) -> StagehandOptions: + """Return the Stagehand-specific configuration options.""" + return self._stagehand_options + + @override + async def __aenter__(self) -> StagehandBrowserPlugin: + if self._active: + raise RuntimeError(f'The {self.__class__.__name__} is already active.') + + self._active = True + self._playwright = await self._playwright_context_manager.__aenter__() + + if self._stagehand_options.env == 'LOCAL': + if 'executable_path' not in self._browser_launch_options: + chrome_path = self._playwright.chromium.executable_path + self._browser_launch_options['executable_path'] = chrome_path + logger.debug(f'Resolved Chromium path from Playwright: {chrome_path}') + + self._stagehand_init_params['local_chrome_path'] = self._browser_launch_options['executable_path'] + + client = AsyncStagehand(**self._stagehand_init_params) + await client.__aenter__() + self._stagehand_client = client + + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + if not self._active: + raise RuntimeError(f'The {self.__class__.__name__} is not active.') + + if self._stagehand_client is not None: + await self._stagehand_client.__aexit__(exc_type, exc_value, exc_traceback) + self._stagehand_client = None + + await 
self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + self._playwright_context_manager = async_playwright() + self._playwright = None + self._active = False + + @override + @ensure_context + async def new_browser(self) -> BrowserController: + if not self._playwright or not self._stagehand_client: + raise RuntimeError(f'{self.__class__.__name__} is not initialized.') + + return StagehandBrowserController( + playwright=self._playwright, + stagehand_client=self._stagehand_client, + stagehand_options=self._stagehand_options, + max_open_pages_per_browser=self._max_open_pages_per_browser, + ) diff --git a/src/crawlee/browsers/_stagehand_types.py b/src/crawlee/browsers/_stagehand_types.py new file mode 100644 index 0000000000..7df6f5fd64 --- /dev/null +++ b/src/crawlee/browsers/_stagehand_types.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Literal + +from playwright.async_api import Page + +from crawlee._utils.docs import docs_group + +if TYPE_CHECKING: + from stagehand import AsyncSession + from stagehand.types.session_act_params import SessionActParamsNonStreaming + from stagehand.types.session_act_response import SessionActResponse + from stagehand.types.session_execute_params import SessionExecuteParamsNonStreaming + from stagehand.types.session_execute_response import SessionExecuteResponse + from stagehand.types.session_extract_params import SessionExtractParamsNonStreaming + from stagehand.types.session_extract_response import SessionExtractResponse + from stagehand.types.session_observe_params import SessionObserveParamsNonStreaming + from stagehand.types.session_observe_response import SessionObserveResponse + from typing_extensions import Unpack + + +@dataclass +@docs_group('Browser management') +class StagehandOptions: + """Configuration options for Stagehand AI-powered browser automation. 
+ + Controls the execution environment (local or Browserbase cloud), the AI model used + for page operations, and session-level behaviour such as self-healing and DOM settling. + """ + + env: Literal['LOCAL', 'BROWSERBASE'] = 'LOCAL' + """Execution environment. + 'LOCAL' - Stagehand runs a local Chromium browser. + 'BROWSERBASE' - Stagehand uses a Browserbase cloud browser session. + """ + + browserbase_api_key: str | None = None + """API key for authenticating with Browserbase when `env='BROWSERBASE'`. If not provided, read from + the `BROWSERBASE_API_KEY` environment variable.""" + + model_api_key: str | None = None + """API key for the AI model provider (e.g. OpenAI, Anthropic). Must be provided explicitly - unlike + Browserbase credentials""" + + project_id: str | None = None + """Browserbase project ID, required when `env='BROWSERBASE'`. If not provided, read from + the `BROWSERBASE_PROJECT_ID` environment variable.""" + + model: str = 'openai/gpt-4.1-mini' + """The AI model to use for page operations.""" + + verbose: Literal[0, 1, 2] = 0 + """Verbosity level for logging Stagehand session activity. + 0 - quiet + 1 - normal + 2 - debug""" + + self_heal: bool = True + """When `True`, Stagehand automatically retries failed actions.""" + + dom_settle_timeout_ms: float | None = None + """Maximum time to wait for the DOM to settle before performing an action, in milliseconds.""" + + system_prompt: str | None = None + """Optional system prompt to guide the AI's behavior in sessions.""" + + local_ready_timeout_s: float = 10.0 + """Maximum time to wait for the local Stagehand browser to be ready, in seconds.""" + + +@docs_group('Browser management') +class StagehandPage(Page): + """A Playwright `Page` enhanced with Stagehand AI methods. + + Wraps a Playwright `Page` and an `AsyncSession`, proxying all standard Playwright + methods transparently while adding `act()`, `extract()`, `observe()`, and `execute()` + AI operations bound to the current page. 
+ """ + + def __init__(self, page: Page, session: AsyncSession) -> None: + super().__init__(page._impl_obj) # noqa: SLF001 + self._session = session + + async def act(self, **kwargs: Unpack[SessionActParamsNonStreaming]) -> SessionActResponse: + """Perform an action on the page using natural language. Argument `page` is automatically set. + + Args: + **kwargs: Parameters passed to ``AsyncSession.act()``. + The most common is ``input`` — a natural language instruction string or + ``ActionParam`` object, e.g. ``input='click the login button'``. + + Returns: + The action result from Stagehand. + """ + return await self._session.act(page=self, **kwargs) + + async def observe(self, **kwargs: Unpack[SessionObserveParamsNonStreaming]) -> SessionObserveResponse: + """Observe the page and get AI-suggested actions. Argument `page` is automatically set. + + Args: + **kwargs: Parameters passed to ``AsyncSession.observe()``. + Optionally pass ``instruction`` to narrow the observation scope. + + Returns: + Observation result with suggested actions. + """ + return await self._session.observe(page=self, **kwargs) + + async def extract(self, **kwargs: Unpack[SessionExtractParamsNonStreaming]) -> SessionExtractResponse: + """Extract structured data from the page using natural language. Argument `page` is automatically set. + + Args: + **kwargs: Parameters passed to ``AsyncSession.extract()``. + Common parameters: ``instruction`` and ``schema`` (JSON Schema dict). + + Returns: + Extracted data matching the requested schema. + """ + return await self._session.extract(page=self, **kwargs) + + async def execute(self, **kwargs: Unpack[SessionExecuteParamsNonStreaming]) -> SessionExecuteResponse: + """Run an autonomous multi-step AI agent on the page. Argument `page` is automatically set. + + Args: + **kwargs: Parameters passed to ``AsyncSession.execute()``. + Required parameters: + + - ``agent_config`` — agent behaviour settings (pass ``{}`` for defaults). 
+ Supports keys: ``model``, ``mode`` (``'dom'``/``'hybrid'``/``'cua'``), + ``system_prompt``, and others. + - ``execute_options`` — execution options dict with a required + ``instruction`` key: a natural language description of the multi-step + task to perform, e.g. + ``execute_options={'instruction': 'find the login form and sign in'}``. + + Returns: + The result of the agent execution. + """ + return await self._session.execute(page=self, **kwargs) diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py index 6445ad5d64..ac97581bb0 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -51,6 +51,20 @@ RenderingTypePredictor, ) +with _try_import( + __name__, + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', +): + from ._stagehand import ( + StagehandCrawler, + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, + ) + __all__ = [ 'AbstractHttpCrawler', @@ -81,4 +95,8 @@ 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor', + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', ] diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 5222181a6f..93cda854b9 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -5,7 +5,7 @@ import warnings from datetime import timedelta from functools import partial -from typing import TYPE_CHECKING, Any, Generic, Literal, cast +from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload import playwright.async_api from more_itertools import partition @@ -35,14 +35,11 @@ from ._types import BlockRequestsFunction, GotoOptions from ._utils import block_requests, infinite_scroll -TCrawlingContext = 
TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) -TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) - if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping from pathlib import Path - from playwright.async_api import Page, Route + from playwright.async_api import Page, Response, Route from playwright.async_api import Request as PlaywrightRequest from typing_extensions import Unpack @@ -57,8 +54,21 @@ from crawlee.browsers._types import BrowserType +TPreNavContext = TypeVar( + 'TPreNavContext', bound=PlaywrightPreNavCrawlingContext, default=PlaywrightPreNavCrawlingContext +) +TPostNavContext = TypeVar( + 'TPostNavContext', bound=PlaywrightPostNavCrawlingContext, default=PlaywrightPostNavCrawlingContext +) +TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext, default=PlaywrightCrawlingContext) +TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) + + @docs_group('Crawlers') -class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]): +class PlaywrightCrawler( + BasicCrawler[TCrawlingContext, StatisticsState], + Generic[TPreNavContext, TPostNavContext, TCrawlingContext], +): """A web crawler that leverages the `Playwright` browser automation library. The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. @@ -98,6 +108,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: ``` """ + # Define the context classes used in context factory. 
+ _PRE_NAV_CONTEXT_CLASS = PlaywrightPreNavCrawlingContext + _POST_NAV_CONTEXT_CLASS = PlaywrightPostNavCrawlingContext + _CRAWLING_CONTEXT_CLASS = PlaywrightCrawlingContext + def __init__( self, *, @@ -111,7 +126,7 @@ def __init__( headless: bool | None = None, use_incognito_pages: bool | None = None, navigation_timeout: timedelta | None = None, - **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]], + **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]], ) -> None: """Initialize a new instance. @@ -203,8 +218,8 @@ def __init__( ) kwargs['_additional_context_managers'] = [self._browser_pool] kwargs.setdefault('_logger', logging.getLogger(__name__)) - self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] - self._post_navigation_hooks: list[Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]] = [] + self._pre_navigation_hooks: list[Callable[[TPreNavContext], Awaitable[None]]] = [] + self._post_navigation_hooks: list[Callable[[TPostNavContext], Awaitable[None]]] = [] kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client'] @@ -229,31 +244,117 @@ async def _manage_shared_navigation_timeout( finally: self._shared_navigation_timeouts.pop(request_id, None) + @overload + def _build_context( + self, + context: BasicCrawlingContext, + *, + page: Page, + response: None = None, + ) -> TPreNavContext: ... + + @overload + def _build_context( + self, + context: TPreNavContext, + *, + response: Response, + ) -> TPostNavContext: ... + + @overload + def _build_context( + self, + context: TPostNavContext, + ) -> TCrawlingContext: ... 
+ + def _build_context( + self, + context: BasicCrawlingContext | PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext, + *, + page: Page | None = None, + response: Response | None = None, + ) -> PlaywrightPreNavCrawlingContext | PlaywrightPostNavCrawlingContext | PlaywrightCrawlingContext: + """Build the crawling context by adding Playwright-specific properties.""" + # Order is important, as all context subclasses each other, + # so the most specific context should be checked first. + match context: + # Create the final crawling context from `PostNavCrawlingContext` + case self._POST_NAV_CONTEXT_CLASS(): + extract_links = self._create_extract_links_function(context) + return self._CRAWLING_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + page=context.page, + goto_options=context.goto_options, + response=context.response, + infinite_scroll=lambda: infinite_scroll(context.page), + extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), + block_requests=context.block_requests, + ) + # Create the post-navigation context from `PreNavCrawlingContext` + case self._PRE_NAV_CONTEXT_CLASS(): + if response is None: + raise ValueError('Response must be provided for post-navigation context.') + + return self._POST_NAV_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + 
page=context.page, + block_requests=context.block_requests, + goto_options=context.goto_options, + response=response, + ) + # Create the pre-navigation context from `BasicCrawlingContext` + case BasicCrawlingContext(): + if page is None: + raise ValueError('Page must be provided for pre-navigation context.') + + return self._PRE_NAV_CONTEXT_CLASS( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + register_deferred_cleanup=context.register_deferred_cleanup, + page=page, + block_requests=cast('BlockRequestsFunction', partial(block_requests, page=page)), + goto_options=GotoOptions(**self._goto_options), + ) + case _: + raise ValueError('Invalid context type') + async def _open_page( self, context: BasicCrawlingContext, - ) -> AsyncGenerator[PlaywrightPreNavCrawlingContext, None]: + ) -> AsyncGenerator[TPreNavContext, None]: if self._browser_pool is None: raise ValueError('Browser pool is not initialized.') # Create a new browser page crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) - pre_navigation_context = PlaywrightPreNavCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=crawlee_page.page, - block_requests=cast('BlockRequestsFunction', partial(block_requests, page=crawlee_page.page)), - goto_options=GotoOptions(**self._goto_options), - ) + pre_navigation_context = self._build_context(context, page=crawlee_page.page) request_id = id(pre_navigation_context.request) @@ 
-290,8 +391,8 @@ async def route_handler(route: Route, _: PlaywrightRequest) -> None: async def _navigate( self, - context: PlaywrightPreNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, Exception | None]: + context: TPreNavContext, + ) -> AsyncGenerator[TPostNavContext, Exception | None]: """Execute an HTTP request utilizing the `BrowserPool` and the `Playwright` library. Args: @@ -352,24 +453,9 @@ async def _navigate( # Set the loaded URL to the actual URL after redirection. context.request.loaded_url = context.page.url - yield PlaywrightPostNavCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=context.page, - block_requests=context.block_requests, - goto_options=context.goto_options, - response=response, - ) + yield self._build_context(context, response=response) - def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: + def _create_extract_links_function(self, context: TPostNavContext) -> ExtractLinksFunction: """Create a callback function for extracting links from context. Args: @@ -451,9 +537,7 @@ async def extract_links( return extract_links - async def _handle_status_code_response( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + async def _handle_status_code_response(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]: """Validate the HTTP status code and raise appropriate exceptions if needed. 
Args: @@ -475,8 +559,8 @@ async def _handle_status_code_response( async def _handle_blocked_request_by_content( self, - context: PlaywrightPostNavCrawlingContext, - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + context: TPostNavContext, + ) -> AsyncGenerator[TPostNavContext, None]: """Try to detect if the request is blocked based on the response content. Args: @@ -502,9 +586,7 @@ async def _handle_blocked_request_by_content( yield context - async def _execute_post_navigation_hooks( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]: + async def _execute_post_navigation_hooks(self, context: TPostNavContext) -> AsyncGenerator[TPostNavContext, None]: request_id = id(context.request) for hook in self._post_navigation_hooks: @@ -513,36 +595,14 @@ async def _execute_post_navigation_hooks( yield context - async def _create_crawling_context( - self, context: PlaywrightPostNavCrawlingContext - ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: - extract_links = self._create_extract_links_function(context) - - yield PlaywrightCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=context.send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - register_deferred_cleanup=context.register_deferred_cleanup, - page=context.page, - goto_options=context.goto_options, - response=context.response, - infinite_scroll=lambda: infinite_scroll(context.page), - extract_links=extract_links, - enqueue_links=self._create_enqueue_links_function(context, extract_links), - block_requests=cast('BlockRequestsFunction', partial(block_requests, page=context.page)), - ) + async def _create_crawling_context(self, context: TPostNavContext) -> AsyncGenerator[TCrawlingContext, None]: + yield self._build_context(context) if 
context.session: pw_cookies = await self._get_cookies(context.page) context.session.cookies.set_cookies_from_playwright_format(pw_cookies) - def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None: + def pre_navigation_hook(self, hook: Callable[[TPreNavContext], Awaitable[None]]) -> None: """Register a hook to be called before each navigation. Args: @@ -550,7 +610,7 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], """ self._pre_navigation_hooks.append(hook) - def post_navigation_hook(self, hook: Callable[[PlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None: + def post_navigation_hook(self, hook: Callable[[TPostNavContext], Awaitable[None]]) -> None: """Register a hook to be called after each navigation. Args: diff --git a/src/crawlee/crawlers/_stagehand/__init__.py b/src/crawlee/crawlers/_stagehand/__init__.py new file mode 100644 index 0000000000..d64e39549b --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/__init__.py @@ -0,0 +1,25 @@ +from crawlee._utils.try_import import install_import_hook as _install_import_hook +from crawlee._utils.try_import import try_import as _try_import + +_install_import_hook(__name__) + +# The following imports are wrapped in try_import to handle optional dependencies, +# ensuring the module can still function even if these dependencies are missing. 
+with _try_import(__name__, 'StagehandCrawler'): + from ._stagehand_crawler import StagehandCrawler +with _try_import( + __name__, 'StagehandCrawlingContext', 'StagehandPostNavCrawlingContext', 'StagehandPreNavCrawlingContext' +): + from ._stagehand_crawling_context import ( + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, + ) + + +__all__ = [ + 'StagehandCrawler', + 'StagehandCrawlingContext', + 'StagehandPostNavCrawlingContext', + 'StagehandPreNavCrawlingContext', +] diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py new file mode 100644 index 0000000000..e155eed2ec --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawler.py @@ -0,0 +1,181 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +from crawlee._utils.docs import docs_group +from crawlee.browsers import BrowserPool +from crawlee.browsers._stagehand_browser_plugin import StagehandBrowserPlugin +from crawlee.crawlers import PlaywrightCrawler + +from ._stagehand_crawling_context import ( + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, +) + +if TYPE_CHECKING: + from datetime import timedelta + from pathlib import Path + + from typing_extensions import Unpack + + from crawlee.browsers import StagehandOptions + from crawlee.crawlers._basic import BasicCrawlerOptions + from crawlee.crawlers._playwright._types import GotoOptions + from crawlee.statistics import StatisticsState + + +@docs_group('Crawlers') +class StagehandCrawler( + PlaywrightCrawler[ + StagehandPreNavCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandCrawlingContext, + ] +): + """A web crawler that integrates Stagehand AI-powered browser automation with Crawlee. + + `StagehandCrawler` builds on top of `PlaywrightCrawler`, inheriting all of its features. 
+ It uses `StagehandBrowserPlugin` to manage Stagehand sessions. Stagehand creates and manages + the browser instance - either locally via a bundled Chromium binary, or remotely via Browserbase + cloud - and Playwright connects to it via the Chrome DevTools Protocol (CDP). + + Because Stagehand relies on CDP, only Chromium is supported. Not all Playwright browser and + context configuration options are available - browser settings are limited to the subset accepted + by Stagehand's ``BrowserLaunchOptions`` (such as ``headless``, ``args``, ``viewport``, ``proxy``, + ``locale``, and ``executable_path``). Full browser fingerprinting (canvas, WebGL, screen + properties) and incognito pages are not supported; fingerprint-consistent HTTP headers + (``User-Agent``, ``Accept``, ``sec-ch-ua``) are still injected automatically. + + Each page in the crawling context is a `StagehandPage`, which extends the standard Playwright + `Page` with the following AI methods: + + - `page.act(**kwargs)` - perform an action on the page using natural language + - `page.extract(**kwargs)` - extract structured data from the page with AI + - `page.observe(**kwargs)` - get AI-suggested actions available on the page + - `page.execute(**kwargs)` - run an autonomous multi-step agent + + Stagehand configuration (model, API key, environment) is provided via `stagehand_options`. + By default, the crawler runs locally using the `openai/gpt-4.1-mini` model. + + ### Usage + + ```python + import asyncio + from crawlee.crawlers import StagehandCrawler, StagehandCrawlingContext + from crawlee.browsers import StagehandOptions + + crawler = StagehandCrawler( + stagehand_options=StagehandOptions( + model_api_key='sk-...', + model='openai/gpt-4.1-mini', + ), + ) + + @crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Use standard Playwright methods alongside AI methods. 
+ await context.page.act(input='Click the accept cookies button if present') + + data = await context.page.extract(instruction='Get the article title and author') + + await context.push_data(data) + + asyncio.run(crawler.run(['https://example.com'])) + ``` + """ + + _PRE_NAV_CONTEXT_CLASS = StagehandPreNavCrawlingContext + _POST_NAV_CONTEXT_CLASS = StagehandPostNavCrawlingContext + _CRAWLING_CONTEXT_CLASS = StagehandCrawlingContext + + def __init__( + self, + *, + stagehand_options: StagehandOptions | None = None, + browser_pool: BrowserPool | None = None, + user_data_dir: str | Path | None = None, + headless: bool | None = None, + browser_launch_options: dict[str, Any] | None = None, + browser_new_context_options: dict[str, Any] | None = None, + goto_options: GotoOptions | None = None, + navigation_timeout: timedelta | None = None, + **kwargs: Unpack[BasicCrawlerOptions[StagehandCrawlingContext, StatisticsState]], + ) -> None: + """Initialize a new instance. + + Args: + stagehand_options: Stagehand-specific configuration (model, API key, env, etc.). + Cannot be specified if `browser_pool` is provided. + browser_pool: A pre-configured `BrowserPool`. All plugins must be instances of + `StagehandBrowserPlugin`. If omitted, a pool is created automatically from the + other browser arguments. + user_data_dir: Path to a user data directory, which stores browser session data like + cookies and local storage. Cannot be specified if `browser_pool` is provided. + headless: Whether to run the browser in headless mode. Defaults to the value from + Crawlee's global `Configuration`. Cannot be specified if `browser_pool` is provided. + browser_launch_options: Keyword arguments for browser launch passed to Stagehand's + `BrowserLaunchOptions` (a subset of Playwright's launch options). Supported keys + include `args`, `executable_path`, `proxy`, `viewport`, `locale`, and others. + Cannot be specified if `browser_pool` is provided. 
+ browser_new_context_options: Keyword arguments for browser context creation, merged + with `browser_launch_options`. Options that map to `BrowserLaunchOptions` take + effect on the first page; subsequent pages reuse the existing session context. + Cannot be specified if `browser_pool` is provided. + goto_options: Additional options passed to Stagehand's `Page.goto()`. The `timeout` + option is not supported - use `navigation_timeout` instead. + navigation_timeout: Timeout for the navigation phase (from opening the page to calling + the request handler). Defaults to one minute. + kwargs: Additional keyword arguments forwarded to `BasicCrawler`. + """ + if browser_pool is not None: + self._validate_browser_pool(browser_pool) + if any( + param is not None + for param in ( + stagehand_options, + user_data_dir, + headless, + browser_launch_options, + browser_new_context_options, + ) + ): + raise ValueError( + 'Cannot specify `stagehand_options`, `user_data_dir`, `headless`, ' + '`browser_launch_options`, `browser_new_context_options` when `browser_pool` is provided.' + ) + else: + launch_options = dict(browser_launch_options or {}) + if headless is not None: + launch_options['headless'] = headless + + browser_pool = BrowserPool( + plugins=[ + StagehandBrowserPlugin( + stagehand_options=stagehand_options, + user_data_dir=user_data_dir, + browser_launch_options=launch_options or None, + browser_new_context_options=browser_new_context_options, + ) + ] + ) + + kwargs.setdefault('_logger', logging.getLogger(__name__)) + + super().__init__( + browser_pool=browser_pool, + goto_options=goto_options, + navigation_timeout=navigation_timeout, + **kwargs, + ) + + @staticmethod + def _validate_browser_pool(pool: BrowserPool) -> None: + invalid = [p for p in pool.plugins if not isinstance(p, StagehandBrowserPlugin)] + if invalid: + raise ValueError( + f'All BrowserPool plugins must be StagehandBrowserPlugin instances. 
Invalid plugins: {invalid}' + ) diff --git a/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py new file mode 100644 index 0000000000..645fdbd357 --- /dev/null +++ b/src/crawlee/crawlers/_stagehand/_stagehand_crawling_context.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass + +from crawlee._utils.docs import docs_group +from crawlee.browsers import StagehandPage +from crawlee.crawlers import ( + PlaywrightCrawlingContext, + PlaywrightPostNavCrawlingContext, + PlaywrightPreNavCrawlingContext, +) + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandPreNavCrawlingContext(PlaywrightPreNavCrawlingContext): + """The pre navigation crawling context used by the `StagehandCrawler`.""" + + page: StagehandPage + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandPostNavCrawlingContext(PlaywrightPostNavCrawlingContext, StagehandPreNavCrawlingContext): + """The post navigation crawling context used by the `StagehandCrawler`.""" + + +@dataclass(frozen=True) +@docs_group('Crawling contexts') +class StagehandCrawlingContext(PlaywrightCrawlingContext, StagehandPostNavCrawlingContext): + """The crawling context used by the `StagehandCrawler`.""" diff --git a/tests/unit/browsers/test_stagehand_browser_controller.py b/tests/unit/browsers/test_stagehand_browser_controller.py new file mode 100644 index 0000000000..fda9bf6a81 --- /dev/null +++ b/tests/unit/browsers/test_stagehand_browser_controller.py @@ -0,0 +1,353 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING, Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from playwright.async_api import Browser, Playwright, async_playwright +from stagehand import AsyncSession, AsyncStagehand + +from crawlee.browsers import StagehandBrowserController, StagehandOptions, StagehandPage +from 
crawlee.proxy_configuration import ProxyInfo + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from yarl import URL + + +@pytest.fixture +async def playwright() -> AsyncGenerator[Playwright, None]: + async with async_playwright() as playwright: + yield playwright + + +@pytest.fixture +async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]: + browser = await playwright.chromium.launch() + # Stagehand's CDP setup always provides a pre-existing context; simulate it. + await browser.new_context() + yield browser + await browser.close() + + +@pytest.fixture +def session_mock() -> MagicMock: + session = MagicMock(spec=AsyncSession) + session.data = MagicMock() + session.data.cdp_url = 'ws://fake-cdp' + session.end = AsyncMock() + return session + + +@pytest.fixture +def stagehand_client_mock(session_mock: MagicMock) -> MagicMock: + client = MagicMock(spec=AsyncStagehand) + client.sessions.start = AsyncMock(return_value=session_mock) + return client + + +@pytest.fixture +async def controller( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> AsyncGenerator[StagehandBrowserController, None]: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(), + max_open_pages_per_browser=2, + ) + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + yield controller + await controller.close(force=True) + + +async def test_initial_state(playwright: Playwright, session_mock: MagicMock) -> None: + stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(), + ) + + assert controller.pages == [] + assert controller.pages_count == 0 + assert isinstance(controller.last_page_opened_at, datetime) + 
assert controller.idle_time < timedelta(seconds=1) + assert controller.has_free_capacity + assert controller.is_browser_connected # True before any session is started + assert controller.browser_type == 'chromium' + + +async def test_open_and_close_page(controller: StagehandBrowserController) -> None: + page = await controller.new_page() + + assert isinstance(page, StagehandPage) + assert page in controller.pages + assert controller.pages_count == 1 + assert controller.last_page_opened_at <= datetime.now(timezone.utc) + + await page.close() + + assert page not in controller.pages + assert controller.pages_count == 0 + + +async def test_max_open_pages_limit(controller: StagehandBrowserController) -> None: + page1 = await controller.new_page() + assert controller.pages_count == 1 + + page2 = await controller.new_page() + assert controller.pages_count == 2 + + with pytest.raises(ValueError, match=r'Cannot open more pages in this browser.'): + await controller.new_page() + + assert controller.pages_count == 2 + + await page1.close() + assert controller.pages_count == 1 + + page3 = await controller.new_page() + assert controller.pages_count == 2 + + await page2.close() + await page3.close() + + assert controller.pages == [] + assert controller.pages_count == 0 + + +async def test_idle_time(controller: StagehandBrowserController) -> None: + idle_time_before = controller.idle_time + await asyncio.sleep(1) # Simulate waiting + idle_time_after = controller.idle_time + assert idle_time_after > idle_time_before + + +async def test_close_browser_with_open_pages( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, + session_mock: MagicMock, +) -> None: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(), + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + _ = await controller.new_page() + + with 
pytest.raises(ValueError, match=r'Cannot close the browser while there are open pages.'): + await controller.close() + + assert controller.pages_count == 1 + assert controller.is_browser_connected + + await controller.close(force=True) + + assert controller.pages_count == 0 + + assert session_mock.end.await_count == 1 + assert not controller.is_browser_connected + + +async def test_second_page_reuses_session( + controller: StagehandBrowserController, + stagehand_client_mock: MagicMock, +) -> None: + page1 = await controller.new_page() + page2 = await controller.new_page() + + assert stagehand_client_mock.sessions.start.await_count == 1 + assert page1.context == page2.context + assert page1 is not page2 # Different Page instances + + await page1.close() + await page2.close() + + +async def test_concurrent_session_init( + playwright: Playwright, + browser: Browser, + session_mock: MagicMock, +) -> None: + call_count = 0 + + async def delayed_start(**_kwargs: Any) -> Any: + nonlocal call_count + call_count += 1 + await asyncio.sleep(5) # Simulate delay in session start + return session_mock + + stagehand_client = MagicMock() + stagehand_client.sessions.start = delayed_start + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(), + max_open_pages_per_browser=5, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + pages = await asyncio.gather(controller.new_page(), controller.new_page()) + + # Only one session should be started despite concurrent calls to `new_page()`. 
+ assert call_count == 1 + + for page in pages: + await page.close() + await controller.close(force=True) + + +async def test_session_start_params_local( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> None: + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions( + env='LOCAL', + model='openai/gpt-4o', + verbose=1, + self_heal=False, + dom_settle_timeout_ms=500.0, + system_prompt='Test prompt', + ), + max_open_pages_per_browser=2, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + page = await controller.new_page() + await page.close() + await controller.close() + + call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs + assert call_kwargs['model_name'] == 'openai/gpt-4o' + assert call_kwargs['verbose'] == 1 + assert call_kwargs['self_heal'] is False + assert call_kwargs['dom_settle_timeout_ms'] == 500.0 + assert call_kwargs['system_prompt'] == 'Test prompt' + assert call_kwargs['browser'] == {'type': 'local', 'launch_options': {}} + + +async def test_session_start_params_browserbase( + playwright: Playwright, + browser: Browser, + stagehand_client_mock: MagicMock, +) -> None: + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client_mock, + stagehand_options=StagehandOptions(env='BROWSERBASE'), + max_open_pages_per_browser=2, + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + page = await controller.new_page() + await page.close() + await controller.close() + + call_kwargs = stagehand_client_mock.sessions.start.call_args.kwargs + assert call_kwargs['browser'] == {'type': 'browserbase', 'launch_options': {}} + assert 'browserbase_session_create_params' not in call_kwargs + + +async def test_proxy_set_local( + playwright: Playwright, + browser: Browser, + session_mock: 
MagicMock, +) -> None: + stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(env='LOCAL'), + max_open_pages_per_browser=2, + ) + + proxy = ProxyInfo( + url='http://proxy.example.com:8080', + scheme='http', + hostname='proxy.example.com', + port=8080, + username='user', + password='pass', + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + await controller.new_page(proxy_info=proxy) + await controller.close(force=True) + + call_kwargs = stagehand_client.sessions.start.call_args.kwargs + + browser_options = call_kwargs['browser'] + assert browser_options['type'] == 'local' + + proxy_options = browser_options['launch_options']['proxy'] + assert proxy_options['server'] == 'http://proxy.example.com:8080' + assert proxy_options['username'] == 'user' + assert proxy_options['password'] == 'pass' + + +async def test_proxy_set_browserbase( + playwright: Playwright, + browser: Browser, + session_mock: MagicMock, +) -> None: + stagehand_client = MagicMock() + stagehand_client.sessions.start = AsyncMock(return_value=session_mock) + + controller = StagehandBrowserController( + playwright=playwright, + stagehand_client=stagehand_client, + stagehand_options=StagehandOptions(env='BROWSERBASE'), + max_open_pages_per_browser=2, + ) + + proxy = ProxyInfo( + url='http://proxy.example.com:8080', + scheme='http', + hostname='proxy.example.com', + port=8080, + username='user', + password='pass', + ) + + with patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)): + await controller.new_page(proxy_info=proxy) + await controller.close(force=True) + + call_kwargs = stagehand_client.sessions.start.call_args.kwargs + + browserbase_proxy_options = call_kwargs['browserbase_session_create_params'] + assert 
browserbase_proxy_options['proxies'][0]['type'] == 'external' + assert browserbase_proxy_options['proxies'][0]['server'] == 'http://proxy.example.com:8080' + assert browserbase_proxy_options['proxies'][0]['username'] == 'user' + assert browserbase_proxy_options['proxies'][0]['password'] == 'pass' + + +async def test_fingerprint_headers_set_on_new_page(controller: StagehandBrowserController, server_url: URL) -> None: + page = await controller.new_page() + + response = await page.goto(str(server_url / 'headers')) + + assert response is not None + + response_json = await response.json() + + assert 'Headless' not in response_json['user-agent'] diff --git a/tests/unit/browsers/test_stagehand_browser_plugin.py b/tests/unit/browsers/test_stagehand_browser_plugin.py new file mode 100644 index 0000000000..40cec7b472 --- /dev/null +++ b/tests/unit/browsers/test_stagehand_browser_plugin.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from stagehand import AsyncStagehand + +from crawlee.browsers import StagehandBrowserController, StagehandBrowserPlugin, StagehandOptions + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +_PATCH_MODULE = 'crawlee.browsers._stagehand_browser_plugin' + + +@pytest.fixture +def stagehand_client_mock() -> MagicMock: + client = MagicMock(spec=AsyncStagehand) + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=None) + return client + + +@pytest.fixture +async def plugin() -> AsyncGenerator[StagehandBrowserPlugin, None]: + async with StagehandBrowserPlugin() as plugin: + yield plugin + + +def test_initial_state() -> None: + plugin = StagehandBrowserPlugin(max_open_pages_per_browser=5) + + assert plugin.active is False + assert plugin.browser_type == 'chromium' + assert plugin.max_open_pages_per_browser == 5 + + # headless should be True by default + assert 
plugin.browser_launch_options['headless'] is True + + +def test_implicit_set_options() -> None: + plugin = StagehandBrowserPlugin( + browser_new_context_options={'viewport': {'width': 1280, 'height': 720}}, + browser_launch_options={'headless': False}, + user_data_dir='./test', + ) + + assert plugin.browser_launch_options['headless'] is False + assert plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720} + assert plugin.browser_launch_options['user_data_dir'] == './test' + + +def test_order_priority_of_implicit_options() -> None: + # `browser_launch_options` takes priority over `browser_new_context_options` for shared keys, + # while non-conflicting keys from both dicts are merged. + plugin = StagehandBrowserPlugin( + browser_new_context_options={'headless': True, 'viewport': {'width': 1280, 'height': 720}}, + browser_launch_options={'headless': False}, + ) + + assert plugin.browser_launch_options['headless'] is False + assert plugin.browser_launch_options['viewport'] == {'width': 1280, 'height': 720} + + +def test_stagehand_options_defaults_when_not_provided() -> None: + plugin = StagehandBrowserPlugin() + + assert isinstance(plugin.stagehand_options, StagehandOptions) + assert plugin.stagehand_options == StagehandOptions() + + +async def test_stagehand_called_with_local_params(stagehand_client_mock: MagicMock) -> None: + with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock: + async with StagehandBrowserPlugin( + stagehand_options=StagehandOptions(env='LOCAL', local_ready_timeout_s=20.0, model_api_key='test_model_key') + ): + pass + + call_kwargs = stagehand_mock.call_args.kwargs + assert call_kwargs['server'] == 'local' + assert call_kwargs['local_ready_timeout_s'] == 20.0 + assert call_kwargs['model_api_key'] == 'test_model_key' + assert 'browserbase_api_key' not in call_kwargs + assert 'browserbase_project_id' not in call_kwargs + + # In local environment, the plugin should set 
`local_chrome_path` to the path of the Playwright Chromium executable. + assert 'local_chrome_path' in call_kwargs + + +async def test_stagehand_called_with_browserbase_params(stagehand_client_mock: MagicMock) -> None: + with patch(f'{_PATCH_MODULE}.AsyncStagehand', return_value=stagehand_client_mock) as stagehand_mock: + async with StagehandBrowserPlugin( + stagehand_options=StagehandOptions( + env='BROWSERBASE', + browserbase_api_key='test_key', + project_id='test_project_id', + model_api_key='test_model_key', + ) + ): + pass + + call_kwargs = stagehand_mock.call_args.kwargs + assert call_kwargs['server'] == 'remote' + assert call_kwargs['browserbase_api_key'] == 'test_key' + assert call_kwargs['browserbase_project_id'] == 'test_project_id' + assert call_kwargs['model_api_key'] == 'test_model_key' + + # In Browserbase environment, the plugin should NOT set `local_chrome_path` as it's not used. + assert 'local_chrome_path' not in call_kwargs + + +async def test_methods_raise_error_when_not_active() -> None: + plugin = StagehandBrowserPlugin() + + assert plugin.active is False + + with pytest.raises(RuntimeError, match=r'Plugin is not active'): + await plugin.new_browser() + + with pytest.raises(RuntimeError, match=r'Plugin is already active.'): + async with plugin, plugin: + pass + + async with plugin: + assert plugin.active is True + + +async def test_new_browser(plugin: StagehandBrowserPlugin) -> None: + browser_controller = await plugin.new_browser() + + assert isinstance(browser_controller, StagehandBrowserController) + + assert browser_controller.is_browser_connected + assert browser_controller.has_free_capacity + + await browser_controller.close() + + +async def test_multiple_new_browsers(plugin: StagehandBrowserPlugin) -> None: + browser_controller_1 = await plugin.new_browser() + browser_controller_2 = await plugin.new_browser() + + assert browser_controller_1 is not browser_controller_2 diff --git a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py 
b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py new file mode 100644 index 0000000000..d1455db5b5 --- /dev/null +++ b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from playwright.async_api import async_playwright +from stagehand import AsyncSession, AsyncStagehand + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserPlugin, + StagehandBrowserPlugin, + StagehandOptions, + StagehandPage, +) +from crawlee.crawlers import ( + PlaywrightCrawlingContext, + StagehandCrawler, + StagehandCrawlingContext, + StagehandPostNavCrawlingContext, + StagehandPreNavCrawlingContext, +) + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from yarl import URL + + +_PLUGIN_MODULE = 'crawlee.browsers._stagehand_browser_plugin' + + +@pytest.fixture +def stagehand_session_mock() -> MagicMock: + session = MagicMock(spec=AsyncSession) + session.data = MagicMock() + session.data.cdp_url = 'ws://fake-cdp' + session.end = AsyncMock() + return session + + +@pytest.fixture +async def patched_crawler(stagehand_session_mock: MagicMock) -> AsyncGenerator[StagehandCrawler, None]: + """StagehandCrawler with real Playwright but Stagehand session mocked.""" + stagehand_client = MagicMock(spec=AsyncStagehand) + stagehand_client.sessions.start = AsyncMock(return_value=stagehand_session_mock) + stagehand_client.__aenter__ = AsyncMock(return_value=stagehand_client) + stagehand_client.__aexit__ = AsyncMock() + + async with async_playwright() as playwright: + browser = await playwright.chromium.launch() + # Stagehand's CDP always provides a pre-existing context; simulate it. 
+ await browser.new_context() + + mock_playwright_cm = AsyncMock() + mock_playwright_cm.__aenter__ = AsyncMock(return_value=playwright) + mock_playwright_cm.__aexit__ = AsyncMock() + + with ( + patch(f'{_PLUGIN_MODULE}.AsyncStagehand', return_value=stagehand_client), + patch(f'{_PLUGIN_MODULE}.async_playwright', return_value=mock_playwright_cm), + patch.object(playwright.chromium, 'connect_over_cdp', AsyncMock(return_value=browser)), + ): + yield StagehandCrawler() + + +def test_init_raises_with_browser_pool_and_params() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin()]) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, stagehand_options=StagehandOptions()) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, headless=True) + + with pytest.raises(ValueError, match=r'Cannot specify'): + StagehandCrawler(browser_pool=pool, browser_launch_options={'headless': True}) + + +def test_init_raises_with_mixed_plugins_in_browser_pool() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin(), PlaywrightBrowserPlugin()]) + + with pytest.raises(ValueError, match=r'All BrowserPool plugins must be StagehandBrowserPlugin instances'): + StagehandCrawler(browser_pool=pool) + + +def test_init_with_valid_browser_pool() -> None: + pool = BrowserPool(plugins=[StagehandBrowserPlugin()]) + StagehandCrawler(browser_pool=pool) + + +async def test_context_type(patched_crawler: StagehandCrawler, server_url: URL) -> None: + handler_mock = AsyncMock() + pre_navigation_hook_mock = AsyncMock() + post_navigation_hook_mock = AsyncMock() + patched_crawler.pre_navigation_hook(pre_navigation_hook_mock) + patched_crawler.post_navigation_hook(post_navigation_hook_mock) + patched_crawler.router.default_handler(handler_mock) + + await patched_crawler.run([str(server_url)]) + + context = handler_mock.call_args[0][0] + assert isinstance(context, StagehandCrawlingContext) + assert 
isinstance(context, PlaywrightCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_pre_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None: + pre_nav_hook_mock = AsyncMock() + patched_crawler.pre_navigation_hook(pre_nav_hook_mock) + patched_crawler.router.default_handler(AsyncMock()) + + await patched_crawler.run([str(server_url)]) + + pre_nav_hook_mock.assert_called_once() + context = pre_nav_hook_mock.call_args[0][0] + assert isinstance(context, StagehandPreNavCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_post_navigation_hook(patched_crawler: StagehandCrawler, server_url: URL) -> None: + post_nav_hook_mock = AsyncMock() + patched_crawler.post_navigation_hook(post_nav_hook_mock) + patched_crawler.router.default_handler(AsyncMock()) + + await patched_crawler.run([str(server_url)]) + + post_nav_hook_mock.assert_called_once() + context = post_nav_hook_mock.call_args[0][0] + assert isinstance(context, StagehandPostNavCrawlingContext) + assert isinstance(context.page, StagehandPage) + + +async def test_stagehand_page_ai_methods_delegate_to_session( + patched_crawler: StagehandCrawler, + stagehand_session_mock: MagicMock, + server_url: URL, +) -> None: + @patched_crawler.router.default_handler + async def handler(context: StagehandCrawlingContext) -> None: + await context.page.act(input='click button') + await context.page.extract(instruction='get title') + await context.page.observe(instruction='find links') + await context.page.execute(agent_config={}, execute_options={'instruction': 'run script'}) + + await patched_crawler.run([str(server_url)]) + + for method_name, argument in ( + ('act', 'input'), + ('extract', 'instruction'), + ('observe', 'instruction'), + ('execute', 'execute_options'), + ): + method_mock = getattr(stagehand_session_mock, method_name) + method_mock.assert_awaited_once() + assert isinstance(method_mock.call_args.kwargs['page'], StagehandPage) + + 
assert argument in method_mock.call_args.kwargs diff --git a/uv.lock b/uv.lock index a9e945e853..4c8d26d318 100644 --- a/uv.lock +++ b/uv.lock @@ -825,6 +825,7 @@ all = [ { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "sqlalchemy", extra = ["asyncio"] }, + { name = "stagehand" }, { name = "typer" }, { name = "wrapt" }, ] @@ -879,6 +880,12 @@ sql-sqlite = [ { name = "aiosqlite" }, { name = "sqlalchemy", extra = ["asyncio"] }, ] +stagehand = [ + { name = "apify-fingerprint-datapoints" }, + { name = "browserforge" }, + { name = "playwright" }, + { name = "stagehand" }, +] [package.dev-dependencies] dev = [ @@ -916,16 +923,18 @@ requires-dist = [ { name = "apify-fingerprint-datapoints", marker = "extra == 'adaptive-crawler'", specifier = ">=0.0.3" }, { name = "apify-fingerprint-datapoints", marker = "extra == 'httpx'", specifier = ">=0.0.2" }, { name = "apify-fingerprint-datapoints", marker = "extra == 'playwright'", specifier = ">=0.0.2" }, + { name = "apify-fingerprint-datapoints", marker = "extra == 'stagehand'", specifier = ">=0.0.2" }, { name = "async-timeout", specifier = ">=5.0.1" }, { name = "asyncpg", marker = "extra == 'sql-postgres'", specifier = ">=0.24.0" }, { name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'beautifulsoup'", specifier = ">=4.12.0" }, { name = "browserforge", marker = "extra == 'adaptive-crawler'", specifier = ">=1.2.4" }, { name = "browserforge", marker = "extra == 'httpx'", specifier = ">=1.2.3" }, { name = "browserforge", marker = "extra == 'playwright'", specifier = ">=1.2.3" }, + { name = "browserforge", marker = "extra == 'stagehand'", specifier = ">=1.2.3" }, { name = "cachetools", specifier = ">=5.5.0" }, { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = 
"extra == 'cli'", specifier = ">=2.6.0" }, - { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "redis"], marker = "extra == 'all'" }, + { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "stagehand", "redis"], marker = "extra == 'all'" }, { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, @@ -943,6 +952,7 @@ requires-dist = [ { name = "parsel", marker = "extra == 'parsel'", specifier = ">=1.10.0" }, { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" }, + { name = "playwright", marker = "extra == 'stagehand'", specifier = ">=1.27.0" }, { name = "protego", specifier = ">=0.5.0" }, { name = "psutil", specifier = ">=6.0.0" }, { name = "pydantic", specifier = ">=2.11.0" }, @@ -954,13 +964,14 @@ requires-dist = [ { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-mysql'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-postgres'", specifier = ">=2.0.0,<3.0.0" }, { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" }, + { name = "stagehand", marker = "extra == 'stagehand'", specifier = ">=3.19.5" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, { name = "typing-extensions", specifier = ">=4.1.0" }, { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", 
"adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "sql-sqlite", "sql-mysql", "redis"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-postgres", "stagehand", "sql-sqlite", "sql-mysql", "redis"] [package.metadata.requires-dev] dev = [ @@ -1154,6 +1165,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + [[package]] name = "docspec" version = "2.2.1" @@ -3712,6 +3732,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3795,6 +3824,26 @@ asyncio = [ { name = "greenlet" }, ] +[[package]] +name = "stagehand" +version = "3.19.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/f8/ccd2bb2758a4eaf0af3846e097ff206e0aa76c8d3b5aa2bded77fb47825e/stagehand-3.19.5.tar.gz", hash = "sha256:3cb8279ac82051e584b34d26e87dc764f0ccad766a01625198ca578eb35f0b6c", size = 281033, upload-time = "2026-04-03T20:21:09.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/6f/a47bad258bfafc193ebb8e0e8c440e8028c9ab28b54a333b46aa3c0cff53/stagehand-3.19.5-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:14f39a4f8d30d77c089166185c705f66aade25432b903a663a937b3747439c26", size = 34495874, upload-time = "2026-04-03T20:21:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/72/f7/e39868903121f1a80ae6eda088383362cd2d3a578c04493a2f83c1aac1da/stagehand-3.19.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:80ed0d732cb9c3e952ad851e071dad5775a9ea88d2787c006289d61097fd2609", size = 33193535, upload-time = "2026-04-03T20:21:18.536Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/0b/35cb92bb53e9539c0147892dbd0a227b43bf0d8adcd0a8e867dc5f2bf7fd/stagehand-3.19.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:aa947a5f6241f5953ac238cd9b0ab72e0cb87f559f97e5ee875f83dbc0c351d1", size = 37273148, upload-time = "2026-04-03T20:21:11.939Z" }, + { url = "https://files.pythonhosted.org/packages/7c/c7/dccf63cba1941b5710dc9968218e2883a937cf6534d644bb0c5222d3f40a/stagehand-3.19.5-py3-none-win_amd64.whl", hash = "sha256:e37bf630b99b4a9b7d95f151c56b296940db88b3049b68f0abb56f9e31cc6095", size = 30758357, upload-time = "2026-04-03T20:21:15.121Z" }, +] + [[package]] name = "text-unidecode" version = "1.3"