From 5543019a7e15cab0c35457e8bcbb7153efdf78f0 Mon Sep 17 00:00:00 2001 From: junhsss Date: Thu, 4 Sep 2025 20:27:24 +0900 Subject: [PATCH] feat: add SteelScrapeWebsiteTool for scraping websites --- crewai_tools/__init__.py | 1 + crewai_tools/tools/__init__.py | 1 + .../tools/steel_scrape_website_tool/README.md | 30 +++++ .../steel_scrape_website_tool/__init__.py | 3 + .../steel_scrape_website_tool.py | 83 +++++++++++++ pyproject.toml | 4 + tests/tools/test_steel_scrape_website_tool.py | 66 +++++++++++ tool.specs.json | 112 ++++++++++++++++++ 8 files changed, 300 insertions(+) create mode 100644 crewai_tools/tools/steel_scrape_website_tool/README.md create mode 100644 crewai_tools/tools/steel_scrape_website_tool/__init__.py create mode 100644 crewai_tools/tools/steel_scrape_website_tool/steel_scrape_website_tool.py create mode 100644 tests/tools/test_steel_scrape_website_tool.py diff --git a/crewai_tools/__init__.py b/crewai_tools/__init__.py index f4c03ba0..f5dbbbf1 100644 --- a/crewai_tools/__init__.py +++ b/crewai_tools/__init__.py @@ -83,6 +83,7 @@ SnowflakeSearchTool, SpiderTool, StagehandTool, + SteelScrapeWebsiteTool, TavilyExtractorTool, TavilySearchTool, TXTSearchTool, diff --git a/crewai_tools/tools/__init__.py b/crewai_tools/tools/__init__.py index bf1a166d..75d46058 100644 --- a/crewai_tools/tools/__init__.py +++ b/crewai_tools/tools/__init__.py @@ -109,6 +109,7 @@ ) from .spider_tool.spider_tool import SpiderTool from .stagehand_tool.stagehand_tool import StagehandTool +from .steel_scrape_website_tool.steel_scrape_website_tool import SteelScrapeWebsiteTool from .tavily_extractor_tool.tavily_extractor_tool import TavilyExtractorTool from .tavily_search_tool.tavily_search_tool import TavilySearchTool from .txt_search_tool.txt_search_tool import TXTSearchTool diff --git a/crewai_tools/tools/steel_scrape_website_tool/README.md b/crewai_tools/tools/steel_scrape_website_tool/README.md new file mode 100644 index 00000000..6544fed8 --- /dev/null +++ 
b/crewai_tools/tools/steel_scrape_website_tool/README.md @@ -0,0 +1,30 @@ +# SteelScrapeWebsiteTool + +## Description + +[Steel](https://steel.dev) is an open-source browser API that makes it easy to build AI apps and agents that interact with the web. Instead of building automation infrastructure from scratch, you can focus on your AI application while Steel handles the complexity. + +## Installation + +- Get an API key from [steel.dev](https://app.steel.dev) and set it in environment variables (`STEEL_API_KEY`). +- Install the [Steel SDK](https://github.com/steel-dev/steel-python) along with `crewai[tools]`: + +```bash +pip install steel-sdk 'crewai[tools]' +``` + +## Example + +Use the SteelScrapeWebsiteTool as follows to allow your agent to load websites: + +```python +from crewai_tools import SteelScrapeWebsiteTool + +tool = SteelScrapeWebsiteTool(formats=["markdown"], proxy=True) +``` + +## Arguments + +- `api_key` Optional. Steel API key. Default is `STEEL_API_KEY` env variable. +- `formats` Optional[List[str]]. Content formats to return. Default: `["markdown"]`. +- `proxy` Optional. Enable/Disable proxies. 
"""CrewAI tool that scrapes web pages through the Steel API."""

import os
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type

from crewai.tools import BaseTool, EnvVar
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr

if TYPE_CHECKING:
    from steel import Steel

# Record once, at import time, whether the optional Steel SDK is importable.
try:
    from steel import Steel

    STEEL_AVAILABLE = True
except ImportError:
    STEEL_AVAILABLE = False


def _load_steel_class() -> Type["Steel"]:
    """Return the ``Steel`` client class, offering to install the SDK if absent.

    The import is performed at call time (not reused from module import) so
    that an SDK installed after this module was loaded is still picked up.

    Raises:
        ImportError: If ``steel-sdk`` is missing and the user declines the
            interactive install prompt.
        subprocess.CalledProcessError: If the ``uv add steel-sdk`` command
            exits with a non-zero status.
    """
    try:
        from steel import Steel  # type: ignore
    except ImportError:
        import click

        if not click.confirm(
            "You are missing the 'steel-sdk' package. Would you like to install it?"
        ):
            raise ImportError(
                "`steel-sdk` package not found, please run `uv add steel-sdk`"
            )

        import subprocess

        subprocess.run(["uv", "add", "steel-sdk"], check=True)
        from steel import Steel  # type: ignore

    return Steel


class SteelScrapeWebsiteToolSchema(BaseModel):
    """Run-time arguments accepted by :class:`SteelScrapeWebsiteTool`."""

    url: str = Field(description="Website URL")


class SteelScrapeWebsiteTool(BaseTool):
    """Scrape webpages using Steel and return the contents.

    Attributes are configured at construction time:

    * ``api_key`` -- Steel API key; falls back to the ``STEEL_API_KEY``
      environment variable.
    * ``formats`` -- content formats to request; defaults to ``["markdown"]``.
    * ``proxy`` -- forwarded to the SDK as ``use_proxy``; left out of the
      request entirely when ``None``.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=True, validate_assignment=True, frozen=False
    )
    name: str = "Steel web scrape tool"
    description: str = "Scrape webpages using Steel and return the contents"
    args_schema: Type[BaseModel] = SteelScrapeWebsiteToolSchema
    api_key: Optional[str] = None
    formats: Optional[List[str]] = None
    proxy: Optional[bool] = None

    # Steel client instance; created in ``__init__``.
    _steel: Optional["Steel"] = PrivateAttr(None)
    package_dependencies: List[str] = ["steel-sdk"]
    env_vars: List[EnvVar] = [
        EnvVar(
            name="STEEL_API_KEY",
            description="API key for Steel services",
            required=True,
        ),
    ]

    def __init__(
        self,
        api_key: Optional[str] = None,
        formats: Optional[List[str]] = None,
        proxy: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
        """Create the tool and its Steel client.

        Args:
            api_key: Steel API key. When falsy, ``STEEL_API_KEY`` is read
                from the environment instead.
            formats: Content formats to request. Defaults to ``["markdown"]``
                when omitted or empty.
            proxy: Value forwarded as ``use_proxy`` on each scrape, or
                ``None`` to leave the choice to the SDK/service.
            **kwargs: Passed through to ``BaseTool.__init__``.

        Raises:
            EnvironmentError: If no API key is given and ``STEEL_API_KEY``
                is unset or empty.
            ImportError: If ``steel-sdk`` is unavailable and not installed
                on request.
        """
        super().__init__(**kwargs)
        self.api_key = api_key or os.getenv("STEEL_API_KEY")
        if not self.api_key:
            raise EnvironmentError(
                "STEEL_API_KEY environment variable or api_key is required"
            )

        steel_cls = _load_steel_class()
        self._steel = steel_cls(steel_api_key=self.api_key)
        self.formats = formats or ["markdown"]
        self.proxy = proxy

    def _run(self, url: str) -> Any:
        """Scrape ``url`` with the configured formats and proxy setting.

        Args:
            url: Address of the page to scrape.

        Returns:
            Whatever the Steel client's ``scrape`` call returns, unmodified.

        Raises:
            RuntimeError: If the Steel client has not been created.
        """
        if self._steel is None:
            raise RuntimeError("Steel not properly initialized")

        # Only forward options that were actually configured: an unset option
        # is omitted rather than sent as an explicit ``None``.
        # NOTE(review): assumes the SDK distinguishes "omitted" from ``None``
        # for these parameters -- confirm against the steel-sdk signature.
        configured: Dict[str, Any] = {
            "use_proxy": self.proxy,
            "format": self.formats,
        }
        options = {key: value for key, value in configured.items() if value is not None}
        return self._steel.scrape(url=url, **options)


# Rebuild the model once when the SDK is importable (presumably so pydantic can
# resolve the "Steel" forward reference); the flag guards against repeating it.
if STEEL_AVAILABLE and not hasattr(SteelScrapeWebsiteTool, "_model_rebuilt"):
    SteelScrapeWebsiteTool.model_rebuild()
    SteelScrapeWebsiteTool._model_rebuilt = True
"""Unit tests for ``SteelScrapeWebsiteTool`` using a mocked ``steel`` module."""

import os
import sys
from unittest.mock import MagicMock, patch

import pytest

from crewai_tools.tools.steel_scrape_website_tool.steel_scrape_website_tool import (
    SteelScrapeWebsiteTool,
)


@pytest.fixture(autouse=True)
def mock_env_api_key():
    """Provide a dummy ``STEEL_API_KEY`` in the environment for every test."""
    fake_env = {"STEEL_API_KEY": "test_key"}
    with patch.dict(os.environ, fake_env):
        yield


@pytest.fixture(autouse=True)
def mock_steel_module():
    """Swap ``sys.modules["steel"]`` for a mock and yield its ``Steel`` class.

    The previous entry (if any) is put back afterwards; otherwise the mock
    entry is removed again.
    """
    fake_module = MagicMock()
    fake_module.Steel = MagicMock()
    previous = sys.modules.get("steel")
    sys.modules["steel"] = fake_module
    try:
        yield fake_module.Steel
    finally:
        if previous is None:
            del sys.modules["steel"]
        else:
            sys.modules["steel"] = previous


def test_init_prefers_arg_over_env():
    """An explicit ``api_key`` wins over the environment variable."""
    scraper = SteelScrapeWebsiteTool(api_key="arg_key")
    assert scraper.api_key == "arg_key"


def test_init_raises_without_api_key(monkeypatch):
    """Construction fails when neither argument nor environment has a key."""
    monkeypatch.delenv("STEEL_API_KEY", raising=False)
    with pytest.raises(EnvironmentError):
        SteelScrapeWebsiteTool()


def test_run_success(mock_steel_module):
    """``_run`` returns the client's result and forwards the configured options."""
    client = mock_steel_module.return_value
    client.scrape.return_value = {"markdown": "Hello"}

    scraper = SteelScrapeWebsiteTool(api_key="k", formats=["markdown"], proxy=True)
    outcome = scraper._run("https://example.com")

    assert outcome == {"markdown": "Hello"}
    client.scrape.assert_called_once_with(
        url="https://example.com", use_proxy=True, format=["markdown"]
    )


def test_run_raises_when_not_initialized(monkeypatch):
    """``_run`` refuses to work once the client has been cleared."""
    scraper = SteelScrapeWebsiteTool(api_key="k")
    scraper._steel = None
    with pytest.raises(RuntimeError):
        scraper._run("https://example.com")


def test_defaults_and_env():
    """With no arguments the key comes from the environment and defaults apply."""
    scraper = SteelScrapeWebsiteTool()
    assert scraper.api_key == "test_key"
    assert scraper.formats == ["markdown"]
    assert scraper.proxy is None
"anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Default" + }, + "description": { + "title": "Description", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "required": { + "default": true, + "title": "Required", + "type": "boolean" + } + }, + "required": [ + "name", + "description" + ], + "title": "EnvVar", + "type": "object" + } + }, + "properties": { + "api_key": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Api Key" + }, + "formats": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Formats" + }, + "proxy": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Proxy" + } + }, + "title": "SteelScrapeWebsiteTool", + "type": "object" + }, + "name": "SteelScrapeWebsiteTool", + "package_dependencies": [ + "steel-sdk" + ], + "run_params_schema": { + "properties": { + "url": { + "description": "Website URL", + "title": "Url", + "type": "string" + } + }, + "required": [ + "url" + ], + "title": "SteelScrapeWebsiteToolSchema", + "type": "object" + } + }, { "description": "A tool that can be used to semantic search a query from a txt's content.", "env_vars": [],