From 40d642f8e561622bea83c2a0db80e291f05dfa24 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 09:24:04 +0100
Subject: [PATCH 01/15] Crawl4AI integration WIP

---
 autogen/tools/__init__.py |  2 +-
 autogen/tools/crawl4ai.py | 25 +++++++++++++++++++++++++
 pyproject.toml            |  5 +++++
 setup_ag2.py              |  1 +
 setup_autogen.py          |  1 +
 5 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 autogen/tools/crawl4ai.py

diff --git a/autogen/tools/__init__.py b/autogen/tools/__init__.py
index ecb29373ff..04542b4d3b 100644
--- a/autogen/tools/__init__.py
+++ b/autogen/tools/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
 #
 # SPDX-License-Identifier: Apache-2.0

diff --git a/autogen/tools/crawl4ai.py b/autogen/tools/crawl4ai.py
new file mode 100644
index 0000000000..445dcc8906
--- /dev/null
+++ b/autogen/tools/crawl4ai.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+from crawl4ai import AsyncWebCrawler
+
+from . import Tool
+
+
+class Crawl4AI(Tool):
+    def __init__(self) -> None:
+        async def crawl4ai(url: str) -> Any:
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(
+                    url=url,
+                )
+                return result.markdown()
+
+        super().__init__(
+            name="crawl4ai",
+            description="Crawl a website and extract information.",
+            func_or_tool=crawl4ai,
+        )

diff --git a/pyproject.toml b/pyproject.toml
index 7b6d6b8393..eb79015355 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -124,6 +124,11 @@ rag = [
     "webdriver-manager==4.0.2",
 ]
 
+
+crawl4ai = [
+    "crawl4ai>=0.4.247,<0.5",
+]
+
 neo4j = [
     "docx2txt==0.8",
     "llama-index==0.12.11",

diff --git a/setup_ag2.py b/setup_ag2.py
index 2e17b70c61..b5b0c6afbc 100644
--- a/setup_ag2.py
+++ b/setup_ag2.py
@@ -36,6 +36,7 @@
     "retrievechat-qdrant": ["pyautogen[retrievechat-qdrant]==" + __version__],
     "graph-rag-falkor-db": ["pyautogen[graph-rag-falkor-db]==" + __version__],
     "rag": ["pyautogen[rag]==" + __version__],
+    "crawl4ai": ["pyautogen[crawl4ai]==" + __version__],
     "neo4j": ["pyautogen[neo4j]==" + __version__],
     "twilio": ["pyautogen[twilio]==" + __version__],
     "interop-crewai": ["pyautogen[interop-crewai]==" + __version__],

diff --git a/setup_autogen.py b/setup_autogen.py
index 9f166ee2e0..3062684b42 100644
--- a/setup_autogen.py
+++ b/setup_autogen.py
@@ -36,6 +36,7 @@
     "retrievechat-qdrant": ["pyautogen[retrievechat-qdrant]==" + __version__],
     "graph-rag-falkor-db": ["pyautogen[graph-rag-falkor-db]==" + __version__],
     "rag": ["pyautogen[rag]==" + __version__],
+    "crawl4ai": ["pyautogen[crawl4ai]==" + __version__],
    "neo4j": ["pyautogen[neo4j]==" + __version__],
     "twilio": ["pyautogen[twilio]==" + __version__],
     "interop-crewai": ["pyautogen[interop-crewai]==" + __version__],
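A minimal sketch of how the tool added in PATCH 01 is meant to be wired up. This mirrors the notebook added later in the series (PATCH 12) and assumes AG2's standard tool-registration API plus an OPENAI_API_KEY in the environment:

    import os

    from autogen import AssistantAgent, UserProxyAgent
    from autogen.tools.crawl4ai import Crawl4AI

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": os.environ["OPENAI_API_KEY"]}]}

    user_proxy = UserProxyAgent(name="user_proxy", human_input_mode="NEVER")
    assistant = AssistantAgent(name="assistant", llm_config=llm_config)

    # Expose the crawler to the LLM as a callable function; user_proxy executes it.
    tool = Crawl4AI()
    tool.register_for_execution(user_proxy)
    tool.register_for_llm(assistant)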
From 565b56aa77e52d78f62ab5c9ddf528cd6a78966e Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 14:26:48 +0100
Subject: [PATCH 02/15] Crawl4AI integration WIP

---
 autogen/tools/crawl4ai.py   |  8 +++++---
 test/tools/test_crawl4ai.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 test/tools/test_crawl4ai.py

diff --git a/autogen/tools/crawl4ai.py b/autogen/tools/crawl4ai.py
index 445dcc8906..2af58fbdda 100644
--- a/autogen/tools/crawl4ai.py
+++ b/autogen/tools/crawl4ai.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any
+from typing import Annotated, Any
 
 from crawl4ai import AsyncWebCrawler
 
 from . import Tool
 
 
-class Crawl4AI(Tool):
+class Crawl4AITool(Tool):
     def __init__(self) -> None:
-        async def crawl4ai(url: str) -> Any:
+        async def crawl4ai(
+            url: Annotated[str, "The url to crawl and extract information from."],
+        ) -> Any:
             async with AsyncWebCrawler() as crawler:
                 result = await crawler.arun(
                     url=url,

diff --git a/test/tools/test_crawl4ai.py b/test/tools/test_crawl4ai.py
new file mode 100644
index 0000000000..2a681bd4a1
--- /dev/null
+++ b/test/tools/test_crawl4ai.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+from autogen.tools.crawl4ai import Crawl4AITool
+
+
+class TestCrawl4AITool:
+    def test__init__(self):
+        tool = Crawl4AITool()
+
+        assert isinstance(tool, Crawl4AITool)
+        assert tool.name == "crawl4ai"
+        assert tool.description == "Crawl a website and extract information."
+        assert callable(tool.func)
+        expected_schema = {
+            "function": {
+                "description": "Crawl a website and extract information.",
+                "name": "crawl4ai",
+                "parameters": {
+                    "properties": {
+                        "url": {"description": "The url to crawl and extract information from.", "type": "string"}
+                    },
+                    "required": ["url"],
+                    "type": "object",
+                },
+            },
+            "type": "function",
+        }
+        assert tool.tool_schema == expected_schema
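PATCH 03 below moves the tool under autogen.tools.experimental and wraps the crawl4ai import in optional_import_block. A small sketch of that guard pattern, assuming (as the helper's name suggests) that it suppresses the ImportError when the optional extra is not installed:

    from autogen.import_utils import optional_import_block

    with optional_import_block():
        # Resolved only when the optional "crawl4ai" extra is installed; otherwise
        # the import error is swallowed, the name stays undefined, and guarded
        # call sites must not be reached.
        from crawl4ai import AsyncWebCrawler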
From d57644a1120f65094869b6bb5e0a5789d2d94006 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 15:57:47 +0100
Subject: [PATCH 03/15] Crawl4AI integration WIP

---
 autogen/tools/experimental/__init__.py            |  3 ++-
 autogen/tools/experimental/crawl4ai/crawl4ai.py   | 10 +++++++---
 pyproject.toml                                    |  1 +
 test/tools/experimental/crawl4ai/test_crawl4ai.py | 12 ++++++++++++
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/autogen/tools/experimental/__init__.py b/autogen/tools/experimental/__init__.py
index b36378767f..ef7aa570fc 100644
--- a/autogen/tools/experimental/__init__.py
+++ b/autogen/tools/experimental/__init__.py
@@ -3,5 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .browser_use import BrowserUseTool
+from .crawl4ai import Crawl4AITool
 
-__all__ = ["BrowserUseTool"]
+__all__ = ["BrowserUseTool", "Crawl4AITool"]

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index 16713d895b..b712eef16d 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -4,10 +4,14 @@
 
 from typing import Annotated, Any
 
-from crawl4ai import AsyncWebCrawler
-
+from ....import_utils import optional_import_block
 from ... import Tool
 
+with optional_import_block():
+    from crawl4ai import AsyncWebCrawler
+
+__all__ = ["Crawl4AITool"]
+
 
 class Crawl4AITool(Tool):
     def __init__(self) -> None:
@@ -18,7 +22,7 @@ async def crawl4ai(
                 result = await crawler.arun(
                     url=url,
                 )
-                return result.markdown()
+                return result.markdown
 
         super().__init__(
             name="crawl4ai",

diff --git a/pyproject.toml b/pyproject.toml
index 01a6106183..89cdaaa98e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -275,6 +275,7 @@ markers = [
     "redis",
     "docker",
     "browser_use",
+    "crawl4ai",
 ]
 
 [tool.black]

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 0a1251895b..884dac1fc3 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -3,10 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
+import pytest
+
+from autogen.import_utils import optional_import_block, skip_on_missing_imports
+
+with optional_import_block():
+    from crawl4ai import AsyncWebCrawler
+
 from autogen.tools.experimental.crawl4ai import Crawl4AITool
 
 
+@pytest.mark.crawl4ai  # todo: remove me after we merge the PR that adds it automatically
+@skip_on_missing_imports(["crawl4ai"], "crawl4ai")
 class TestCrawl4AITool:
+    def _use_imports(self) -> None:
+        self._AsyncWebCrawler = AsyncWebCrawler
+
     def test__init__(self) -> None:
         tool = Crawl4AITool()

From 117a49f3da4667bf3cd949b8077897b0f80ab500 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 15:58:15 +0100
Subject: [PATCH 04/15] Crawl4AI integration WIP

---
 .../experimental/crawl4ai/test_crawl4ai.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 884dac1fc3..67b092d79b 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -19,13 +19,13 @@ class TestCrawl4AITool:
     def _use_imports(self) -> None:
         self._AsyncWebCrawler = AsyncWebCrawler
 
-    def test__init__(self) -> None:
-        tool = Crawl4AITool()
-
-        assert isinstance(tool, Crawl4AITool)
-        assert tool.name == "crawl4ai"
-        assert tool.description == "Crawl a website and extract information."
-        assert callable(tool.func)
+    @pytest.mark.asyncio
+    async def test__init__(self) -> None:
+        tool_without_llm = Crawl4AITool()
+        assert isinstance(tool_without_llm, Crawl4AITool)
+        assert tool_without_llm.name == "crawl4ai"
+        assert tool_without_llm.description == "Crawl a website and extract information."
+ assert callable(tool_without_llm.func) expected_schema = { "function": { "description": "Crawl a website and extract information.", @@ -40,4 +40,7 @@ def test__init__(self) -> None: }, "type": "function", } - assert tool.tool_schema == expected_schema + assert tool_without_llm.tool_schema == expected_schema + + result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") + assert isinstance(result, str) From a8c0fa6a67499f38585fb8c8af3aad7b886588d5 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Tue, 28 Jan 2025 13:45:55 +0100 Subject: [PATCH 05/15] Refactoring --- autogen/tools/dependency_injection.py | 13 ++++++++++++- .../tools/experimental/browser_use/browser_use.py | 13 ++----------- autogen/tools/experimental/crawl4ai/crawl4ai.py | 11 +++++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/autogen/tools/dependency_injection.py b/autogen/tools/dependency_injection.py index af70eabb7b..078895a07b 100644 --- a/autogen/tools/dependency_injection.py +++ b/autogen/tools/dependency_injection.py @@ -7,7 +7,7 @@ from abc import ABC from collections.abc import Iterable from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union, get_type_hints from fast_depends import Depends as FastDepends from fast_depends import inject @@ -26,6 +26,7 @@ "Field", "get_context_params", "inject_params", + "on", ] @@ -75,6 +76,16 @@ def last_message(self) -> Optional[dict[str, Any]]: return self._agent.last_message() +T = TypeVar("T") + + +def on(x: T) -> Callable[[], T]: + def inner(_x: T = x) -> T: + return _x + + return inner + + @export_module("autogen.tools") def Depends(x: Any) -> Any: # noqa: N802 """Creates a dependency for injection based on the provided context or type. diff --git a/autogen/tools/experimental/browser_use/browser_use.py b/autogen/tools/experimental/browser_use/browser_use.py index 92d002c4bd..e83e7c3e83 100644 --- a/autogen/tools/experimental/browser_use/browser_use.py +++ b/autogen/tools/experimental/browser_use/browser_use.py @@ -2,12 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Annotated, Any, Callable, Optional, TypeVar +from typing import Annotated, Any, Optional from pydantic import BaseModel from ....import_utils import optional_import_block, require_optional_import from ... import Depends, Tool +from ...dependency_injection import on with optional_import_block(): from browser_use import Agent @@ -33,16 +34,6 @@ class BrowserUseResult(BaseModel): final_result: Optional[str] -T = TypeVar("T") - - -def on(x: T) -> Callable[[], T]: - def inner(_x: T = x) -> T: - return _x - - return inner - - @require_optional_import(["langchain_openai", "browser_use"], "browser-use") # todo: add export_module decorator # @export_module("autogen.tools.experimental") diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index b712eef16d..f506703be7 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -2,10 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Annotated, Any +from typing import Annotated, Any, Optional from ....import_utils import optional_import_block from ... 
import Tool +from ...dependency_injection import Depends, on with optional_import_block(): from crawl4ai import AsyncWebCrawler @@ -14,11 +15,17 @@ class Crawl4AITool(Tool): - def __init__(self) -> None: + def __init__( + self, + llm_config: Optional[dict[str, Any]] = None, + ) -> None: async def crawl4ai( url: Annotated[str, "The url to crawl and extract information from."], + llm_config: Annotated[Optional[dict[str, Any]], Depends[on(llm_config)]], ) -> Any: async with AsyncWebCrawler() as crawler: + if llm_config is not None: + raise ValueError("llm_config is not supported yet.") result = await crawler.arun( url=url, ) From 220c554c4433013fdc7585949179a254336044a8 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Tue, 28 Jan 2025 14:18:02 +0100 Subject: [PATCH 06/15] crawl4ai with LLMs WIP --- .../tools/experimental/crawl4ai/crawl4ai.py | 61 +++++++++++++++++-- .../experimental/crawl4ai/test_crawl4ai.py | 23 +++++-- 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index f506703be7..3cd8d068d3 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import os from typing import Annotated, Any, Optional from ....import_utils import optional_import_block @@ -9,7 +10,8 @@ from ...dependency_injection import Depends, on with optional_import_block(): - from crawl4ai import AsyncWebCrawler + from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig + from crawl4ai.extraction_strategy import LLMExtractionStrategy __all__ = ["Crawl4AITool"] @@ -21,18 +23,65 @@ def __init__( ) -> None: async def crawl4ai( url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends[on(llm_config)]], + llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], ) -> Any: - async with AsyncWebCrawler() as crawler: - if llm_config is not None: - raise ValueError("llm_config is not supported yet.") + if llm_config is not None: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_llm_strategy(llm_config) + else: + browser_cfg = None + crawl_config = None + + async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( url=url, + config=crawl_config, ) - return result.markdown + if result.success: + return result.extracted_content + + return result.error_message super().__init__( name="crawl4ai", description="Crawl a website and extract information.", func_or_tool=crawl4ai, ) + + @staticmethod + def _get_llm_strategy(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] + if "config_list" not in llm_config: + if "model" in llm_config: + model = llm_config["model"] + api_type = "openai" + api_key = os.getenv("OPENAI_API_KEY") + raise ValueError("llm_config must be a valid config dictionary.") + else: + try: + model = llm_config["config_list"][0]["model"] + api_type = llm_config["config_list"][0].get("api_type", "openai") + api_key = llm_config["config_list"][0]["api_key"] + + except (KeyError, TypeError): + raise ValueError("llm_config must be a valid config dictionary.") + + provider = f"{api_type}/{model}" + + # 1. 
Define the LLM extraction strategy + llm_strategy = LLMExtractionStrategy( + provider=provider, + api_token=api_key, + # schema=Product.schema_json(), # Or use model_json_schema() + # extraction_type="schema", + instruction="Get the most relevant information from the page.", + chunk_token_threshold=1000, + overlap_rate=0.0, + apply_chunking=True, + input_format="markdown", # or "html", "fit_markdown" + extra_args={"temperature": 0.0, "max_tokens": 800}, + ) + + # 2. Build the crawler config + crawl_config = CrawlerRunConfig(extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS) + + return crawl_config diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 67b092d79b..f6861f72d4 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -6,11 +6,13 @@ import pytest from autogen.import_utils import optional_import_block, skip_on_missing_imports +from autogen.tools.experimental.crawl4ai import Crawl4AITool -with optional_import_block(): - from crawl4ai import AsyncWebCrawler +from ....conftest import Credentials -from autogen.tools.experimental.crawl4ai import Crawl4AITool +with optional_import_block(): + from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig + from crawl4ai.extraction_strategy import LLMExtractionStrategy @pytest.mark.crawl4ai # todo: remove me after we merge the PR that ads it automatically @@ -18,9 +20,13 @@ class TestCrawl4AITool: def _use_imports(self) -> None: self._AsyncWebCrawler = AsyncWebCrawler + self._BrowserConfig = BrowserConfig + self._CrawlerRunConfig = CrawlerRunConfig + self._CacheMode = CacheMode + self._LLMExtractionStrategy = LLMExtractionStrategy @pytest.mark.asyncio - async def test__init__(self) -> None: + async def test_without_llm(self) -> None: tool_without_llm = Crawl4AITool() assert isinstance(tool_without_llm, Crawl4AITool) assert tool_without_llm.name == "crawl4ai" @@ -44,3 +50,12 @@ async def test__init__(self) -> None: result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") assert isinstance(result, str) + + @pytest.mark.asyncio + async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None: + tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config) + assert isinstance(tool_with_llm, Crawl4AITool) + + result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home") + # assert isinstance(result, str) + assert result is None, result From ec9d5f72d8d14094e15efd529391a504e37ea2ab Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 08:41:22 +0100 Subject: [PATCH 07/15] WIP --- autogen/tools/experimental/crawl4ai/crawl4ai.py | 4 ++-- test/tools/experimental/crawl4ai/test_crawl4ai.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 3cd8d068d3..42349f78df 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -27,7 +27,7 @@ async def crawl4ai( ) -> Any: if llm_config is not None: browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_llm_strategy(llm_config) + crawl_config = Crawl4AITool._get_crawl_config(llm_config) else: browser_cfg = None crawl_config = None @@ -49,7 +49,7 @@ async def crawl4ai( ) @staticmethod - def _get_llm_strategy(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: 
ignore[no-any-unimported] + def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index f6861f72d4..9e9ee4c083 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -51,6 +51,12 @@ async def test_without_llm(self) -> None: result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") assert isinstance(result, str) + def test_get_crawl_config(self, mock_credentials: Credentials) -> None: + config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config) + assert isinstance(config, CrawlerRunConfig) + assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}" + + @pytest.mark.openai @pytest.mark.asyncio async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None: tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config) From eb032cc08d9b0eb75d80ac516720dae03dc9b2ac Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 09:11:15 +0100 Subject: [PATCH 08/15] Refactoring --- .../tools/experimental/crawl4ai/crawl4ai.py | 35 ++++++++++++------- .../experimental/crawl4ai/test_crawl4ai.py | 3 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 42349f78df..6aa3fe64bb 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -21,26 +21,35 @@ def __init__( self, llm_config: Optional[dict[str, Any]] = None, ) -> None: - async def crawl4ai( - url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], + async def crawl4ai_helper( # type: ignore[no-any-unimported] + url: str, + browser_cfg: Optional[BrowserConfig] = None, + crawl_config: Optional[CrawlerRunConfig] = None, ) -> Any: - if llm_config is not None: - browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_crawl_config(llm_config) - else: - browser_cfg = None - crawl_config = None - async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( url=url, config=crawl_config, ) - if result.success: - return result.extracted_content - return result.error_message + if crawl_config is None: + response = result.markdown + else: + response = result.extracted_content if result.success else result.error_message + + return response + + async def crawl4ai( + url: Annotated[str, "The url to crawl and extract information from."], + llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], + ) -> Any: + if llm_config is None: + return await crawl4ai_helper(url=url) + else: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_crawl_config(llm_config) + + return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) super().__init__( name="crawl4ai", diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 9e9ee4c083..179b151706 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -63,5 +63,4 @@ async def test_with_llm(self, credentials_gpt_4o_mini: 
Credentials) -> None: assert isinstance(tool_with_llm, Crawl4AITool) result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home") - # assert isinstance(result, str) - assert result is None, result + assert isinstance(result, str) From 5d357a2d10e2968369b8aaab758d0279f348654d Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 09:56:47 +0100 Subject: [PATCH 09/15] Add crawling with extraction schema --- .../tools/experimental/crawl4ai/crawl4ai.py | 46 +++++++++++++------ .../experimental/crawl4ai/test_crawl4ai.py | 28 +++++++++-- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 6aa3fe64bb..7f494d7ecf 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Annotated, Any, Optional +from typing import Annotated, Any, Optional, Type + +from pydantic import BaseModel from ....import_utils import optional_import_block from ... import Tool @@ -20,6 +22,7 @@ class Crawl4AITool(Tool): def __init__( self, llm_config: Optional[dict[str, Any]] = None, + extraction_model: Optional[Type[BaseModel]] = None, ) -> None: async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, @@ -39,26 +42,36 @@ async def crawl4ai_helper( # type: ignore[no-any-unimported] return response - async def crawl4ai( + async def crawl4ai_without_llm( url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], ) -> Any: - if llm_config is None: - return await crawl4ai_helper(url=url) - else: - browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_crawl_config(llm_config) + return await crawl4ai_helper(url=url) + + async def crawl4ai_with_llm( + url: Annotated[str, "The url to crawl and extract information from."], + instruction: Annotated[str, "The instruction to provide on how and what to extract."], + llm_config: Annotated[dict[str, Any], Depends(on(llm_config))], + extraction_model: Annotated[Optional[Type[BaseModel]], Depends(on(extraction_model))], + ) -> Any: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_crawl_config( + llm_config=llm_config, instruction=instruction, extraction_model=extraction_model + ) - return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) + return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) super().__init__( name="crawl4ai", description="Crawl a website and extract information.", - func_or_tool=crawl4ai, + func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm, ) @staticmethod - def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] + def _get_crawl_config( # type: ignore[no-any-unimported] + llm_config: dict[str, Any], + instruction: str, + extraction_model: Optional[Type[BaseModel]] = None, + ) -> CrawlerRunConfig: if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] @@ -76,13 +89,18 @@ def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: provider = f"{api_type}/{model}" + schema = ( + extraction_model.model_json_schema() + if (extraction_model and issubclass(extraction_model, BaseModel)) + else None + ) # 1. 
Define the LLM extraction strategy
         llm_strategy = LLMExtractionStrategy(
             provider=provider,
             api_token=api_key,
-            # schema=Product.schema_json(),  # Or use model_json_schema()
-            # extraction_type="schema",
-            instruction="Get the most relevant information from the page.",
+            schema=schema,
+            extraction_type="schema" if schema else "block",
+            instruction=instruction,
             chunk_token_threshold=1000,
             overlap_rate=0.0,
             apply_chunking=True,

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 179b151706..3514f3a355 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -2,8 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-
 import pytest
+from pydantic import BaseModel
 
 from autogen.import_utils import optional_import_block, skip_on_missing_imports
 from autogen.tools.experimental.crawl4ai import Crawl4AITool
@@ -15,7 +15,6 @@ from crawl4ai.extraction_strategy import LLMExtractionStrategy
 
-@pytest.mark.crawl4ai  # todo: remove me after we merge the PR that adds it automatically
 @skip_on_missing_imports(["crawl4ai"], "crawl4ai")
 class TestCrawl4AITool:
     def _use_imports(self) -> None:
         self._AsyncWebCrawler = AsyncWebCrawler
@@ -52,7 +51,7 @@ async def test_without_llm(self) -> None:
         assert isinstance(result, str)
 
     def test_get_crawl_config(self, mock_credentials: Credentials) -> None:
-        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config)
+        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config, instruction="dummy")
         assert isinstance(config, CrawlerRunConfig)
         assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
 
@@ -62,5 +61,26 @@ async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None:
         tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config)
         assert isinstance(tool_with_llm, Crawl4AITool)
 
-        result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home")
+        result = await tool_with_llm(
+            url="https://docs.ag2.ai/docs/Home", instruction="Get the most relevant information from the page."
+        )
+        assert isinstance(result, str)
+
+    @pytest.mark.openai
+    @pytest.mark.asyncio
+    async def test_with_llm_and_extraction_schema(self, credentials_gpt_4o_mini: Credentials) -> None:
+        class Product(BaseModel):
+            name: str
+            price: str
+
+        tool_with_llm = Crawl4AITool(
+            llm_config=credentials_gpt_4o_mini.llm_config,
+            extraction_model=Product,
+        )
+        assert isinstance(tool_with_llm, Crawl4AITool)
+
+        result = await tool_with_llm(
+            url="https://www.ikea.com/gb/en/",
+            instruction="Extract all product objects with 'name' and 'price' from the content.",
+        )
         assert isinstance(result, str)

From 9395941a4a7f0f2a53e89ca6665c6b9c9628c809 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 11:12:29 +0100
Subject: [PATCH 10/15] Refactoring

---
 .../tools/experimental/crawl4ai/crawl4ai.py | 65 +++++++++++++++++--
 .../experimental/crawl4ai/test_crawl4ai.py  | 41 ++++++++++++
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index 7f494d7ecf..d455bea1d6 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -19,11 +19,26 @@
 class Crawl4AITool(Tool):
+    """
+    Crawl a website and extract information using the crawl4ai library.
+ """ + def __init__( self, llm_config: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, + llm_strategy_kwargs: Optional[dict[str, Any]] = None, ) -> None: + """ + Initialize the Crawl4AITool. + + Args: + llm_config: The config dictionary for the LLM model. If None, the tool will run without LLM. + extraction_model: The Pydantic model to use for extraction. If None, the tool will use the default schema. + llm_strategy_kwargs: The keyword arguments to pass to the LLM extraction strategy. + """ + Crawl4AITool._validate_llm_strategy_kwargs(llm_strategy_kwargs, llm_config_provided=(llm_config is not None)) + async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, browser_cfg: Optional[BrowserConfig] = None, @@ -51,11 +66,15 @@ async def crawl4ai_with_llm( url: Annotated[str, "The url to crawl and extract information from."], instruction: Annotated[str, "The instruction to provide on how and what to extract."], llm_config: Annotated[dict[str, Any], Depends(on(llm_config))], + llm_strategy_kwargs: Annotated[Optional[dict[str, Any]], Depends(on(llm_strategy_kwargs))], extraction_model: Annotated[Optional[Type[BaseModel]], Depends(on(extraction_model))], ) -> Any: browser_cfg = BrowserConfig(headless=True) crawl_config = Crawl4AITool._get_crawl_config( - llm_config=llm_config, instruction=instruction, extraction_model=extraction_model + llm_config=llm_config, + instruction=instruction, + extraction_model=extraction_model, + llm_strategy_kwargs=llm_strategy_kwargs, ) return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) @@ -66,10 +85,40 @@ async def crawl4ai_with_llm( func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm, ) + @staticmethod + def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool) -> None: + if not llm_strategy_kwargs: + return + + if not llm_config_provided: + raise ValueError("llm_strategy_kwargs can only be provided if llm_config is also provided.") + + check_parameters_error_msg = "".join( + f"'{key}' should not be provided in llm_strategy_kwargs. It is automatically set based on llm_config.\n" + for key in ["provider", "api_token"] + if key in llm_strategy_kwargs + ) + + check_parameters_error_msg += "".join( + "'schema' should not be provided in llm_strategy_kwargs. It is automatically set based on extraction_model type.\n" + if "schema" in llm_strategy_kwargs + else "" + ) + + check_parameters_error_msg += "".join( + "'instruction' should not be provided in llm_strategy_kwargs. It is provided at the time of calling the tool.\n" + if "instruction" in llm_strategy_kwargs + else "" + ) + + if check_parameters_error_msg: + raise ValueError(check_parameters_error_msg) + @staticmethod def _get_crawl_config( # type: ignore[no-any-unimported] llm_config: dict[str, Any], instruction: str, + llm_strategy_kwargs: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, ) -> CrawlerRunConfig: if "config_list" not in llm_config: @@ -89,23 +138,25 @@ def _get_crawl_config( # type: ignore[no-any-unimported] provider = f"{api_type}/{model}" + if llm_strategy_kwargs is None: + llm_strategy_kwargs = {} + schema = ( extraction_model.model_json_schema() if (extraction_model and issubclass(extraction_model, BaseModel)) else None ) + + extraction_type = llm_strategy_kwargs.pop("extraction_type", "schema" if schema else "block") + # 1. 
Define the LLM extraction strategy llm_strategy = LLMExtractionStrategy( provider=provider, api_token=api_key, schema=schema, - extraction_type="schema" if schema else "block", + extraction_type=extraction_type, instruction=instruction, - chunk_token_threshold=1000, - overlap_rate=0.0, - apply_chunking=True, - input_format="markdown", # or "html", "fit_markdown" - extra_args={"temperature": 0.0, "max_tokens": 800}, + **llm_strategy_kwargs, ) # 2. Build the crawler config diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 3514f3a355..3a32fb8ba6 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Optional + import pytest from pydantic import BaseModel @@ -84,3 +86,42 @@ class Product(BaseModel): instruction="Extract all product objects with 'name' and 'price' from the content.", ) assert isinstance(result, str) + + @pytest.mark.parametrize( + ("llm_strategy_kwargs", "llm_config_provided", "expected_error"), + [ + (None, True, None), + ({"some_param": "dummy_value"}, True, None), + ( + {"provider": "openai/gpt-4o", "api_token": "dummy_token"}, + False, + "llm_strategy_kwargs can only be provided if llm_config is also provided.", + ), + ( + {"schema": "dummy_schema"}, + True, + "'schema' should not be provided in llm_strategy_kwargs.", + ), + ( + {"instruction": "dummy_instruction"}, + True, + "'instruction' should not be provided in llm_strategy_kwargs.", + ), + ], + ) + def test_validate_llm_strategy_kwargs( + self, llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool, expected_error: Optional[str] + ) -> None: + if expected_error is None: + Crawl4AITool._validate_llm_strategy_kwargs( + llm_strategy_kwargs=llm_strategy_kwargs, llm_config_provided=llm_config_provided + ) + return + + with pytest.raises( + ValueError, + match=expected_error, + ): + Crawl4AITool._validate_llm_strategy_kwargs( + llm_strategy_kwargs=llm_strategy_kwargs, llm_config_provided=llm_config_provided + ) From 3924eac7a1c68519df7b70dd64dc2cb066af6ddf Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 11:36:52 +0100 Subject: [PATCH 11/15] Refactor and add tests --- .../tools/experimental/crawl4ai/crawl4ai.py | 17 +++++++---- .../experimental/crawl4ai/test_crawl4ai.py | 29 +++++++++++++++++-- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index d455bea1d6..e1e6dc909e 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -115,12 +115,7 @@ def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], raise ValueError(check_parameters_error_msg) @staticmethod - def _get_crawl_config( # type: ignore[no-any-unimported] - llm_config: dict[str, Any], - instruction: str, - llm_strategy_kwargs: Optional[dict[str, Any]] = None, - extraction_model: Optional[Type[BaseModel]] = None, - ) -> CrawlerRunConfig: + def _get_provider_and_api_key(llm_config: dict[str, Any]) -> tuple[str, str]: if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] @@ -137,6 +132,16 @@ def _get_crawl_config( # type: ignore[no-any-unimported] raise ValueError("llm_config must be a valid config dictionary.") provider = f"{api_type}/{model}" + return provider, api_key # type: 
ignore[return-value]
+
+    @staticmethod
+    def _get_crawl_config(  # type: ignore[no-any-unimported]
+        llm_config: dict[str, Any],
+        instruction: str,
+        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
+        extraction_model: Optional[Type[BaseModel]] = None,
+    ) -> CrawlerRunConfig:
+        provider, api_key = Crawl4AITool._get_provider_and_api_key(llm_config)
 
         if llm_strategy_kwargs is None:
             llm_strategy_kwargs = {}

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 3a32fb8ba6..6ef1e82dc0 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -52,11 +52,36 @@ async def test_without_llm(self) -> None:
         result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home")
         assert isinstance(result, str)
 
-    def test_get_crawl_config(self, mock_credentials: Credentials) -> None:
-        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config, instruction="dummy")
-        assert isinstance(config, CrawlerRunConfig)
-        assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
+    def test_get_provider_and_api_key(self, mock_credentials: Credentials) -> None:
+        provider, api_key = Crawl4AITool._get_provider_and_api_key(mock_credentials.llm_config)
+        assert provider == "openai/gpt-4o", provider
+        assert isinstance(api_key, str)
+
+    @pytest.mark.parametrize(
+        "use_extraction_model",
+        [
+            False,
+            True,
+        ],
+    )
+    def test_get_crawl_config(self, mock_credentials: Credentials, use_extraction_model: bool) -> None:
+        class Product(BaseModel):
+            name: str
+            price: str
+
+        extraction_model = Product if use_extraction_model else None
+
+        config = Crawl4AITool._get_crawl_config(
+            mock_credentials.llm_config, instruction="dummy", extraction_model=extraction_model
+        )
+        assert isinstance(config, CrawlerRunConfig)
+        assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
+
+        if use_extraction_model:
+            assert config.extraction_strategy.schema == Product.model_json_schema()
+        else:
+            assert config.extraction_strategy.schema is None
+
     @pytest.mark.openai
     @pytest.mark.asyncio
     async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None:
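Before the notebook tutorial in PATCH 12, a sketch of driving the refactored tool directly. Tool instances are awaited directly, as the tests above do; the model name and API key are placeholders:

    import asyncio

    from autogen.tools.experimental.crawl4ai import Crawl4AITool

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": "sk-placeholder"}]}

    tool = Crawl4AITool(
        llm_config=llm_config,
        # Extra keyword arguments are forwarded to LLMExtractionStrategy;
        # "provider", "api_token", "schema" and "instruction" are rejected by
        # _validate_llm_strategy_kwargs because the tool sets them itself.
        llm_strategy_kwargs={"chunk_token_threshold": 1000, "apply_chunking": True},
    )

    result = asyncio.run(tool(url="https://docs.ag2.ai/docs/Home", instruction="Summarize the page."))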
From 2cbafb01017a599aaa69c1fd2887344c662e75f7 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 12:34:05 +0100
Subject: [PATCH 12/15] Add initial crawl4ai notebook tutorial

---
 notebook/tools_crawl4ai.ipynb | 247 ++++++++++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)
 create mode 100644 notebook/tools_crawl4ai.ipynb

diff --git a/notebook/tools_crawl4ai.ipynb b/notebook/tools_crawl4ai.ipynb
new file mode 100644
index 0000000000..1d9e8789a7
--- /dev/null
+++ b/notebook/tools_crawl4ai.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Supercharging Web Crawling with Crawl4AI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installation\n",
+    "\n",
+    "To get started with the `crawl4ai` integration in AG2, follow these steps:\n",
+    "\n",
+    "1. Install AG2 with the `crawl4ai` extra:\n",
+    "   ```bash\n",
+    "   pip install ag2[crawl4ai]\n",
+    "   ```\n",
+    "2. Set up Playwright:\n",
+    "   \n",
+    "   ```bash\n",
+    "   playwright install\n",
+    "   ```\n",
+    "\n",
+    "3. For running the code in Jupyter, use `nest_asyncio` to allow nested event loops.\n",
+    "    ```bash\n",
+    "    pip install nest_asyncio\n",
+    "    ```\n",
+    "\n",
+    "\n",
+    "You're all set! Now you can start using crawling features in AG2.\n",
+    "\n",
+    "\n",
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import nest_asyncio\n",
+    "from pydantic import BaseModel\n",
+    "\n",
+    "from autogen import AssistantAgent, UserProxyAgent\n",
+    "from autogen.tools.experimental import Crawl4AITool\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## LLM-Free Crawl4AI\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "crawlai_tool = Crawl4AITool()\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=\"Get info from https://docs.ag2.ai/docs/Home\",\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Crawl4AI with LLM\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set llm_config to Crawl4AITool\n",
+    "crawlai_tool = Crawl4AITool(llm_config=llm_config)\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=\"Get info from https://docs.ag2.ai/docs/Home\",\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Crawl4AI with LLM & Schema for Structured Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Blog(BaseModel):\n",
+    "    title: str\n",
+    "    url: str\n",
+    "\n",
+    "\n",
+    "# Set llm_config and extraction_model to Crawl4AITool\n",
+    "crawlai_tool = Crawl4AITool(llm_config=llm_config, extraction_model=Blog)\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "message = \"Extract all blog posts from https://docs.ag2.ai/blog\"\n",
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=message,\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "front_matter": {
+   "description": "Supercharging Web Crawling with Crawl4AI",
+   "tags": [
+    "tools",
+    "crawl4ai",
+    "webscraping",
+    "function calling"
+   ]
+  },
+  "kernelspec": {
+   "display_name": ".venv-crawl4ai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From ccd1cc568b73f9eb12c6b6f7c39a20b42cd59998 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 12:39:21 +0100
Subject: [PATCH 13/15] export_module for browser-use and crawl4ai modules

---
 autogen/tools/experimental/browser_use/browser_use.py | 7 +++----
 autogen/tools/experimental/crawl4ai/crawl4ai.py       | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/autogen/tools/experimental/browser_use/browser_use.py b/autogen/tools/experimental/browser_use/browser_use.py
index e83e7c3e83..003d0fc4ea 100644
--- a/autogen/tools/experimental/browser_use/browser_use.py
+++ b/autogen/tools/experimental/browser_use/browser_use.py
@@ -6,6 +6,7 @@
 
 from pydantic import BaseModel
 
+from ....doc_utils import export_module
 from ....import_utils import optional_import_block, require_optional_import
 from ... import Depends, Tool
 from ...dependency_injection import on
@@ -20,8 +21,7 @@
 __all__ = ["BrowserUseResult", "BrowserUseTool"]
 
 
-# todo: add export_module decorator
-# @export_module("autogen.tools.experimental.browser_use")
+@export_module("autogen.tools.experimental.browser_use")
 class BrowserUseResult(BaseModel):
     """The result of using the browser to perform a task.
 
@@ -35,8 +35,7 @@ class BrowserUseResult(BaseModel):
 
 
 @require_optional_import(["langchain_openai", "browser_use"], "browser-use")
-# todo: add export_module decorator
-# @export_module("autogen.tools.experimental")
+@export_module("autogen.tools.experimental")
 class BrowserUseTool(Tool):
     """BrowserUseTool is a tool that uses the browser to perform a task."""

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index e1e6dc909e..79326e4bda 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -7,6 +7,7 @@
 
 from pydantic import BaseModel
 
+from ....doc_utils import export_module
 from ....import_utils import optional_import_block
 from ... import Tool
 from ...dependency_injection import Depends, on
@@ -18,6 +19,7 @@
 __all__ = ["Crawl4AITool"]
 
 
+@export_module("autogen.tools.experimental")
 class Crawl4AITool(Tool):
     """
     Crawl a website and extract information using the crawl4ai library.
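PATCH 14 below quotes the BrowserConfig and CrawlerRunConfig annotations. A sketch of the failure mode it guards against, assuming no `from __future__ import annotations`: with the optional-import pattern the crawl4ai names may be undefined at module import time, and unquoted annotations in a `def` are evaluated eagerly:

    from typing import Any, Optional

    # If the optional import was skipped, BrowserConfig does not exist, so
    #   async def helper(cfg: Optional[BrowserConfig] = None) -> Any: ...
    # raises NameError at definition time. A string annotation is only
    # resolved on demand (e.g. via typing.get_type_hints):
    async def helper(cfg: Optional["BrowserConfig"] = None) -> Any:
        ...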
From b660a4309ca9d9850394a261049696925cbdccef Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 12:51:38 +0100 Subject: [PATCH 14/15] Fix tests --- autogen/tools/experimental/crawl4ai/crawl4ai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 79326e4bda..62f6f47925 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -43,8 +43,8 @@ def __init__( async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, - browser_cfg: Optional[BrowserConfig] = None, - crawl_config: Optional[CrawlerRunConfig] = None, + browser_cfg: Optional["BrowserConfig"] = None, + crawl_config: Optional["CrawlerRunConfig"] = None, ) -> Any: async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( @@ -142,7 +142,7 @@ def _get_crawl_config( # type: ignore[no-any-unimported] instruction: str, llm_strategy_kwargs: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, - ) -> CrawlerRunConfig: + ) -> "CrawlerRunConfig": provider, api_key = Crawl4AITool._get_provider_and_api_key(llm_config) if llm_strategy_kwargs is None: From 97337940db83d0add17018fd7cf2e5f6b1e591c3 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 13:22:40 +0100 Subject: [PATCH 15/15] Cleanup --- autogen/tools/__init__.py | 2 +- autogen/tools/experimental/crawl4ai/crawl4ai.py | 3 ++- .../experimental/browser_use/test_browser_use.py | 11 +---------- test/tools/experimental/crawl4ai/test_crawl4ai.py | 10 +--------- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/autogen/tools/__init__.py b/autogen/tools/__init__.py index 04542b4d3b..ecb29373ff 100644 --- a/autogen/tools/__init__.py +++ b/autogen/tools/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai +# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors # # SPDX-License-Identifier: Apache-2.0 diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 62f6f47925..f616613eb7 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -8,7 +8,7 @@ from pydantic import BaseModel from ....doc_utils import export_module -from ....import_utils import optional_import_block +from ....import_utils import optional_import_block, require_optional_import from ... 
import Tool from ...dependency_injection import Depends, on @@ -19,6 +19,7 @@ __all__ = ["Crawl4AITool"] +@require_optional_import(["crawl4ai"], "crawl4ai") @export_module("autogen.tools.experimental") class Crawl4AITool(Tool): """ diff --git a/test/tools/experimental/browser_use/test_browser_use.py b/test/tools/experimental/browser_use/test_browser_use.py index 0426b6c31e..7d194c6464 100644 --- a/test/tools/experimental/browser_use/test_browser_use.py +++ b/test/tools/experimental/browser_use/test_browser_use.py @@ -8,23 +8,14 @@ import pytest from autogen import AssistantAgent, UserProxyAgent -from autogen.import_utils import optional_import_block, skip_on_missing_imports +from autogen.import_utils import skip_on_missing_imports from autogen.tools.experimental.browser_use import BrowserUseResult, BrowserUseTool from ....conftest import Credentials, credentials_browser_use -with optional_import_block(): - from browser_use import Agent - from langchain_openai import ChatOpenAI - -@pytest.mark.browser_use # todo: remove me after we merge the PR that ads it automatically @skip_on_missing_imports(["langchain_openai", "browser_use"], "browser-use") class TestBrowserUseToolOpenai: - def _use_imports(self) -> None: - self._ChatOpenAI = ChatOpenAI - self._Agent = Agent - def test_broser_use_tool_init(self, mock_credentials: Credentials) -> None: browser_use_tool = BrowserUseTool(llm_config=mock_credentials.llm_config) assert browser_use_tool.name == "browser_use" diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 6ef1e82dc0..befb757fbe 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -13,19 +13,11 @@ from ....conftest import Credentials with optional_import_block(): - from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig - from crawl4ai.extraction_strategy import LLMExtractionStrategy + from crawl4ai import CrawlerRunConfig @skip_on_missing_imports(["crawl4ai"], "crawl4ai") class TestCrawl4AITool: - def _use_imports(self) -> None: - self._AsyncWebCrawler = AsyncWebCrawler - self._BrowserConfig = BrowserConfig - self._CrawlerRunConfig = CrawlerRunConfig - self._CacheMode = CacheMode - self._LLMExtractionStrategy = LLMExtractionStrategy - @pytest.mark.asyncio async def test_without_llm(self) -> None: tool_without_llm = Crawl4AITool()
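Taken together, the series leaves the tool with the API sketched below. The values mirror the tests and the notebook; the model name and API key are placeholders:

    import asyncio

    from pydantic import BaseModel

    from autogen.tools.experimental import Crawl4AITool

    class Product(BaseModel):
        name: str
        price: str

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": "sk-placeholder"}]}

    # llm_config is optional: without it the tool returns the page as markdown;
    # with it, crawl4ai's LLMExtractionStrategy extracts content, optionally
    # shaped by a Pydantic extraction_model.
    tool = Crawl4AITool(llm_config=llm_config, extraction_model=Product)

    result = asyncio.run(
        tool(
            url="https://www.ikea.com/gb/en/",
            instruction="Extract all product objects with 'name' and 'price' from the content.",
        )
    )
    print(result)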