From 40d642f8e561622bea83c2a0db80e291f05dfa24 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 09:24:04 +0100
Subject: [PATCH 01/15] Crawl4AI integration WIP

---
 autogen/tools/__init__.py |  2 +-
 autogen/tools/crawl4ai.py | 25 +++++++++++++++++++++++++
 pyproject.toml            |  5 +++++
 setup_ag2.py              |  1 +
 setup_autogen.py          |  1 +
 5 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 autogen/tools/crawl4ai.py

diff --git a/autogen/tools/__init__.py b/autogen/tools/__init__.py
index ecb29373ff..04542b4d3b 100644
--- a/autogen/tools/__init__.py
+++ b/autogen/tools/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
 #
 # SPDX-License-Identifier: Apache-2.0

diff --git a/autogen/tools/crawl4ai.py b/autogen/tools/crawl4ai.py
new file mode 100644
index 0000000000..445dcc8906
--- /dev/null
+++ b/autogen/tools/crawl4ai.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+from crawl4ai import AsyncWebCrawler
+
+from . import Tool
+
+
+class Crawl4AI(Tool):
+    def __init__(self) -> None:
+        async def crawl4ai(url: str) -> Any:
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(
+                    url=url,
+                )
+                return result.markdown()
+
+        super().__init__(
+            name="crawl4ai",
+            description="Crawl a website and extract information.",
+            func_or_tool=crawl4ai,
+        )

diff --git a/pyproject.toml b/pyproject.toml
index 7b6d6b8393..eb79015355 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -124,6 +124,11 @@ rag = [
     "webdriver-manager==4.0.2",
 ]
 
+
+crawl4ai = [
+    "crawl4ai>=0.4.247,<0.5",
+]
+
 neo4j = [
     "docx2txt==0.8",
     "llama-index==0.12.11",

diff --git a/setup_ag2.py b/setup_ag2.py
index 2e17b70c61..b5b0c6afbc 100644
--- a/setup_ag2.py
+++ b/setup_ag2.py
@@ -36,6 +36,7 @@
     "retrievechat-qdrant": ["pyautogen[retrievechat-qdrant]==" + __version__],
     "graph-rag-falkor-db": ["pyautogen[graph-rag-falkor-db]==" + __version__],
     "rag": ["pyautogen[rag]==" + __version__],
+    "crawl4ai": ["pyautogen[crawl4ai]==" + __version__],
     "neo4j": ["pyautogen[neo4j]==" + __version__],
     "twilio": ["pyautogen[twilio]==" + __version__],
     "interop-crewai": ["pyautogen[interop-crewai]==" + __version__],

diff --git a/setup_autogen.py b/setup_autogen.py
index 9f166ee2e0..3062684b42 100644
--- a/setup_autogen.py
+++ b/setup_autogen.py
@@ -36,6 +36,7 @@
     "retrievechat-qdrant": ["pyautogen[retrievechat-qdrant]==" + __version__],
     "graph-rag-falkor-db": ["pyautogen[graph-rag-falkor-db]==" + __version__],
     "rag": ["pyautogen[rag]==" + __version__],
+    "crawl4ai": ["pyautogen[crawl4ai]==" + __version__],
    "neo4j": ["pyautogen[neo4j]==" + __version__],
     "twilio": ["pyautogen[twilio]==" + __version__],
     "interop-crewai": ["pyautogen[interop-crewai]==" + __version__],
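A minimal sketch of how the tool added in PATCH 01 is meant to be wired up. This mirrors the notebook added later in the series (PATCH 12) and assumes AG2's standard tool-registration API plus an OPENAI_API_KEY in the environment:

    import os

    from autogen import AssistantAgent, UserProxyAgent
    from autogen.tools.crawl4ai import Crawl4AI

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": os.environ["OPENAI_API_KEY"]}]}

    user_proxy = UserProxyAgent(name="user_proxy", human_input_mode="NEVER")
    assistant = AssistantAgent(name="assistant", llm_config=llm_config)

    # Expose the crawler to the LLM as a callable function; user_proxy executes it.
    tool = Crawl4AI()
    tool.register_for_execution(user_proxy)
    tool.register_for_llm(assistant)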
From 565b56aa77e52d78f62ab5c9ddf528cd6a78966e Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 14:26:48 +0100
Subject: [PATCH 02/15] Crawl4AI integration WIP

---
 autogen/tools/crawl4ai.py   |  8 +++++---
 test/tools/test_crawl4ai.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 test/tools/test_crawl4ai.py

diff --git a/autogen/tools/crawl4ai.py b/autogen/tools/crawl4ai.py
index 445dcc8906..2af58fbdda 100644
--- a/autogen/tools/crawl4ai.py
+++ b/autogen/tools/crawl4ai.py
@@ -2,16 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any
+from typing import Annotated, Any
 
 from crawl4ai import AsyncWebCrawler
 
 from . import Tool
 
 
-class Crawl4AI(Tool):
+class Crawl4AITool(Tool):
     def __init__(self) -> None:
-        async def crawl4ai(url: str) -> Any:
+        async def crawl4ai(
+            url: Annotated[str, "The url to crawl and extract information from."],
+        ) -> Any:
             async with AsyncWebCrawler() as crawler:
                 result = await crawler.arun(
                     url=url,

diff --git a/test/tools/test_crawl4ai.py b/test/tools/test_crawl4ai.py
new file mode 100644
index 0000000000..2a681bd4a1
--- /dev/null
+++ b/test/tools/test_crawl4ai.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+from autogen.tools.crawl4ai import Crawl4AITool
+
+
+class TestCrawl4AITool:
+    def test__init__(self):
+        tool = Crawl4AITool()
+
+        assert isinstance(tool, Crawl4AITool)
+        assert tool.name == "crawl4ai"
+        assert tool.description == "Crawl a website and extract information."
+        assert callable(tool.func)
+        expected_schema = {
+            "function": {
+                "description": "Crawl a website and extract information.",
+                "name": "crawl4ai",
+                "parameters": {
+                    "properties": {
+                        "url": {"description": "The url to crawl and extract information from.", "type": "string"}
+                    },
+                    "required": ["url"],
+                    "type": "object",
+                },
+            },
+            "type": "function",
+        }
+        assert tool.tool_schema == expected_schema
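PATCH 03 below moves the tool under autogen.tools.experimental and wraps the crawl4ai import in optional_import_block. A small sketch of that guard pattern, assuming (as the helper's name suggests) that it suppresses the ImportError when the optional extra is not installed:

    from autogen.import_utils import optional_import_block

    with optional_import_block():
        # Resolved only when the optional "crawl4ai" extra is installed; otherwise
        # the import error is swallowed, the name stays undefined, and guarded
        # call sites must not be reached.
        from crawl4ai import AsyncWebCrawler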
From d57644a1120f65094869b6bb5e0a5789d2d94006 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 15:57:47 +0100
Subject: [PATCH 03/15] Crawl4AI integration WIP

---
 autogen/tools/experimental/__init__.py            |  3 ++-
 autogen/tools/experimental/crawl4ai/crawl4ai.py   | 10 +++++++---
 pyproject.toml                                    |  1 +
 test/tools/experimental/crawl4ai/test_crawl4ai.py | 12 ++++++++++++
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/autogen/tools/experimental/__init__.py b/autogen/tools/experimental/__init__.py
index b36378767f..ef7aa570fc 100644
--- a/autogen/tools/experimental/__init__.py
+++ b/autogen/tools/experimental/__init__.py
@@ -3,5 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .browser_use import BrowserUseTool
+from .crawl4ai import Crawl4AITool
 
-__all__ = ["BrowserUseTool"]
+__all__ = ["BrowserUseTool", "Crawl4AITool"]

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index 16713d895b..b712eef16d 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -4,10 +4,14 @@
 
 from typing import Annotated, Any
 
-from crawl4ai import AsyncWebCrawler
-
+from ....import_utils import optional_import_block
 from ... import Tool
 
+with optional_import_block():
+    from crawl4ai import AsyncWebCrawler
+
+__all__ = ["Crawl4AITool"]
+
 
 class Crawl4AITool(Tool):
     def __init__(self) -> None:
@@ -18,7 +22,7 @@ async def crawl4ai(
                 result = await crawler.arun(
                     url=url,
                 )
-                return result.markdown()
+                return result.markdown
 
         super().__init__(
             name="crawl4ai",

diff --git a/pyproject.toml b/pyproject.toml
index 01a6106183..89cdaaa98e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -275,6 +275,7 @@ markers = [
     "redis",
     "docker",
     "browser_use",
+    "crawl4ai",
 ]
 
 [tool.black]

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 0a1251895b..884dac1fc3 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -3,10 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
+import pytest
+
+from autogen.import_utils import optional_import_block, skip_on_missing_imports
+
+with optional_import_block():
+    from crawl4ai import AsyncWebCrawler
+
 from autogen.tools.experimental.crawl4ai import Crawl4AITool
 
 
+@pytest.mark.crawl4ai  # todo: remove me after we merge the PR that adds it automatically
+@skip_on_missing_imports(["crawl4ai"], "crawl4ai")
 class TestCrawl4AITool:
+    def _use_imports(self) -> None:
+        self._AsyncWebCrawler = AsyncWebCrawler
+
     def test__init__(self) -> None:
         tool = Crawl4AITool()

From 117a49f3da4667bf3cd949b8077897b0f80ab500 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Mon, 27 Jan 2025 15:58:15 +0100
Subject: [PATCH 04/15] Crawl4AI integration WIP

---
 .../experimental/crawl4ai/test_crawl4ai.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 884dac1fc3..67b092d79b 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -19,13 +19,13 @@ class TestCrawl4AITool:
     def _use_imports(self) -> None:
         self._AsyncWebCrawler = AsyncWebCrawler
 
-    def test__init__(self) -> None:
-        tool = Crawl4AITool()
-
-        assert isinstance(tool, Crawl4AITool)
-        assert tool.name == "crawl4ai"
-        assert tool.description == "Crawl a website and extract information."
-        assert callable(tool.func)
+    @pytest.mark.asyncio
+    async def test__init__(self) -> None:
+        tool_without_llm = Crawl4AITool()
+        assert isinstance(tool_without_llm, Crawl4AITool)
+        assert tool_without_llm.name == "crawl4ai"
+        assert tool_without_llm.description == "Crawl a website and extract information."
+ assert callable(tool_without_llm.func) expected_schema = { "function": { "description": "Crawl a website and extract information.", @@ -40,4 +40,7 @@ def test__init__(self) -> None: }, "type": "function", } - assert tool.tool_schema == expected_schema + assert tool_without_llm.tool_schema == expected_schema + + result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") + assert isinstance(result, str) From a8c0fa6a67499f38585fb8c8af3aad7b886588d5 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Tue, 28 Jan 2025 13:45:55 +0100 Subject: [PATCH 05/15] Refactoring --- autogen/tools/dependency_injection.py | 13 ++++++++++++- .../tools/experimental/browser_use/browser_use.py | 13 ++----------- autogen/tools/experimental/crawl4ai/crawl4ai.py | 11 +++++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/autogen/tools/dependency_injection.py b/autogen/tools/dependency_injection.py index af70eabb7b..078895a07b 100644 --- a/autogen/tools/dependency_injection.py +++ b/autogen/tools/dependency_injection.py @@ -7,7 +7,7 @@ from abc import ABC from collections.abc import Iterable from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union, get_type_hints from fast_depends import Depends as FastDepends from fast_depends import inject @@ -26,6 +26,7 @@ "Field", "get_context_params", "inject_params", + "on", ] @@ -75,6 +76,16 @@ def last_message(self) -> Optional[dict[str, Any]]: return self._agent.last_message() +T = TypeVar("T") + + +def on(x: T) -> Callable[[], T]: + def inner(_x: T = x) -> T: + return _x + + return inner + + @export_module("autogen.tools") def Depends(x: Any) -> Any: # noqa: N802 """Creates a dependency for injection based on the provided context or type. diff --git a/autogen/tools/experimental/browser_use/browser_use.py b/autogen/tools/experimental/browser_use/browser_use.py index 92d002c4bd..e83e7c3e83 100644 --- a/autogen/tools/experimental/browser_use/browser_use.py +++ b/autogen/tools/experimental/browser_use/browser_use.py @@ -2,12 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Annotated, Any, Callable, Optional, TypeVar +from typing import Annotated, Any, Optional from pydantic import BaseModel from ....import_utils import optional_import_block, require_optional_import from ... import Depends, Tool +from ...dependency_injection import on with optional_import_block(): from browser_use import Agent @@ -33,16 +34,6 @@ class BrowserUseResult(BaseModel): final_result: Optional[str] -T = TypeVar("T") - - -def on(x: T) -> Callable[[], T]: - def inner(_x: T = x) -> T: - return _x - - return inner - - @require_optional_import(["langchain_openai", "browser_use"], "browser-use") # todo: add export_module decorator # @export_module("autogen.tools.experimental") diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index b712eef16d..f506703be7 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -2,10 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Annotated, Any +from typing import Annotated, Any, Optional from ....import_utils import optional_import_block from ... 
import Tool +from ...dependency_injection import Depends, on with optional_import_block(): from crawl4ai import AsyncWebCrawler @@ -14,11 +15,17 @@ class Crawl4AITool(Tool): - def __init__(self) -> None: + def __init__( + self, + llm_config: Optional[dict[str, Any]] = None, + ) -> None: async def crawl4ai( url: Annotated[str, "The url to crawl and extract information from."], + llm_config: Annotated[Optional[dict[str, Any]], Depends[on(llm_config)]], ) -> Any: async with AsyncWebCrawler() as crawler: + if llm_config is not None: + raise ValueError("llm_config is not supported yet.") result = await crawler.arun( url=url, ) From 220c554c4433013fdc7585949179a254336044a8 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Tue, 28 Jan 2025 14:18:02 +0100 Subject: [PATCH 06/15] crawl4ai with LLMs WIP --- .../tools/experimental/crawl4ai/crawl4ai.py | 61 +++++++++++++++++-- .../experimental/crawl4ai/test_crawl4ai.py | 23 +++++-- 2 files changed, 74 insertions(+), 10 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index f506703be7..3cd8d068d3 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import os from typing import Annotated, Any, Optional from ....import_utils import optional_import_block @@ -9,7 +10,8 @@ from ...dependency_injection import Depends, on with optional_import_block(): - from crawl4ai import AsyncWebCrawler + from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig + from crawl4ai.extraction_strategy import LLMExtractionStrategy __all__ = ["Crawl4AITool"] @@ -21,18 +23,65 @@ def __init__( ) -> None: async def crawl4ai( url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends[on(llm_config)]], + llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], ) -> Any: - async with AsyncWebCrawler() as crawler: - if llm_config is not None: - raise ValueError("llm_config is not supported yet.") + if llm_config is not None: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_llm_strategy(llm_config) + else: + browser_cfg = None + crawl_config = None + + async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( url=url, + config=crawl_config, ) - return result.markdown + if result.success: + return result.extracted_content + + return result.error_message super().__init__( name="crawl4ai", description="Crawl a website and extract information.", func_or_tool=crawl4ai, ) + + @staticmethod + def _get_llm_strategy(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] + if "config_list" not in llm_config: + if "model" in llm_config: + model = llm_config["model"] + api_type = "openai" + api_key = os.getenv("OPENAI_API_KEY") + raise ValueError("llm_config must be a valid config dictionary.") + else: + try: + model = llm_config["config_list"][0]["model"] + api_type = llm_config["config_list"][0].get("api_type", "openai") + api_key = llm_config["config_list"][0]["api_key"] + + except (KeyError, TypeError): + raise ValueError("llm_config must be a valid config dictionary.") + + provider = f"{api_type}/{model}" + + # 1. 
Define the LLM extraction strategy + llm_strategy = LLMExtractionStrategy( + provider=provider, + api_token=api_key, + # schema=Product.schema_json(), # Or use model_json_schema() + # extraction_type="schema", + instruction="Get the most relevant information from the page.", + chunk_token_threshold=1000, + overlap_rate=0.0, + apply_chunking=True, + input_format="markdown", # or "html", "fit_markdown" + extra_args={"temperature": 0.0, "max_tokens": 800}, + ) + + # 2. Build the crawler config + crawl_config = CrawlerRunConfig(extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS) + + return crawl_config diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 67b092d79b..f6861f72d4 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -6,11 +6,13 @@ import pytest from autogen.import_utils import optional_import_block, skip_on_missing_imports +from autogen.tools.experimental.crawl4ai import Crawl4AITool -with optional_import_block(): - from crawl4ai import AsyncWebCrawler +from ....conftest import Credentials -from autogen.tools.experimental.crawl4ai import Crawl4AITool +with optional_import_block(): + from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig + from crawl4ai.extraction_strategy import LLMExtractionStrategy @pytest.mark.crawl4ai # todo: remove me after we merge the PR that ads it automatically @@ -18,9 +20,13 @@ class TestCrawl4AITool: def _use_imports(self) -> None: self._AsyncWebCrawler = AsyncWebCrawler + self._BrowserConfig = BrowserConfig + self._CrawlerRunConfig = CrawlerRunConfig + self._CacheMode = CacheMode + self._LLMExtractionStrategy = LLMExtractionStrategy @pytest.mark.asyncio - async def test__init__(self) -> None: + async def test_without_llm(self) -> None: tool_without_llm = Crawl4AITool() assert isinstance(tool_without_llm, Crawl4AITool) assert tool_without_llm.name == "crawl4ai" @@ -44,3 +50,12 @@ async def test__init__(self) -> None: result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") assert isinstance(result, str) + + @pytest.mark.asyncio + async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None: + tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config) + assert isinstance(tool_with_llm, Crawl4AITool) + + result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home") + # assert isinstance(result, str) + assert result is None, result From ec9d5f72d8d14094e15efd529391a504e37ea2ab Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 08:41:22 +0100 Subject: [PATCH 07/15] WIP --- autogen/tools/experimental/crawl4ai/crawl4ai.py | 4 ++-- test/tools/experimental/crawl4ai/test_crawl4ai.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 3cd8d068d3..42349f78df 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -27,7 +27,7 @@ async def crawl4ai( ) -> Any: if llm_config is not None: browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_llm_strategy(llm_config) + crawl_config = Crawl4AITool._get_crawl_config(llm_config) else: browser_cfg = None crawl_config = None @@ -49,7 +49,7 @@ async def crawl4ai( ) @staticmethod - def _get_llm_strategy(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: 
ignore[no-any-unimported] + def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index f6861f72d4..9e9ee4c083 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -51,6 +51,12 @@ async def test_without_llm(self) -> None: result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home") assert isinstance(result, str) + def test_get_crawl_config(self, mock_credentials: Credentials) -> None: + config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config) + assert isinstance(config, CrawlerRunConfig) + assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}" + + @pytest.mark.openai @pytest.mark.asyncio async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None: tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config) From eb032cc08d9b0eb75d80ac516720dae03dc9b2ac Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 09:11:15 +0100 Subject: [PATCH 08/15] Refactoring --- .../tools/experimental/crawl4ai/crawl4ai.py | 35 ++++++++++++------- .../experimental/crawl4ai/test_crawl4ai.py | 3 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 42349f78df..6aa3fe64bb 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -21,26 +21,35 @@ def __init__( self, llm_config: Optional[dict[str, Any]] = None, ) -> None: - async def crawl4ai( - url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], + async def crawl4ai_helper( # type: ignore[no-any-unimported] + url: str, + browser_cfg: Optional[BrowserConfig] = None, + crawl_config: Optional[CrawlerRunConfig] = None, ) -> Any: - if llm_config is not None: - browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_crawl_config(llm_config) - else: - browser_cfg = None - crawl_config = None - async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( url=url, config=crawl_config, ) - if result.success: - return result.extracted_content - return result.error_message + if crawl_config is None: + response = result.markdown + else: + response = result.extracted_content if result.success else result.error_message + + return response + + async def crawl4ai( + url: Annotated[str, "The url to crawl and extract information from."], + llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], + ) -> Any: + if llm_config is None: + return await crawl4ai_helper(url=url) + else: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_crawl_config(llm_config) + + return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) super().__init__( name="crawl4ai", diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 9e9ee4c083..179b151706 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -63,5 +63,4 @@ async def test_with_llm(self, credentials_gpt_4o_mini: 
Credentials) -> None: assert isinstance(tool_with_llm, Crawl4AITool) result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home") - # assert isinstance(result, str) - assert result is None, result + assert isinstance(result, str) From 5d357a2d10e2968369b8aaab758d0279f348654d Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 09:56:47 +0100 Subject: [PATCH 09/15] Add crawling with extraction schema --- .../tools/experimental/crawl4ai/crawl4ai.py | 46 +++++++++++++------ .../experimental/crawl4ai/test_crawl4ai.py | 28 +++++++++-- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 6aa3fe64bb..7f494d7ecf 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Annotated, Any, Optional +from typing import Annotated, Any, Optional, Type + +from pydantic import BaseModel from ....import_utils import optional_import_block from ... import Tool @@ -20,6 +22,7 @@ class Crawl4AITool(Tool): def __init__( self, llm_config: Optional[dict[str, Any]] = None, + extraction_model: Optional[Type[BaseModel]] = None, ) -> None: async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, @@ -39,26 +42,36 @@ async def crawl4ai_helper( # type: ignore[no-any-unimported] return response - async def crawl4ai( + async def crawl4ai_without_llm( url: Annotated[str, "The url to crawl and extract information from."], - llm_config: Annotated[Optional[dict[str, Any]], Depends(on(llm_config))], ) -> Any: - if llm_config is None: - return await crawl4ai_helper(url=url) - else: - browser_cfg = BrowserConfig(headless=True) - crawl_config = Crawl4AITool._get_crawl_config(llm_config) + return await crawl4ai_helper(url=url) + + async def crawl4ai_with_llm( + url: Annotated[str, "The url to crawl and extract information from."], + instruction: Annotated[str, "The instruction to provide on how and what to extract."], + llm_config: Annotated[dict[str, Any], Depends(on(llm_config))], + extraction_model: Annotated[Optional[Type[BaseModel]], Depends(on(extraction_model))], + ) -> Any: + browser_cfg = BrowserConfig(headless=True) + crawl_config = Crawl4AITool._get_crawl_config( + llm_config=llm_config, instruction=instruction, extraction_model=extraction_model + ) - return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) + return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) super().__init__( name="crawl4ai", description="Crawl a website and extract information.", - func_or_tool=crawl4ai, + func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm, ) @staticmethod - def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: ignore[no-any-unimported] + def _get_crawl_config( # type: ignore[no-any-unimported] + llm_config: dict[str, Any], + instruction: str, + extraction_model: Optional[Type[BaseModel]] = None, + ) -> CrawlerRunConfig: if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] @@ -76,13 +89,18 @@ def _get_crawl_config(llm_config: dict[str, Any]) -> CrawlerRunConfig: # type: provider = f"{api_type}/{model}" + schema = ( + extraction_model.model_json_schema() + if (extraction_model and issubclass(extraction_model, BaseModel)) + else None + ) # 1. 
Define the LLM extraction strategy
         llm_strategy = LLMExtractionStrategy(
             provider=provider,
             api_token=api_key,
-            # schema=Product.schema_json(),  # Or use model_json_schema()
-            # extraction_type="schema",
-            instruction="Get the most relevant information from the page.",
+            schema=schema,
+            extraction_type="schema" if schema else "block",
+            instruction=instruction,
             chunk_token_threshold=1000,
             overlap_rate=0.0,
             apply_chunking=True,

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 179b151706..3514f3a355 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -2,8 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-
 import pytest
+from pydantic import BaseModel
 
 from autogen.import_utils import optional_import_block, skip_on_missing_imports
 from autogen.tools.experimental.crawl4ai import Crawl4AITool
@@ -15,7 +15,6 @@ from crawl4ai.extraction_strategy import LLMExtractionStrategy
 
-@pytest.mark.crawl4ai  # todo: remove me after we merge the PR that adds it automatically
 @skip_on_missing_imports(["crawl4ai"], "crawl4ai")
 class TestCrawl4AITool:
     def _use_imports(self) -> None:
         self._AsyncWebCrawler = AsyncWebCrawler
@@ -52,7 +51,7 @@ async def test_without_llm(self) -> None:
         assert isinstance(result, str)
 
     def test_get_crawl_config(self, mock_credentials: Credentials) -> None:
-        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config)
+        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config, instruction="dummy")
         assert isinstance(config, CrawlerRunConfig)
         assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
 
@@ -62,5 +61,26 @@ async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None:
         tool_with_llm = Crawl4AITool(llm_config=credentials_gpt_4o_mini.llm_config)
         assert isinstance(tool_with_llm, Crawl4AITool)
 
-        result = await tool_with_llm(url="https://docs.ag2.ai/docs/Home")
+        result = await tool_with_llm(
+            url="https://docs.ag2.ai/docs/Home", instruction="Get the most relevant information from the page."
+        )
+        assert isinstance(result, str)
+
+    @pytest.mark.openai
+    @pytest.mark.asyncio
+    async def test_with_llm_and_extraction_schema(self, credentials_gpt_4o_mini: Credentials) -> None:
+        class Product(BaseModel):
+            name: str
+            price: str
+
+        tool_with_llm = Crawl4AITool(
+            llm_config=credentials_gpt_4o_mini.llm_config,
+            extraction_model=Product,
+        )
+        assert isinstance(tool_with_llm, Crawl4AITool)
+
+        result = await tool_with_llm(
+            url="https://www.ikea.com/gb/en/",
+            instruction="Extract all product objects with 'name' and 'price' from the content.",
+        )
         assert isinstance(result, str)

From 9395941a4a7f0f2a53e89ca6665c6b9c9628c809 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 11:12:29 +0100
Subject: [PATCH 10/15] Refactoring

---
 .../tools/experimental/crawl4ai/crawl4ai.py | 65 +++++++++++++++++--
 .../experimental/crawl4ai/test_crawl4ai.py  | 41 ++++++++++++
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index 7f494d7ecf..d455bea1d6 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -19,11 +19,26 @@
 class Crawl4AITool(Tool):
+    """
+    Crawl a website and extract information using the crawl4ai library.
+ """ + def __init__( self, llm_config: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, + llm_strategy_kwargs: Optional[dict[str, Any]] = None, ) -> None: + """ + Initialize the Crawl4AITool. + + Args: + llm_config: The config dictionary for the LLM model. If None, the tool will run without LLM. + extraction_model: The Pydantic model to use for extraction. If None, the tool will use the default schema. + llm_strategy_kwargs: The keyword arguments to pass to the LLM extraction strategy. + """ + Crawl4AITool._validate_llm_strategy_kwargs(llm_strategy_kwargs, llm_config_provided=(llm_config is not None)) + async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, browser_cfg: Optional[BrowserConfig] = None, @@ -51,11 +66,15 @@ async def crawl4ai_with_llm( url: Annotated[str, "The url to crawl and extract information from."], instruction: Annotated[str, "The instruction to provide on how and what to extract."], llm_config: Annotated[dict[str, Any], Depends(on(llm_config))], + llm_strategy_kwargs: Annotated[Optional[dict[str, Any]], Depends(on(llm_strategy_kwargs))], extraction_model: Annotated[Optional[Type[BaseModel]], Depends(on(extraction_model))], ) -> Any: browser_cfg = BrowserConfig(headless=True) crawl_config = Crawl4AITool._get_crawl_config( - llm_config=llm_config, instruction=instruction, extraction_model=extraction_model + llm_config=llm_config, + instruction=instruction, + extraction_model=extraction_model, + llm_strategy_kwargs=llm_strategy_kwargs, ) return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config) @@ -66,10 +85,40 @@ async def crawl4ai_with_llm( func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm, ) + @staticmethod + def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool) -> None: + if not llm_strategy_kwargs: + return + + if not llm_config_provided: + raise ValueError("llm_strategy_kwargs can only be provided if llm_config is also provided.") + + check_parameters_error_msg = "".join( + f"'{key}' should not be provided in llm_strategy_kwargs. It is automatically set based on llm_config.\n" + for key in ["provider", "api_token"] + if key in llm_strategy_kwargs + ) + + check_parameters_error_msg += "".join( + "'schema' should not be provided in llm_strategy_kwargs. It is automatically set based on extraction_model type.\n" + if "schema" in llm_strategy_kwargs + else "" + ) + + check_parameters_error_msg += "".join( + "'instruction' should not be provided in llm_strategy_kwargs. It is provided at the time of calling the tool.\n" + if "instruction" in llm_strategy_kwargs + else "" + ) + + if check_parameters_error_msg: + raise ValueError(check_parameters_error_msg) + @staticmethod def _get_crawl_config( # type: ignore[no-any-unimported] llm_config: dict[str, Any], instruction: str, + llm_strategy_kwargs: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, ) -> CrawlerRunConfig: if "config_list" not in llm_config: @@ -89,23 +138,25 @@ def _get_crawl_config( # type: ignore[no-any-unimported] provider = f"{api_type}/{model}" + if llm_strategy_kwargs is None: + llm_strategy_kwargs = {} + schema = ( extraction_model.model_json_schema() if (extraction_model and issubclass(extraction_model, BaseModel)) else None ) + + extraction_type = llm_strategy_kwargs.pop("extraction_type", "schema" if schema else "block") + # 1. 
Define the LLM extraction strategy llm_strategy = LLMExtractionStrategy( provider=provider, api_token=api_key, schema=schema, - extraction_type="schema" if schema else "block", + extraction_type=extraction_type, instruction=instruction, - chunk_token_threshold=1000, - overlap_rate=0.0, - apply_chunking=True, - input_format="markdown", # or "html", "fit_markdown" - extra_args={"temperature": 0.0, "max_tokens": 800}, + **llm_strategy_kwargs, ) # 2. Build the crawler config diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 3514f3a355..3a32fb8ba6 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Optional + import pytest from pydantic import BaseModel @@ -84,3 +86,42 @@ class Product(BaseModel): instruction="Extract all product objects with 'name' and 'price' from the content.", ) assert isinstance(result, str) + + @pytest.mark.parametrize( + ("llm_strategy_kwargs", "llm_config_provided", "expected_error"), + [ + (None, True, None), + ({"some_param": "dummy_value"}, True, None), + ( + {"provider": "openai/gpt-4o", "api_token": "dummy_token"}, + False, + "llm_strategy_kwargs can only be provided if llm_config is also provided.", + ), + ( + {"schema": "dummy_schema"}, + True, + "'schema' should not be provided in llm_strategy_kwargs.", + ), + ( + {"instruction": "dummy_instruction"}, + True, + "'instruction' should not be provided in llm_strategy_kwargs.", + ), + ], + ) + def test_validate_llm_strategy_kwargs( + self, llm_strategy_kwargs: Optional[dict[str, Any]], llm_config_provided: bool, expected_error: Optional[str] + ) -> None: + if expected_error is None: + Crawl4AITool._validate_llm_strategy_kwargs( + llm_strategy_kwargs=llm_strategy_kwargs, llm_config_provided=llm_config_provided + ) + return + + with pytest.raises( + ValueError, + match=expected_error, + ): + Crawl4AITool._validate_llm_strategy_kwargs( + llm_strategy_kwargs=llm_strategy_kwargs, llm_config_provided=llm_config_provided + ) From 3924eac7a1c68519df7b70dd64dc2cb066af6ddf Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 11:36:52 +0100 Subject: [PATCH 11/15] Refactor and add tests --- .../tools/experimental/crawl4ai/crawl4ai.py | 17 +++++++---- .../experimental/crawl4ai/test_crawl4ai.py | 29 +++++++++++++++++-- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index d455bea1d6..e1e6dc909e 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -115,12 +115,7 @@ def _validate_llm_strategy_kwargs(llm_strategy_kwargs: Optional[dict[str, Any]], raise ValueError(check_parameters_error_msg) @staticmethod - def _get_crawl_config( # type: ignore[no-any-unimported] - llm_config: dict[str, Any], - instruction: str, - llm_strategy_kwargs: Optional[dict[str, Any]] = None, - extraction_model: Optional[Type[BaseModel]] = None, - ) -> CrawlerRunConfig: + def _get_provider_and_api_key(llm_config: dict[str, Any]) -> tuple[str, str]: if "config_list" not in llm_config: if "model" in llm_config: model = llm_config["model"] @@ -137,6 +132,16 @@ def _get_crawl_config( # type: ignore[no-any-unimported] raise ValueError("llm_config must be a valid config dictionary.") provider = f"{api_type}/{model}" + return provider, api_key # type: 
ignore[return-value]
+
+    @staticmethod
+    def _get_crawl_config(  # type: ignore[no-any-unimported]
+        llm_config: dict[str, Any],
+        instruction: str,
+        llm_strategy_kwargs: Optional[dict[str, Any]] = None,
+        extraction_model: Optional[Type[BaseModel]] = None,
+    ) -> CrawlerRunConfig:
+        provider, api_key = Crawl4AITool._get_provider_and_api_key(llm_config)
 
         if llm_strategy_kwargs is None:
             llm_strategy_kwargs = {}

diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py
index 3a32fb8ba6..6ef1e82dc0 100644
--- a/test/tools/experimental/crawl4ai/test_crawl4ai.py
+++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py
@@ -52,11 +52,36 @@ async def test_without_llm(self) -> None:
         result = await tool_without_llm(url="https://docs.ag2.ai/docs/Home")
         assert isinstance(result, str)
 
-    def test_get_crawl_config(self, mock_credentials: Credentials) -> None:
-        config = Crawl4AITool._get_crawl_config(mock_credentials.llm_config, instruction="dummy")
-        assert isinstance(config, CrawlerRunConfig)
-        assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
+    def test_get_provider_and_api_key(self, mock_credentials: Credentials) -> None:
+        provider, api_key = Crawl4AITool._get_provider_and_api_key(mock_credentials.llm_config)
+        assert provider == "openai/gpt-4o", provider
+        assert isinstance(api_key, str)
+
+    @pytest.mark.parametrize(
+        "use_extraction_model",
+        [
+            False,
+            True,
+        ],
+    )
+    def test_get_crawl_config(self, mock_credentials: Credentials, use_extraction_model: bool) -> None:
+        class Product(BaseModel):
+            name: str
+            price: str
+
+        extraction_model = Product if use_extraction_model else None
+
+        config = Crawl4AITool._get_crawl_config(
+            mock_credentials.llm_config, instruction="dummy", extraction_model=extraction_model
+        )
+        assert isinstance(config, CrawlerRunConfig)
+        assert config.extraction_strategy.provider == f"openai/{mock_credentials.model}"
+
+        if use_extraction_model:
+            assert config.extraction_strategy.schema == Product.model_json_schema()
+        else:
+            assert config.extraction_strategy.schema is None
+
     @pytest.mark.openai
     @pytest.mark.asyncio
     async def test_with_llm(self, credentials_gpt_4o_mini: Credentials) -> None:
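Before the notebook tutorial in PATCH 12, a sketch of driving the refactored tool directly. Tool instances are awaited directly, as the tests above do; the model name and API key are placeholders:

    import asyncio

    from autogen.tools.experimental.crawl4ai import Crawl4AITool

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": "sk-placeholder"}]}

    tool = Crawl4AITool(
        llm_config=llm_config,
        # Extra keyword arguments are forwarded to LLMExtractionStrategy;
        # "provider", "api_token", "schema" and "instruction" are rejected by
        # _validate_llm_strategy_kwargs because the tool sets them itself.
        llm_strategy_kwargs={"chunk_token_threshold": 1000, "apply_chunking": True},
    )

    result = asyncio.run(tool(url="https://docs.ag2.ai/docs/Home", instruction="Summarize the page."))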
From 2cbafb01017a599aaa69c1fd2887344c662e75f7 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 12:34:05 +0100
Subject: [PATCH 12/15] Add initial crawl4ai notebook tutorial

---
 notebook/tools_crawl4ai.ipynb | 247 ++++++++++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)
 create mode 100644 notebook/tools_crawl4ai.ipynb

diff --git a/notebook/tools_crawl4ai.ipynb b/notebook/tools_crawl4ai.ipynb
new file mode 100644
index 0000000000..1d9e8789a7
--- /dev/null
+++ b/notebook/tools_crawl4ai.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Supercharging Web Crawling with Crawl4AI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installation\n",
+    "\n",
+    "To get started with the `crawl4ai` integration in AG2, follow these steps:\n",
+    "\n",
+    "1. Install AG2 with the `crawl4ai` extra:\n",
+    "   ```bash\n",
+    "   pip install ag2[crawl4ai]\n",
+    "   ```\n",
+    "2. Set up Playwright:\n",
+    "   \n",
+    "   ```bash\n",
+    "   playwright install\n",
+    "   ```\n",
+    "\n",
+    "3. For running the code in Jupyter, use `nest_asyncio` to allow nested event loops.\n",
+    "    ```bash\n",
+    "    pip install nest_asyncio\n",
+    "    ```\n",
+    "\n",
+    "\n",
+    "You're all set! Now you can start using crawling features in AG2.\n",
+    "\n",
+    "\n",
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import nest_asyncio\n",
+    "from pydantic import BaseModel\n",
+    "\n",
+    "from autogen import AssistantAgent, UserProxyAgent\n",
+    "from autogen.tools.experimental import Crawl4AITool\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## LLM-Free Crawl4AI\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "crawlai_tool = Crawl4AITool()\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=\"Get info from https://docs.ag2.ai/docs/Home\",\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Crawl4AI with LLM\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set llm_config to Crawl4AITool\n",
+    "crawlai_tool = Crawl4AITool(llm_config=llm_config)\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=\"Get info from https://docs.ag2.ai/docs/Home\",\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Crawl4AI with LLM & Schema for Structured Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_list = [{\"model\": \"gpt-4o-mini\", \"api_key\": os.environ[\"OPENAI_API_KEY\"]}]\n",
+    "\n",
+    "llm_config = {\n",
+    "    \"config_list\": config_list,\n",
+    "}\n",
+    "\n",
+    "user_proxy = UserProxyAgent(name=\"user_proxy\", human_input_mode=\"NEVER\")\n",
+    "assistant = AssistantAgent(name=\"assistant\", llm_config=llm_config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Blog(BaseModel):\n",
+    "    title: str\n",
+    "    url: str\n",
+    "\n",
+    "\n",
+    "# Set llm_config and extraction_model to Crawl4AITool\n",
+    "crawlai_tool = Crawl4AITool(llm_config=llm_config, extraction_model=Blog)\n",
+    "\n",
+    "crawlai_tool.register_for_execution(user_proxy)\n",
+    "crawlai_tool.register_for_llm(assistant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "message = \"Extract all blog posts from https://docs.ag2.ai/blog\"\n",
+    "result = user_proxy.initiate_chat(\n",
+    "    recipient=assistant,\n",
+    "    message=message,\n",
+    "    max_turns=2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "front_matter": {
+   "description": "Supercharging Web Crawling with Crawl4AI",
+   "tags": [
+    "tools",
+    "crawl4ai",
+    "webscraping",
+    "function calling"
+   ]
+  },
+  "kernelspec": {
+   "display_name": ".venv-crawl4ai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From ccd1cc568b73f9eb12c6b6f7c39a20b42cd59998 Mon Sep 17 00:00:00 2001
From: Robert Jambrecic
Date: Wed, 29 Jan 2025 12:39:21 +0100
Subject: [PATCH 13/15] export_module for browser-use and crawl4ai modules

---
 autogen/tools/experimental/browser_use/browser_use.py | 7 +++----
 autogen/tools/experimental/crawl4ai/crawl4ai.py       | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/autogen/tools/experimental/browser_use/browser_use.py b/autogen/tools/experimental/browser_use/browser_use.py
index e83e7c3e83..003d0fc4ea 100644
--- a/autogen/tools/experimental/browser_use/browser_use.py
+++ b/autogen/tools/experimental/browser_use/browser_use.py
@@ -6,6 +6,7 @@
 
 from pydantic import BaseModel
 
+from ....doc_utils import export_module
 from ....import_utils import optional_import_block, require_optional_import
 from ... import Depends, Tool
 from ...dependency_injection import on
@@ -20,8 +21,7 @@
 __all__ = ["BrowserUseResult", "BrowserUseTool"]
 
 
-# todo: add export_module decorator
-# @export_module("autogen.tools.experimental.browser_use")
+@export_module("autogen.tools.experimental.browser_use")
 class BrowserUseResult(BaseModel):
     """The result of using the browser to perform a task.
 
@@ -35,8 +35,7 @@ class BrowserUseResult(BaseModel):
 
 
 @require_optional_import(["langchain_openai", "browser_use"], "browser-use")
-# todo: add export_module decorator
-# @export_module("autogen.tools.experimental")
+@export_module("autogen.tools.experimental")
 class BrowserUseTool(Tool):
     """BrowserUseTool is a tool that uses the browser to perform a task."""

diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py
index e1e6dc909e..79326e4bda 100644
--- a/autogen/tools/experimental/crawl4ai/crawl4ai.py
+++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py
@@ -7,6 +7,7 @@
 
 from pydantic import BaseModel
 
+from ....doc_utils import export_module
 from ....import_utils import optional_import_block
 from ... import Tool
 from ...dependency_injection import Depends, on
@@ -18,6 +19,7 @@
 __all__ = ["Crawl4AITool"]
 
 
+@export_module("autogen.tools.experimental")
 class Crawl4AITool(Tool):
     """
     Crawl a website and extract information using the crawl4ai library.
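PATCH 14 below quotes the BrowserConfig and CrawlerRunConfig annotations. A sketch of the failure mode it guards against, assuming no `from __future__ import annotations`: with the optional-import pattern the crawl4ai names may be undefined at module import time, and unquoted annotations in a `def` are evaluated eagerly:

    from typing import Any, Optional

    # If the optional import was skipped, BrowserConfig does not exist, so
    #   async def helper(cfg: Optional[BrowserConfig] = None) -> Any: ...
    # raises NameError at definition time. A string annotation is only
    # resolved on demand (e.g. via typing.get_type_hints):
    async def helper(cfg: Optional["BrowserConfig"] = None) -> Any:
        ...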
From b660a4309ca9d9850394a261049696925cbdccef Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 12:51:38 +0100 Subject: [PATCH 14/15] Fix tests --- autogen/tools/experimental/crawl4ai/crawl4ai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 79326e4bda..62f6f47925 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -43,8 +43,8 @@ def __init__( async def crawl4ai_helper( # type: ignore[no-any-unimported] url: str, - browser_cfg: Optional[BrowserConfig] = None, - crawl_config: Optional[CrawlerRunConfig] = None, + browser_cfg: Optional["BrowserConfig"] = None, + crawl_config: Optional["CrawlerRunConfig"] = None, ) -> Any: async with AsyncWebCrawler(config=browser_cfg) as crawler: result = await crawler.arun( @@ -142,7 +142,7 @@ def _get_crawl_config( # type: ignore[no-any-unimported] instruction: str, llm_strategy_kwargs: Optional[dict[str, Any]] = None, extraction_model: Optional[Type[BaseModel]] = None, - ) -> CrawlerRunConfig: + ) -> "CrawlerRunConfig": provider, api_key = Crawl4AITool._get_provider_and_api_key(llm_config) if llm_strategy_kwargs is None: From 97337940db83d0add17018fd7cf2e5f6b1e591c3 Mon Sep 17 00:00:00 2001 From: Robert Jambrecic Date: Wed, 29 Jan 2025 13:22:40 +0100 Subject: [PATCH 15/15] Cleanup --- autogen/tools/__init__.py | 2 +- autogen/tools/experimental/crawl4ai/crawl4ai.py | 3 ++- .../experimental/browser_use/test_browser_use.py | 11 +---------- test/tools/experimental/crawl4ai/test_crawl4ai.py | 10 +--------- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/autogen/tools/__init__.py b/autogen/tools/__init__.py index 04542b4d3b..ecb29373ff 100644 --- a/autogen/tools/__init__.py +++ b/autogen/tools/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai +# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors # # SPDX-License-Identifier: Apache-2.0 diff --git a/autogen/tools/experimental/crawl4ai/crawl4ai.py b/autogen/tools/experimental/crawl4ai/crawl4ai.py index 62f6f47925..f616613eb7 100644 --- a/autogen/tools/experimental/crawl4ai/crawl4ai.py +++ b/autogen/tools/experimental/crawl4ai/crawl4ai.py @@ -8,7 +8,7 @@ from pydantic import BaseModel from ....doc_utils import export_module -from ....import_utils import optional_import_block +from ....import_utils import optional_import_block, require_optional_import from ... 
import Tool from ...dependency_injection import Depends, on @@ -19,6 +19,7 @@ __all__ = ["Crawl4AITool"] +@require_optional_import(["crawl4ai"], "crawl4ai") @export_module("autogen.tools.experimental") class Crawl4AITool(Tool): """ diff --git a/test/tools/experimental/browser_use/test_browser_use.py b/test/tools/experimental/browser_use/test_browser_use.py index 0426b6c31e..7d194c6464 100644 --- a/test/tools/experimental/browser_use/test_browser_use.py +++ b/test/tools/experimental/browser_use/test_browser_use.py @@ -8,23 +8,14 @@ import pytest from autogen import AssistantAgent, UserProxyAgent -from autogen.import_utils import optional_import_block, skip_on_missing_imports +from autogen.import_utils import skip_on_missing_imports from autogen.tools.experimental.browser_use import BrowserUseResult, BrowserUseTool from ....conftest import Credentials, credentials_browser_use -with optional_import_block(): - from browser_use import Agent - from langchain_openai import ChatOpenAI - -@pytest.mark.browser_use # todo: remove me after we merge the PR that ads it automatically @skip_on_missing_imports(["langchain_openai", "browser_use"], "browser-use") class TestBrowserUseToolOpenai: - def _use_imports(self) -> None: - self._ChatOpenAI = ChatOpenAI - self._Agent = Agent - def test_broser_use_tool_init(self, mock_credentials: Credentials) -> None: browser_use_tool = BrowserUseTool(llm_config=mock_credentials.llm_config) assert browser_use_tool.name == "browser_use" diff --git a/test/tools/experimental/crawl4ai/test_crawl4ai.py b/test/tools/experimental/crawl4ai/test_crawl4ai.py index 6ef1e82dc0..befb757fbe 100644 --- a/test/tools/experimental/crawl4ai/test_crawl4ai.py +++ b/test/tools/experimental/crawl4ai/test_crawl4ai.py @@ -13,19 +13,11 @@ from ....conftest import Credentials with optional_import_block(): - from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig - from crawl4ai.extraction_strategy import LLMExtractionStrategy + from crawl4ai import CrawlerRunConfig @skip_on_missing_imports(["crawl4ai"], "crawl4ai") class TestCrawl4AITool: - def _use_imports(self) -> None: - self._AsyncWebCrawler = AsyncWebCrawler - self._BrowserConfig = BrowserConfig - self._CrawlerRunConfig = CrawlerRunConfig - self._CacheMode = CacheMode - self._LLMExtractionStrategy = LLMExtractionStrategy - @pytest.mark.asyncio async def test_without_llm(self) -> None: tool_without_llm = Crawl4AITool()
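Taken together, the series leaves the tool with the API sketched below. The values mirror the tests and the notebook; the model name and API key are placeholders:

    import asyncio

    from pydantic import BaseModel

    from autogen.tools.experimental import Crawl4AITool

    class Product(BaseModel):
        name: str
        price: str

    llm_config = {"config_list": [{"model": "gpt-4o-mini", "api_key": "sk-placeholder"}]}

    # llm_config is optional: without it the tool returns the page as markdown;
    # with it, crawl4ai's LLMExtractionStrategy extracts content, optionally
    # shaped by a Pydantic extraction_model.
    tool = Crawl4AITool(llm_config=llm_config, extraction_model=Product)

    result = asyncio.run(
        tool(
            url="https://www.ikea.com/gb/en/",
            instruction="Extract all product objects with 'name' and 'price' from the content.",
        )
    )
    print(result)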