Commit adf5fb2

vdusek and claude committed

fix: migrate to Scrapy's native AsyncCrawlerRunner

Adopt Scrapy 2.14's `AsyncCrawlerRunner` to eliminate the Deferred conversion layer (`deferred_to_future`). The `run_scrapy_actor` function now handles asyncio reactor installation internally, removing boilerplate from user code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

1 parent ea1f52b · commit adf5fb2

15 files changed: +40 −62 lines changed

docs/03_guides/06_scrapy.mdx
Lines changed: 3 additions & 3 deletions

```diff
@@ -17,13 +17,13 @@ import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'
 
 ## Integrating Scrapy with the Apify platform
 
-The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install Twisted's `asyncioreactor` to run Twisted's asyncio-compatible event loop. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications.
+The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install Twisted's `asyncioreactor` to run Twisted's asyncio-compatible event loop. The `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications.
 
 <CodeBlock className="language-python" title="__main.py__: The Actor entry point ">
 {UnderscoreMainExample}
 </CodeBlock>
 
-In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. The `apify.scrapy.run_scrapy_actor` bridges asyncio coroutines with Twisted's reactor, enabling the Actor's main coroutine, which contains the Scrapy spider, to be executed.
+In this setup, `apify.scrapy.initialize_logging` configures an Apify log formatter and reconfigures loggers to ensure consistent logging across Scrapy, the Apify SDK, and other libraries. The `apify.scrapy.run_scrapy_actor` installs Twisted's asyncio-compatible reactor and bridges asyncio coroutines with Twisted's reactor, enabling the Actor's main coroutine, which contains the Scrapy spider, to be executed.
 
 Make sure the `SCRAPY_SETTINGS_MODULE` environment variable is set to the path of the Scrapy settings module. This variable is also used by the `Actor` class to detect that the project is a Scrapy project, triggering additional actions.
 
@@ -47,7 +47,7 @@ Additional helper functions in the [`apify.scrapy`](https://github.com/apify/api
 - `apply_apify_settings` - Applies Apify-specific components to Scrapy settings.
 - `to_apify_request` and `to_scrapy_request` - Convert between Apify and Scrapy request objects.
 - `initialize_logging` - Configures logging for the Actor environment.
-- `run_scrapy_actor` - Bridges asyncio and Twisted event loops.
+- `run_scrapy_actor` - Installs Twisted's asyncio reactor and bridges asyncio and Twisted event loops.
 
 ## Create a new Apify-Scrapy project
```
docs/03_guides/code/scrapy_project/src/__main__.py
Lines changed: 0 additions & 6 deletions

```diff
@@ -1,11 +1,5 @@
 from __future__ import annotations
 
-from scrapy.utils.reactor import install_reactor
-
-# Install Twisted's asyncio reactor before importing any other Twisted or
-# Scrapy components.
-install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
-
 import os
 
 from apify.scrapy import initialize_logging, run_scrapy_actor
```
docs/03_guides/code/scrapy_project/src/main.py
Lines changed: 5 additions & 6 deletions

```diff
@@ -1,8 +1,8 @@
 from __future__ import annotations
+
 import asyncio
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
 
 from apify import Actor
 from apify.scrapy import apply_apify_settings
@@ -23,14 +23,13 @@ async def main() -> None:
         # Apply Apify settings, which will override the Scrapy project settings.
         settings = apply_apify_settings(proxy_config=proxy_config)
 
-        # Create CrawlerRunner and execute the Scrapy spider.
-        crawler_runner = CrawlerRunner(settings)
-        crawl_deferred = crawler_runner.crawl(
+        # Create AsyncCrawlerRunner and execute the Scrapy spider.
+        crawler_runner = AsyncCrawlerRunner(settings)
+        await crawler_runner.crawl(
             Spider,
             start_urls=start_urls,
             allowed_domains=allowed_domains,
         )
-        await deferred_to_future(crawl_deferred)
 
 
 if __name__ == '__main__':
```
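The API change this diff captures: `CrawlerRunner.crawl()` returns a Twisted Deferred, which asyncio code could only await after a `deferred_to_future()` conversion, while `AsyncCrawlerRunner.crawl()` is a coroutine that is awaited directly. A rough stdlib sketch of that difference, in which an asyncio `Future` stands in for the Deferred and both runner classes are simplified stand-ins, not the Scrapy API:

```python
import asyncio

class OldRunner:
    # Stand-in for CrawlerRunner: crawl() returns a Deferred-like object
    # (modeled here by an already-resolved asyncio Future) that previously
    # needed a deferred_to_future() conversion step before awaiting.
    def crawl(self) -> asyncio.Future:
        fut = asyncio.get_running_loop().create_future()
        fut.set_result('crawled')
        return fut

class NewRunner:
    # Stand-in for AsyncCrawlerRunner: crawl() is a coroutine, so callers
    # simply `await runner.crawl(...)` with no conversion layer.
    async def crawl(self) -> str:
        return 'crawled'

async def demo() -> tuple[str, str]:
    old = await OldRunner().crawl()  # old style: await the converted object
    new = await NewRunner().crawl()  # new style: await crawl() directly
    return old, new

print(asyncio.run(demo()))  # ('crawled', 'crawled')
```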

pyproject.toml
Lines changed: 1 addition & 5 deletions

```diff
@@ -50,7 +50,7 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-scrapy = ["scrapy>=2.11.0"]
+scrapy = ["scrapy>=2.14.0"]
 
 [project.urls]
 "Apify Homepage" = "https://apify.com"
@@ -161,10 +161,6 @@ indent-style = "space"
     "PLR2004", # Magic value used in comparison, consider replacing `{value}` with a constant variable
     "PLW0603", # Using the global statement to update `{name}` is discouraged
 ]
-"**/docs/**/scrapy_project/**/__main__.py" = [
-    # Because of asyncioreactor.install() call.
-    "E402", # Module level import not at top of file
-]
 "**/docs/**/scrapy_project/**" = [
     # Local imports are mixed up with the Apify SDK.
     "I001", # Import block is un-sorted or un-formatted
```
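Note the specifier change as well as the version bump: per PEP 440, a compatible-release pin like `scrapy~=2.12.0` means `>=2.12.0, ==2.12.*`, so it would reject Scrapy 2.14 entirely; the new `scrapy>=2.14.0` accepts 2.14 and later. A rough model of the `~=` semantics for three-part versions (simplified, not a PEP 440 implementation):

```python
def compatible_release(version: tuple[int, int, int], spec: tuple[int, int, int]) -> bool:
    # Rough model of PEP 440 "compatible release" (~=) for X.Y.Z versions:
    # "~=2.12.0" is equivalent to ">=2.12.0, ==2.12.*".
    return version >= spec and version[:2] == spec[:2]

# Scrapy 2.14.0 does NOT satisfy the old pin "scrapy~=2.12.0"...
print(compatible_release((2, 14, 0), (2, 12, 0)))  # False
# ...while any 2.12.x patch release does.
print(compatible_release((2, 12, 5), (2, 12, 0)))  # True
```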

src/apify/scrapy/_actor_runner.py
Lines changed: 11 additions & 13 deletions

```diff
@@ -3,24 +3,22 @@
 import asyncio
 from typing import TYPE_CHECKING
 
-from twisted.internet.defer import Deferred, ensureDeferred
-from twisted.internet.task import react
-
 if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 
-async def _run_coro_as_deferred(coro: Coroutine) -> None:
-    """Wrap the given asyncio coroutine in a Task and await its result as a Twisted Deferred."""
-    task = asyncio.ensure_future(coro)
-    await Deferred.fromFuture(task)
-
-
 def run_scrapy_actor(coro: Coroutine) -> None:
     """Start Twisted's reactor and execute the provided Actor coroutine.
 
-    This function initiates the Twisted reactor and runs the given asyncio coroutine (typically the
-    Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted event loops,
-    enabling the Apify and Scrapy integration to work together.
+    This function installs Twisted's asyncio-compatible reactor, then initiates it and runs the given asyncio
+    coroutine (typically the Actor's main) by converting it to a Deferred. This bridges the asyncio and Twisted
+    event loops, enabling the Apify and Scrapy integration to work together.
     """
-    react(lambda _: ensureDeferred(_run_coro_as_deferred(coro)))
+    from scrapy.utils.reactor import install_reactor  # noqa: PLC0415
+
+    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
+
+    from twisted.internet.defer import Deferred  # noqa: PLC0415
+    from twisted.internet.task import react  # noqa: PLC0415
+
+    react(lambda _reactor: Deferred.fromFuture(asyncio.ensure_future(coro)))
```
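The core of the new body is `Deferred.fromFuture(asyncio.ensure_future(coro))`: the coroutine is wrapped in an asyncio Task, and the reactor (which now runs on the same asyncio loop) waits on it via a Deferred. A Twisted-free analogue of that wrap-and-drive pattern, using only the stdlib (the helper name and the stand-in coroutine are illustrative, not the SDK's API):

```python
import asyncio

async def actor_main() -> str:
    # Stand-in for the Actor's main coroutine passed to run_scrapy_actor.
    await asyncio.sleep(0)
    return 'done'

def run_actor_coroutine(coro):
    # Twisted-free analogue of run_scrapy_actor's body: wrap the coroutine
    # in a Task on the event loop and drive it to completion. In the real
    # helper, react() starts the asyncio-backed reactor and
    # Deferred.fromFuture(asyncio.ensure_future(coro)) does the wrapping.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(asyncio.ensure_future(coro, loop=loop))
    finally:
        loop.close()

print(run_actor_coroutine(actor_main()))  # done
```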

tests/e2e/test_actor_scrapy.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -28,7 +28,7 @@ async def test_actor_scrapy_title_spider(
     actor = await make_actor(
         'actor-scrapy-title-spider',
         source_files=actor_source_files,
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(
         actor,
```

tests/e2e/test_scrapy/actor_source/__main__.py
Lines changed: 3 additions & 8 deletions

```diff
@@ -1,14 +1,9 @@
 from __future__ import annotations
 
-from scrapy.utils.reactor import install_reactor
+import os
 
-install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
-
-import os  # noqa: E402, I001
-
-from apify.scrapy import initialize_logging, run_scrapy_actor  # noqa: E402
-
-from .main import main  # noqa: E402
+from .main import main
+from apify.scrapy import initialize_logging, run_scrapy_actor
 
 os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
 
```
Lines changed: 5 additions & 7 deletions

```diff
@@ -1,16 +1,14 @@
-from __future__ import annotations  # noqa: I001
+from __future__ import annotations
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
 
+from .spiders import Spider  # ty: ignore[unresolved-import]
 from apify import Actor
 from apify.scrapy import apply_apify_settings
 
-from .spiders import Spider  # ty: ignore[unresolved-import]
-
 
 async def main() -> None:
     async with Actor:
         settings = apply_apify_settings()
-        runner = CrawlerRunner(settings)
-        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
+        runner = AsyncCrawlerRunner(settings)
+        await runner.crawl(Spider, start_urls=['http://localhost:8080/'])
```
Lines changed: 5 additions & 7 deletions

```diff
@@ -1,19 +1,17 @@
-from __future__ import annotations  # noqa: I001
+from __future__ import annotations
 
 import os
 
-from scrapy.crawler import CrawlerRunner
-from scrapy.utils.defer import deferred_to_future
+from scrapy.crawler import AsyncCrawlerRunner
 
+from .spiders import Spider  # ty: ignore[unresolved-import]
 from apify import Actor
 from apify.scrapy import apply_apify_settings
 
-from .spiders import Spider  # ty: ignore[unresolved-import]
-
 
 async def main() -> None:
     async with Actor:
         os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
         settings = apply_apify_settings()
-        runner = CrawlerRunner(settings)
-        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
+        runner = AsyncCrawlerRunner(settings)
+        await runner.crawl(Spider, start_urls=['http://localhost:8080/'])
```

tests/e2e/test_scrapy/test_basic_spider.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ async def test_basic_spider(make_actor: MakeActorFunction, run_actor: RunActorFu
     actor = await make_actor(
         label='scrapy-basic',
         source_files=get_scrapy_source_files('spider_basic.py', 'BasicSpider'),
-        additional_requirements=['scrapy~=2.12.0'],
+        additional_requirements=['scrapy>=2.14.0'],
     )
     run_result = await run_actor(actor)
     await verify_spider_results(actor, run_result)
```
