src/apify/scrapy/requests.py (14 additions, 1 deletion)
@@ -10,6 +10,7 @@
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
+from crawlee._request import UserData
 from crawlee._types import HttpHeaders
 
 from apify import Request as ApifyRequest
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
     if scrapy_request.meta.get('apify_request_id'):
         request_kwargs['id'] = scrapy_request.meta['apify_request_id']
 
-    request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
+    user_data = scrapy_request.meta.get('userData', {})
+
+    # Convert the UserData Pydantic model to a plain dict to prevent CrawleeRequestData
+    # objects from leaking into Request.from_url() during Scrapy-Apify roundtrips.
+    if isinstance(user_data, UserData):
+        user_data = user_data.model_dump(by_alias=True)
+
+    # Remove internal Crawlee data: it is managed by Request.from_url(), and values left
+    # over from previous roundtrips cause incorrect state.
+    if isinstance(user_data, dict):
+        user_data.pop('__crawlee', None)
+
+    request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
 
     # Convert Scrapy's headers to HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
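For context, here is the normalization this hunk performs, as a minimal standalone sketch. StubUserData and normalize_user_data are hypothetical stand-ins for illustration only; crawlee's actual UserData is assumed to be a Pydantic model that allows extra fields, so propagated keys such as '__crawlee' land in __pydantic_extra__:

from pydantic import BaseModel, ConfigDict


class StubUserData(BaseModel):
    """Hypothetical stand-in for crawlee's UserData; the real model has more fields."""

    model_config = ConfigDict(extra='allow')  # unknown keys land in __pydantic_extra__


def normalize_user_data(user_data: object) -> dict:
    """Coerce meta['userData'] to a plain dict and drop internal Crawlee state."""
    if isinstance(user_data, StubUserData):
        # model_dump(by_alias=True) serializes the extra fields too, so nested model
        # instances become plain data instead of leaking through the roundtrip.
        user_data = user_data.model_dump(by_alias=True)
    if isinstance(user_data, dict):
        # '__crawlee' is rebuilt by Request.from_url(); a stale copy from a previous
        # roundtrip would carry incorrect state.
        user_data.pop('__crawlee', None)
        return user_data
    return {}


propagated = StubUserData(**{'label': 'detail', '__crawlee': {'state': 'stale'}})
assert normalize_user_data(propagated) == {'label': 'detail'}

Note that model_dump() returns a fresh dict, so popping '__crawlee' from it leaves the original model, and any request still holding it, untouched.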
tests/unit/scrapy/requests/test_to_apify_request.py (35 additions, 1 deletion)
@@ -6,7 +6,7 @@

 from crawlee._types import HttpHeaders
 
-from apify.scrapy.requests import to_apify_request
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
 
 
 class DummySpider(Spider):
@@ -90,3 +90,37 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:

     apify_request = to_apify_request(scrapy_request, spider)  # ty: ignore[invalid-argument-type]
     assert apify_request is None
+
+
+def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
+    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.
+
+    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
+    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
+    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
+    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
+    """
+    # Step 1: Initial request -> first roundtrip
+    initial_scrapy_request = Request(url='https://example.com/page')
+    apify_request_1 = to_apify_request(initial_scrapy_request, spider)
+    assert apify_request_1 is not None
+    scrapy_request_1 = to_scrapy_request(apify_request_1, spider)
+
+    # Step 2: Spider yields a follow-up with propagated userData -> second roundtrip
+    follow_up_1 = Request(
+        url='https://example.com/page2',
+        meta={'userData': scrapy_request_1.meta['userData']},
+    )
+    apify_request_2 = to_apify_request(follow_up_1, spider)
+    assert apify_request_2 is not None
+    scrapy_request_2 = to_scrapy_request(apify_request_2, spider)
+
+    # Step 3: Spider yields another follow-up with userData propagated from the second roundtrip.
+    # This fails because userData now has __crawlee as a CrawleeRequestData in __pydantic_extra__.
+    follow_up_2 = Request(
+        url='https://example.com/image.png',
+        meta={'userData': scrapy_request_2.meta['userData']},
+    )
+    follow_up_apify_request = to_apify_request(follow_up_2, spider)
+    assert follow_up_apify_request is not None
+    assert follow_up_apify_request.url == 'https://example.com/image.png'
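The exception quoted in the test's docstring is Python's generic double-star unpacking error, reduced here to a minimal sketch; StandIn and rebuild are hypothetical stand-ins for a non-mapping value (such as a CrawleeRequestData instance) and the call that unpacks it:

class StandIn:
    """Hypothetical stand-in for a non-mapping value, e.g. a Pydantic model instance."""


def rebuild(**kwargs: object) -> dict:
    return dict(kwargs)


try:
    rebuild(**StandIn())  # type: ignore[arg-type]
except TypeError as exc:
    print(exc)  # rebuild() argument after ** must be a mapping, not StandIn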