diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
index 5840ae35..dd40482d 100644
--- a/src/apify/scrapy/requests.py
+++ b/src/apify/scrapy/requests.py
@@ -10,6 +10,7 @@
 from scrapy.http.headers import Headers
 from scrapy.utils.request import request_from_dict
 
+from crawlee._request import UserData
 from crawlee._types import HttpHeaders
 
 from apify import Request as ApifyRequest
@@ -52,7 +53,19 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
     if scrapy_request.meta.get('apify_request_id'):
         request_kwargs['id'] = scrapy_request.meta['apify_request_id']
 
-    request_kwargs['user_data'] = scrapy_request.meta.get('userData', {})
+    user_data = scrapy_request.meta.get('userData', {})
+
+    # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects
+    # from leaking into Request.from_url() during Scrapy-Apify roundtrips.
+    if isinstance(user_data, UserData):
+        user_data = user_data.model_dump(by_alias=True)
+
+    # Remove internal Crawlee data since it's managed by Request.from_url() and values
+    # from previous roundtrips cause incorrect state.
+    if isinstance(user_data, dict):
+        user_data.pop('__crawlee', None)
+
+    request_kwargs['user_data'] = user_data if isinstance(user_data, dict) else {}
 
     # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py
index ea43278a..d3aef6b2 100644
--- a/tests/unit/scrapy/requests/test_to_apify_request.py
+++ b/tests/unit/scrapy/requests/test_to_apify_request.py
@@ -6,7 +6,7 @@
 
 from crawlee._types import HttpHeaders
 
-from apify.scrapy.requests import to_apify_request
+from apify.scrapy.requests import to_apify_request, to_scrapy_request
 
 
 class DummySpider(Spider):
@@ -90,3 +90,37 @@ def test_invalid_scrapy_request_returns_none(spider: Spider) -> None:
     apify_request = to_apify_request(scrapy_request, spider)  # ty: ignore[invalid-argument-type]
 
     assert apify_request is None
+
+
+def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> None:
+    """Reproduce: CrawleeRequestData() argument after ** must be a mapping, not CrawleeRequestData.
+
+    After two roundtrips through to_apify_request/to_scrapy_request with userData propagation,
+    Request.from_url() writes a CrawleeRequestData object into UserData.__pydantic_extra__['__crawlee'].
+    On the next roundtrip, this CrawleeRequestData object is found by user_data_dict.get('__crawlee')
+    and passed to CrawleeRequestData(**obj), which fails because CrawleeRequestData is not a mapping.
+    """
+    # Step 1: Initial request -> first roundtrip
+    initial_scrapy_request = Request(url='https://example.com/page')
+    apify_request_1 = to_apify_request(initial_scrapy_request, spider)
+    assert apify_request_1 is not None
+    scrapy_request_1 = to_scrapy_request(apify_request_1, spider)
+
+    # Step 2: Spider yields follow-up with propagated userData -> second roundtrip
+    follow_up_1 = Request(
+        url='https://example.com/page2',
+        meta={'userData': scrapy_request_1.meta['userData']},
+    )
+    apify_request_2 = to_apify_request(follow_up_1, spider)
+    assert apify_request_2 is not None
+    scrapy_request_2 = to_scrapy_request(apify_request_2, spider)
+
+    # Step 3: Spider yields another follow-up with propagated userData from 2nd roundtrip.
+    # This fails because userData now has __crawlee as CrawleeRequestData in __pydantic_extra__.
+    follow_up_2 = Request(
+        url='https://example.com/image.png',
+        meta={'userData': scrapy_request_2.meta['userData']},
+    )
+    follow_up_apify_request = to_apify_request(follow_up_2, spider)
+    assert follow_up_apify_request is not None
+    assert follow_up_apify_request.url == 'https://example.com/image.png'