Skip to content

Commit 662b93b

Browse files
authored
fix: Update push_data and user_data annotation with JsonSerializable instead of Any (#1889)
### Description

- Improved annotation for arguments that accept JSON data by replacing implicit `Any` with explicit `JsonSerializable` type for `push_data` and `user_data` parameters.

### Issues

- Closes: #1191
1 parent b3b8c59 commit 662b93b

20 files changed

Lines changed: 107 additions & 78 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ dependencies = [
4545
"pydantic>=2.11.0",
4646
"pyee>=9.0.0",
4747
"tldextract>=5.1.0",
48-
"typing-extensions>=4.1.0",
48+
"typing-extensions>=4.10.0",
4949
"yarl>=1.18.0",
5050
]
5151

src/crawlee/_request.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from collections.abc import Iterator, MutableMapping
3+
from collections.abc import Iterator, Mapping, MutableMapping
44
from datetime import datetime
55
from enum import IntEnum
66
from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast
@@ -135,7 +135,7 @@ class RequestOptions(TypedDict):
135135
keep_url_fragment: NotRequired[bool]
136136
use_extended_unique_key: NotRequired[bool]
137137
always_enqueue: NotRequired[bool]
138-
user_data: NotRequired[dict[str, JsonSerializable]]
138+
user_data: NotRequired[Mapping[str, JsonSerializable]]
139139
no_retry: NotRequired[bool]
140140
enqueue_strategy: NotRequired[EnqueueStrategy]
141141
max_retries: NotRequired[int | None]
@@ -200,7 +200,7 @@ class Request(BaseModel):
200200
headers: HttpHeaders = HttpHeaders()
201201
"""HTTP request headers."""
202202

203-
user_data: dict[str, JsonSerializable] = {}
203+
user_data: MutableMapping[str, JsonSerializable] = {}
204204
"""Custom user data assigned to the request. Use this to save any request related data to the
205205
request's scope, keeping them accessible on retries, failures etc.
206206
"""
@@ -209,8 +209,9 @@ class Request(BaseModel):
209209
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
210210
"""HTTP request headers."""
211211

212+
# Internally, the model contains `UserData`, this is just for convenience
212213
user_data: Annotated[
213-
dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
214+
MutableMapping[str, JsonSerializable],
214215
Field(alias='userData', default_factory=UserData),
215216
PlainValidator(user_data_adapter.validate_python),
216217
PlainSerializer(

src/crawlee/_types.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import json
1515
import logging
1616
import re
17-
from collections.abc import Awaitable, Coroutine, Sequence
17+
from collections.abc import Awaitable, Coroutine, MutableMapping, Sequence
1818

1919
from typing_extensions import NotRequired, Required, Self, Unpack
2020

@@ -27,9 +27,7 @@
2727
from crawlee.storage_clients import StorageClient
2828
from crawlee.storages import KeyValueStore
2929

30-
# Workaround for https://github.com/pydantic/pydantic/issues/9445
31-
J = TypeVar('J', bound='JsonSerializable')
32-
JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None
30+
JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None
3331
else:
3432
from pydantic import JsonValue as JsonSerializable
3533

@@ -198,7 +196,7 @@ class PushDataKwargs(TypedDict):
198196

199197

200198
class PushDataFunctionCall(PushDataKwargs):
201-
data: list[dict[str, Any]] | dict[str, Any]
199+
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]
202200
dataset_id: str | None
203201
dataset_name: str | None
204202
dataset_alias: str | None
@@ -300,7 +298,7 @@ async def add_requests(
300298

301299
async def push_data(
302300
self,
303-
data: list[dict[str, Any]] | dict[str, Any],
301+
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
304302
dataset_id: str | None = None,
305303
dataset_name: str | None = None,
306304
dataset_alias: str | None = None,
@@ -392,7 +390,7 @@ def __call__(
392390
selector: str | None = None,
393391
attribute: str | None = None,
394392
label: str | None = None,
395-
user_data: dict[str, Any] | None = None,
393+
user_data: Mapping[str, JsonSerializable] | None = None,
396394
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
397395
rq_id: str | None = None,
398396
rq_name: str | None = None,
@@ -417,7 +415,7 @@ def __call__(
417415
selector: str | None = None,
418416
attribute: str | None = None,
419417
label: str | None = None,
420-
user_data: dict[str, Any] | None = None,
418+
user_data: Mapping[str, JsonSerializable] | None = None,
421419
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
422420
requests: Sequence[str | Request] | None = None,
423421
rq_id: str | None = None,
@@ -465,7 +463,7 @@ def __call__(
465463
selector: str = 'a',
466464
attribute: str = 'href',
467465
label: str | None = None,
468-
user_data: dict[str, Any] | None = None,
466+
user_data: Mapping[str, JsonSerializable] | None = None,
469467
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
470468
**kwargs: Unpack[EnqueueLinksKwargs],
471469
) -> Coroutine[None, None, list[Request]]:
@@ -543,7 +541,7 @@ class PushDataFunction(Protocol):
543541

544542
def __call__(
545543
self,
546-
data: list[dict[str, Any]] | dict[str, Any],
544+
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
547545
dataset_id: str | None = None,
548546
dataset_name: str | None = None,
549547
dataset_alias: str | None = None,
@@ -616,8 +614,8 @@ class UseStateFunction(Protocol):
616614

617615
def __call__(
618616
self,
619-
default_value: dict[str, JsonSerializable] | None = None,
620-
) -> Coroutine[None, None, dict[str, JsonSerializable]]:
617+
default_value: MutableMapping[str, JsonSerializable] | None = None,
618+
) -> Coroutine[None, None, MutableMapping[str, JsonSerializable]]:
621619
"""Call dunder method.
622620
623621
Args:

src/crawlee/_utils/file.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
from typing import TYPE_CHECKING, overload
1111

1212
if TYPE_CHECKING:
13-
from collections.abc import AsyncIterator
13+
from collections.abc import AsyncIterator, Mapping
1414
from typing import Any, TextIO
1515

1616
from typing_extensions import Unpack
1717

18-
from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
18+
from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable
1919

2020
if sys.platform == 'win32':
2121

@@ -150,7 +150,7 @@ async def atomic_write(
150150

151151

152152
async def export_json_to_stream(
153-
iterator: AsyncIterator[dict[str, Any]],
153+
iterator: AsyncIterator[Mapping[str, JsonSerializable]],
154154
dst: TextIO,
155155
**kwargs: Unpack[ExportDataJsonKwargs],
156156
) -> None:
@@ -159,7 +159,7 @@ async def export_json_to_stream(
159159

160160

161161
async def export_csv_to_stream(
162-
iterator: AsyncIterator[dict[str, Any]],
162+
iterator: AsyncIterator[Mapping[str, JsonSerializable]],
163163
dst: TextIO,
164164
**kwargs: Unpack[ExportDataCsvKwargs],
165165
) -> None:

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import logging
55
from abc import ABC
66
from datetime import timedelta
7-
from typing import TYPE_CHECKING, Any, Generic
7+
from typing import TYPE_CHECKING, Generic
88

99
from more_itertools import partition
1010
from pydantic import ValidationError
@@ -21,12 +21,12 @@
2121
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult
2222

2323
if TYPE_CHECKING:
24-
from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator
24+
from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping
2525

2626
from typing_extensions import Unpack
2727

2828
from crawlee import RequestTransformAction
29-
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
29+
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction, JsonSerializable
3030

3131
from ._abstract_http_parser import AbstractHttpParser
3232

@@ -200,7 +200,7 @@ async def extract_links(
200200
selector: str = 'a',
201201
attribute: str = 'href',
202202
label: str | None = None,
203-
user_data: dict[str, Any] | None = None,
203+
user_data: Mapping[str, JsonSerializable] | None = None,
204204
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
205205
| None = None,
206206
**kwargs: Unpack[EnqueueLinksKwargs],

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from ._result_comparator import create_default_comparator
4343

4444
if TYPE_CHECKING:
45+
from collections.abc import MutableMapping
4546
from types import TracebackType
4647

4748
from typing_extensions import Unpack
@@ -286,7 +287,7 @@ async def _crawl_one(
286287
self,
287288
rendering_type: RenderingType,
288289
context: BasicCrawlingContext,
289-
state: dict[str, JsonSerializable] | None = None,
290+
state: MutableMapping[str, JsonSerializable] | None = None,
290291
) -> SubCrawlerRun:
291292
"""Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`.
292293
@@ -297,8 +298,8 @@ async def _crawl_one(
297298
if state is not None:
298299

299300
async def get_input_state(
300-
default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs.
301-
) -> dict[str, JsonSerializable]:
301+
default_value: MutableMapping[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs.
302+
) -> MutableMapping[str, JsonSerializable]:
302303
return state
303304

304305
use_state_function = get_input_state
@@ -411,8 +412,10 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
411412
# avoid static crawl to modify the state.
412413
# (This static crawl is performed only to evaluate rendering type detection.)
413414
kvs = await context.get_key_value_store()
414-
default_value = dict[str, JsonSerializable]()
415-
old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value)
415+
default_value: MutableMapping[str, JsonSerializable] = {}
416+
old_state: MutableMapping[str, JsonSerializable] = await kvs.get_value(
417+
self._CRAWLEE_STATE_KEY, default_value
418+
)
416419
old_state_copy = deepcopy(old_state)
417420

418421
pw_run = await self._crawl_one('client only', context=context)

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080

8181
if TYPE_CHECKING:
8282
import re
83-
from collections.abc import Iterator
83+
from collections.abc import Iterator, Mapping, MutableMapping
8484
from contextlib import AbstractAsyncContextManager
8585

8686
from crawlee._types import (
@@ -856,8 +856,8 @@ async def add_requests(
856856

857857
async def use_state(
858858
self,
859-
default_value: dict[str, JsonSerializable] | None = None,
860-
) -> dict[str, JsonSerializable]:
859+
default_value: MutableMapping[str, JsonSerializable] | None = None,
860+
) -> MutableMapping[str, JsonSerializable]:
861861
kvs = await self.get_key_value_store()
862862
return await kvs.get_auto_saved_value(f'{self._CRAWLEE_STATE_KEY}_{self._id}', default_value)
863863

@@ -941,7 +941,7 @@ async def export_data(
941941

942942
async def _push_data(
943943
self,
944-
data: list[dict[str, Any]] | dict[str, Any],
944+
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
945945
dataset_id: str | None = None,
946946
dataset_name: str | None = None,
947947
dataset_alias: str | None = None,
@@ -1015,7 +1015,7 @@ async def enqueue_links(
10151015
selector: str | None = None,
10161016
attribute: str | None = None,
10171017
label: str | None = None,
1018-
user_data: dict[str, Any] | None = None,
1018+
user_data: Mapping[str, JsonSerializable] | None = None,
10191019
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
10201020
| None = None,
10211021
requests: Sequence[str | Request] | None = None,

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
HttpHeaders,
5151
HttpMethod,
5252
HttpPayload,
53+
JsonSerializable,
5354
)
5455
from crawlee.browsers._types import BrowserType
5556

@@ -448,7 +449,7 @@ async def extract_links(
448449
selector: str = 'a',
449450
attribute: str = 'href',
450451
label: str | None = None,
451-
user_data: dict | None = None,
452+
user_data: Mapping[str, JsonSerializable] | None = None,
452453
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
453454
| None = None,
454455
**kwargs: Unpack[EnqueueLinksKwargs],

src/crawlee/sessions/_models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from collections.abc import MutableMapping
34
from datetime import datetime, timedelta
45
from typing import Annotated, Any
56

@@ -13,6 +14,8 @@
1314
computed_field,
1415
)
1516

17+
from crawlee._types import JsonSerializable
18+
1619
from ._cookies import CookieParam
1720
from ._session import Session
1821

@@ -24,7 +27,7 @@ class SessionModel(BaseModel):
2427

2528
id: Annotated[str, Field(alias='id')]
2629
max_age: Annotated[timedelta, Field(alias='maxAge')]
27-
user_data: Annotated[dict, Field(alias='userData')]
30+
user_data: Annotated[MutableMapping[str, JsonSerializable], Field(alias='userData')]
2831
max_error_score: Annotated[float, Field(alias='maxErrorScore')]
2932
error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
3033
created_at: Annotated[datetime, Field(alias='createdAt')]

src/crawlee/sessions/_session.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
from crawlee.sessions._cookies import CookieParam, SessionCookies
1212

1313
if TYPE_CHECKING:
14+
from collections.abc import Mapping, MutableMapping
1415
from http.cookiejar import CookieJar
1516

17+
from crawlee._types import JsonSerializable
1618
from crawlee.sessions._models import SessionModel
1719

1820
logger = getLogger(__name__)
@@ -36,7 +38,7 @@ def __init__(
3638
*,
3739
id: str | None = None,
3840
max_age: timedelta = timedelta(minutes=50),
39-
user_data: dict | None = None,
41+
user_data: Mapping[str, JsonSerializable] | None = None,
4042
max_error_score: float = 3.0,
4143
error_score_decrement: float = 0.5,
4244
created_at: datetime | None = None,
@@ -63,7 +65,7 @@ def __init__(
6365
"""
6466
self._id = id or crypto_random_object_id(length=10)
6567
self._max_age = max_age
66-
self._user_data = user_data or {}
68+
self._user_data: MutableMapping[str, JsonSerializable] = dict(user_data) if user_data is not None else {}
6769
self._max_error_score = max_error_score
6870
self._error_score_decrement = error_score_decrement
6971
self._created_at = created_at or datetime.now(timezone.utc)
@@ -117,7 +119,7 @@ def id(self) -> str:
117119
return self._id
118120

119121
@property
120-
def user_data(self) -> dict:
122+
def user_data(self) -> MutableMapping[str, JsonSerializable]:
121123
"""Get the user data."""
122124
return self._user_data
123125

0 commit comments

Comments
 (0)