2 changes: 1 addition & 1 deletion pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
"pydantic>=2.11.0",
"pyee>=9.0.0",
"tldextract>=5.1.0",
- "typing-extensions>=4.1.0",
+ "typing-extensions>=4.10.0",
"yarl>=1.18.0",
]

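The new 4.10.0 floor appears to track `typing_extensions.TypeIs` (PEP 742), which first shipped in typing-extensions 4.10.0 and is used by the dataset client changes below.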
16 changes: 7 additions & 9 deletions src/crawlee/_types.py
@@ -27,9 +27,7 @@
from crawlee.storage_clients import StorageClient
from crawlee.storages import KeyValueStore

- # Workaround for https://github.com/pydantic/pydantic/issues/9445
- J = TypeVar('J', bound='JsonSerializable')
- JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None
+ JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None
else:
from pydantic import JsonValue as JsonSerializable
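With the bounded-`TypeVar` workaround gone, the type-checking alias is directly recursive. A minimal sketch of what it admits (assuming the alias is imported from `crawlee._types`; at runtime it resolves to pydantic's `JsonValue`):

```python
from crawlee._types import JsonSerializable

# Arbitrarily nested JSON-shaped values satisfy the recursive alias.
item: JsonSerializable = {
    'title': 'Example product',
    'price': 12.5,
    'in_stock': True,
    'tags': ['new', 'sale'],
    'meta': {'depth': 2, 'note': None},
}
```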

@@ -198,7 +196,7 @@ class PushDataKwargs(TypedDict):


class PushDataFunctionCall(PushDataKwargs):
- data: list[dict[str, Any]] | dict[str, Any]
+ data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]
dataset_id: str | None
dataset_name: str | None
dataset_alias: str | None
@@ -300,7 +298,7 @@ async def add_requests(

async def push_data(
self,
- data: list[dict[str, Any]] | dict[str, Any],
+ data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
@@ -392,7 +390,7 @@ def __call__(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
- user_data: dict[str, Any] | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
rq_id: str | None = None,
rq_name: str | None = None,
@@ -417,7 +415,7 @@ def __call__(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
- user_data: dict[str, Any] | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
requests: Sequence[str | Request] | None = None,
rq_id: str | None = None,
@@ -465,7 +463,7 @@ def __call__(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
- user_data: dict[str, Any] | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, list[Request]]:
@@ -543,7 +541,7 @@ class PushDataFunction(Protocol):

def __call__(
self,
- data: list[dict[str, Any]] | dict[str, Any],
+ data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
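With the widened `PushDataFunction` signature, both call shapes below should type-check; the handler is a hypothetical sketch, not code from this PR:

```python
from crawlee._types import BasicCrawlingContext

async def handler(context: BasicCrawlingContext) -> None:
    # A single item (a string-keyed mapping of JSON values)...
    await context.push_data({'url': context.request.url})
    # ...or a sequence of such items.
    await context.push_data([{'rank': 1}, {'rank': 2}])
```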
8 changes: 4 additions & 4 deletions src/crawlee/_utils/file.py
@@ -10,12 +10,12 @@
from typing import TYPE_CHECKING, overload

if TYPE_CHECKING:
- from collections.abc import AsyncIterator
+ from collections.abc import AsyncIterator, Mapping
from typing import Any, TextIO

from typing_extensions import Unpack

- from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
+ from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable

if sys.platform == 'win32':

@@ -150,7 +150,7 @@ async def atomic_write(


async def export_json_to_stream(
- iterator: AsyncIterator[dict[str, Any]],
+ iterator: AsyncIterator[Mapping[str, JsonSerializable]],
dst: TextIO,
**kwargs: Unpack[ExportDataJsonKwargs],
) -> None:
@@ -159,7 +159,7 @@ async def export_json_to_stream(


async def export_csv_to_stream(
- iterator: AsyncIterator[dict[str, Any]],
+ iterator: AsyncIterator[Mapping[str, JsonSerializable]],
dst: TextIO,
**kwargs: Unpack[ExportDataCsvKwargs],
) -> None:
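For illustration, a hedged usage sketch of the retyped private helper: any async iterator of string-keyed mappings now fits the parameter. This assumes `indent` is among the accepted `ExportDataJsonKwargs`:

```python
import asyncio

from crawlee._utils.file import export_json_to_stream

async def main() -> None:
    async def items():  # AsyncIterator[Mapping[str, JsonSerializable]]
        yield {'id': 1, 'name': 'first'}
        yield {'id': 2, 'name': 'second'}

    with open('items.json', 'w', encoding='utf-8') as dst:
        await export_json_to_stream(items(), dst, indent=2)

asyncio.run(main())
```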
6 changes: 3 additions & 3 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -4,7 +4,7 @@
import logging
from abc import ABC
from datetime import timedelta
- from typing import TYPE_CHECKING, Any, Generic
+ from typing import TYPE_CHECKING, Generic

from more_itertools import partition
from pydantic import ValidationError
@@ -26,7 +26,7 @@
from typing_extensions import Unpack

from crawlee import RequestTransformAction
- from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
+ from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction, JsonSerializable

from ._abstract_http_parser import AbstractHttpParser

@@ -200,7 +200,7 @@ async def extract_links(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
- user_data: dict[str, Any] | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
6 changes: 3 additions & 3 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -80,7 +80,7 @@

if TYPE_CHECKING:
import re
- from collections.abc import Iterator
+ from collections.abc import Iterator, Mapping
from contextlib import AbstractAsyncContextManager

from crawlee._types import (
@@ -941,7 +941,7 @@ async def export_data(

async def _push_data(
self,
- data: list[dict[str, Any]] | dict[str, Any],
+ data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
@@ -1015,7 +1015,7 @@ async def enqueue_links(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
- user_data: dict[str, Any] | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
requests: Sequence[str | Request] | None = None,
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -53,6 +53,7 @@
HttpHeaders,
HttpMethod,
HttpPayload,
+ JsonSerializable,
)
from crawlee.browsers._types import BrowserType

@@ -384,7 +385,7 @@ async def extract_links(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
- user_data: dict | None = None,
+ user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
4 changes: 3 additions & 1 deletion src/crawlee/sessions/_models.py
@@ -13,6 +13,8 @@
computed_field,
)

+ from crawlee._types import JsonSerializable
+
from ._cookies import CookieParam
from ._session import Session

@@ -24,7 +26,7 @@ class SessionModel(BaseModel):

id: Annotated[str, Field(alias='id')]
max_age: Annotated[timedelta, Field(alias='maxAge')]
- user_data: Annotated[dict, Field(alias='userData')]
+ user_data: Annotated[dict[str, JsonSerializable], Field(alias='userData')]
max_error_score: Annotated[float, Field(alias='maxErrorScore')]
error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
created_at: Annotated[datetime, Field(alias='createdAt')]
8 changes: 5 additions & 3 deletions src/crawlee/sessions/_session.py
@@ -11,8 +11,10 @@
from crawlee.sessions._cookies import CookieParam, SessionCookies

if TYPE_CHECKING:
+ from collections.abc import Mapping, MutableMapping
from http.cookiejar import CookieJar

+ from crawlee._types import JsonSerializable
from crawlee.sessions._models import SessionModel

logger = getLogger(__name__)
@@ -36,7 +38,7 @@ def __init__(
*,
id: str | None = None,
max_age: timedelta = timedelta(minutes=50),
- user_data: dict | None = None,
+ user_data: Mapping[str, JsonSerializable] | None = None,
max_error_score: float = 3.0,
error_score_decrement: float = 0.5,
created_at: datetime | None = None,
@@ -63,7 +65,7 @@
"""
self._id = id or crypto_random_object_id(length=10)
self._max_age = max_age
- self._user_data = user_data or {}
+ self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {}
self._max_error_score = max_error_score
self._error_score_decrement = error_score_decrement
self._created_at = created_at or datetime.now(timezone.utc)
@@ -117,7 +119,7 @@ def id(self) -> str:
return self._id

@property
- def user_data(self) -> dict:
+ def user_data(self) -> MutableMapping[str, JsonSerializable]:
"""Get the user data."""
return self._user_data

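The constructor now accepts any `Mapping` and copies it into a private `dict`, so caller-side mutations no longer leak into the session (and vice versa). A small sketch of the resulting behavior, assuming `Session` is exported from `crawlee.sessions`:

```python
from crawlee.sessions import Session

initial = {'retries': 0}
session = Session(user_data=initial)

session.user_data['retries'] = 3  # mutates the session's own copy

assert initial['retries'] == 0  # the caller's dict is untouched
```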
22 changes: 18 additions & 4 deletions src/crawlee/storage_clients/_base/_dataset_client.py
@@ -4,12 +4,20 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
- from collections.abc import AsyncIterator
- from typing import Any
+ from collections.abc import AsyncIterator, Mapping, Sequence
+
+ from typing_extensions import TypeIs
+
+ from crawlee._types import JsonSerializable
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata


+ def _is_list_of_items(
+     data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
+ ) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]:
+     return isinstance(data, list)


class DatasetClient(ABC):
"""An abstract class for dataset storage clients.

@@ -42,7 +50,7 @@ async def purge(self) -> None:
"""

@abstractmethod
- async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+ async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
"""Push data to the dataset.

The backend method for the `Dataset.push_data` call.
@@ -82,7 +90,7 @@ async def iterate_items(
unwind: list[str] | None = None,
skip_empty: bool = False,
skip_hidden: bool = False,
- ) -> AsyncIterator[dict[str, Any]]:
+ ) -> AsyncIterator[Mapping[str, JsonSerializable]]:
"""Iterate over the dataset items with filtering options.

The backend method for the `Dataset.iterate_items` call.
@@ -91,3 +99,9 @@
raise NotImplementedError
if False:
yield {}

+ @staticmethod
+ def _is_list_of_items(
+     data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
+ ) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]:
+     return isinstance(data, list)
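`TypeIs` (PEP 742) narrows both branches of the union, unlike `TypeGuard`, which is what makes this helper useful in the `push_data` implementations. A standalone re-creation for illustration; note the check tests `list` specifically, so a tuple of items would fall through to the single-mapping branch:

```python
from collections.abc import Mapping, Sequence

from typing_extensions import TypeIs  # requires typing-extensions >= 4.10.0

Item = Mapping[str, object]  # stand-in for Mapping[str, JsonSerializable]

def is_list_of_items(data: Sequence[Item] | Item) -> TypeIs[Sequence[Item]]:
    return isinstance(data, list)

def count_items(data: Sequence[Item] | Item) -> int:
    if is_list_of_items(data):
        return len(data)  # narrowed to Sequence[Item]
    return 1  # the negative branch is narrowed to Item
```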
12 changes: 7 additions & 5 deletions src/crawlee/storage_clients/_file_system/_dataset_client.py
@@ -3,6 +3,7 @@
import asyncio
import json
import shutil
+ from collections.abc import Mapping
from datetime import datetime, timezone
from logging import getLogger
from pathlib import Path
@@ -12,14 +13,15 @@
from typing_extensions import Self, override

from crawlee._consts import METADATA_FILENAME
+ from crawlee._types import JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
- from collections.abc import AsyncIterator
+ from collections.abc import AsyncIterator, Sequence

from crawlee.configuration import Configuration

@@ -220,10 +222,10 @@ async def purge(self) -> None:
)

@override
- async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+ async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
async with self._lock:
new_item_count = self._metadata.item_count
- if isinstance(data, list):
+ if self._is_list_of_items(data):
for item in data:
new_item_count += 1
await self._push_item(item, new_item_count)
@@ -304,7 +306,7 @@ async def get_data(
selected_files = selected_files[:limit]

# Read and parse each data file.
- items = list[dict[str, Any]]()
+ items = list[Mapping[str, JsonSerializable]]()
for file_path in selected_files:
try:
file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
@@ -441,7 +443,7 @@ async def _update_metadata(
data = await json_dumps(self._metadata.model_dump())
await atomic_write(self.path_to_metadata, data)

- async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
+ async def _push_item(self, item: Mapping[str, JsonSerializable], item_id: int) -> None:
"""Push a single item to the dataset.

This method writes the item as a JSON file with a zero-padded numeric filename