Skip to content

Commit 2a9a840

Browse files
committed
Update KVS and its clients
1 parent 2e780f1 commit 2a9a840

14 files changed

+664
-285
lines changed

Diff for: src/crawlee/_utils/file.py

-16
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import asyncio
44
import contextlib
5-
import io
65
import json
76
import mimetypes
87
import os
@@ -83,21 +82,6 @@ def determine_file_extension(content_type: str) -> str | None:
8382
return ext[1:] if ext is not None else ext
8483

8584

86-
def is_file_or_bytes(value: Any) -> bool:
87-
"""Determine if the input value is a file-like object or bytes.
88-
89-
This function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).
90-
The method is simplified for common use cases and may not cover all edge cases.
91-
92-
Args:
93-
value: The value to be checked.
94-
95-
Returns:
96-
True if the value is either a file-like object or bytes, False otherwise.
97-
"""
98-
return isinstance(value, (bytes, bytearray, io.IOBase))
99-
100-
10185
async def json_dumps(obj: Any) -> str:
10286
"""Serialize an object to a JSON-formatted string with specific settings.
10387

Diff for: src/crawlee/storage_clients/_base/_dataset_client.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def item_count(self) -> int:
5656
@abstractmethod
5757
async def open(
5858
cls,
59+
*,
5960
id: str | None,
6061
name: str | None,
6162
storage_dir: Path,
@@ -82,7 +83,7 @@ async def drop(self) -> None:
8283
"""
8384

8485
@abstractmethod
85-
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
86+
async def push_data(self, *, data: list[Any] | dict[str, Any]) -> None:
8687
"""Push data to the dataset.
8788
8889
The backend method for the `Dataset.push_data` call.

Diff for: src/crawlee/storage_clients/_base/_key_value_store_client.py

+87-62
Original file line numberDiff line numberDiff line change
@@ -6,111 +6,136 @@
66
from crawlee._utils.docs import docs_group
77

88
if TYPE_CHECKING:
9-
from contextlib import AbstractAsyncContextManager
10-
11-
from httpx import Response
12-
13-
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
9+
from collections.abc import AsyncIterator
10+
from datetime import datetime
11+
from pathlib import Path
12+
13+
from crawlee.storage_clients.models import (
14+
KeyValueStoreRecord,
15+
KeyValueStoreRecordMetadata,
16+
)
17+
18+
# Properties:
19+
# - id
20+
# - name
21+
# - created_at
22+
# - accessed_at
23+
# - modified_at
24+
25+
# Methods:
26+
# - open
27+
# - drop
28+
# - get_value
29+
# - set_value
30+
# - delete_value
31+
# - iterate_keys
32+
# - get_public_url
1433

1534

1635
@docs_group('Abstract classes')
1736
class KeyValueStoreClient(ABC):
18-
"""An abstract class for key-value store resource clients.
37+
"""An abstract class for key-value store (KVS) resource clients.
1938
2039
These clients are specific to the type of resource they manage and operate under a designated storage
2140
client, like a memory storage client.
2241
"""
2342

43+
@property
2444
@abstractmethod
25-
async def get(self) -> KeyValueStoreMetadata | None:
26-
"""Get metadata about the key-value store being managed by this client.
45+
def id(self) -> str:
46+
"""The ID of the key-value store."""
2747

28-
Returns:
29-
An object containing the key-value store's details, or None if the key-value store does not exist.
30-
"""
48+
@property
49+
@abstractmethod
50+
def name(self) -> str | None:
51+
"""The name of the key-value store."""
3152

53+
@property
3254
@abstractmethod
33-
async def delete(self) -> None:
34-
"""Permanently delete the key-value store managed by this client."""
55+
def created_at(self) -> datetime:
56+
"""The time at which the key-value store was created."""
3557

58+
@property
3659
@abstractmethod
37-
async def list_keys(
38-
self,
60+
def accessed_at(self) -> datetime:
61+
"""The time at which the key-value store was last accessed."""
62+
63+
@property
64+
@abstractmethod
65+
def modified_at(self) -> datetime:
66+
"""The time at which the key-value store was last modified."""
67+
68+
@classmethod
69+
@abstractmethod
70+
async def open(
71+
cls,
3972
*,
40-
limit: int = 1000,
41-
exclusive_start_key: str | None = None,
42-
) -> KeyValueStoreListKeysPage:
43-
"""List the keys in the key-value store.
73+
id: str | None,
74+
name: str | None,
75+
storage_dir: Path,
76+
) -> KeyValueStoreClient:
77+
"""Open existing or create a new key-value store client.
78+
79+
If a key-value store with the given name already exists, the appropriate key-value store client is returned.
80+
Otherwise, a new key-value store is created and client for it is returned.
4481
4582
Args:
46-
limit: Number of keys to be returned. Maximum value is 1000.
47-
exclusive_start_key: All keys up to this one (including) are skipped from the result.
83+
id: The ID of the key-value store.
84+
name: The name of the key-value store.
85+
storage_dir: The path to the storage directory. If the client persists data, it should use this directory.
4886
4987
Returns:
50-
The list of keys in the key-value store matching the given arguments.
88+
A key-value store client.
5189
"""
5290

5391
@abstractmethod
54-
async def get_record(self, key: str) -> KeyValueStoreRecord | None:
55-
"""Retrieve the given record from the key-value store.
56-
57-
Args:
58-
key: Key of the record to retrieve.
92+
async def drop(self) -> None:
93+
"""Drop the whole key-value store and remove all its values.
5994
60-
Returns:
61-
The requested record, or None, if the record does not exist
95+
The backend method for the `KeyValueStore.drop` call.
6296
"""
6397

6498
@abstractmethod
65-
async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord[bytes] | None:
66-
"""Retrieve the given record from the key-value store, without parsing it.
67-
68-
Args:
69-
key: Key of the record to retrieve.
99+
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
100+
"""Retrieve the given record from the key-value store.
70101
71-
Returns:
72-
The requested record, or None, if the record does not exist
102+
The backend method for the `KeyValueStore.get_value` call.
73103
"""
74104

75105
@abstractmethod
76-
async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]:
77-
"""Retrieve the given record from the key-value store, as a stream.
106+
async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
107+
"""Set a value in the key-value store by its key.
78108
79-
Args:
80-
key: Key of the record to retrieve.
81-
82-
Returns:
83-
The requested record as a context-managed streaming Response, or None, if the record does not exist
109+
The backend method for the `KeyValueStore.set_value` call.
84110
"""
85111

86112
@abstractmethod
87-
async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None:
88-
"""Set a value to the given record in the key-value store.
113+
async def delete_value(self, *, key: str) -> None:
114+
"""Delete a value from the key-value store by its key.
89115
90-
Args:
91-
key: The key of the record to save the value to.
92-
value: The value to save into the record.
93-
content_type: The content type of the saved value.
116+
The backend method for the `KeyValueStore.delete_value` call.
94117
"""
95118

96119
@abstractmethod
97-
async def delete_record(self, key: str) -> None:
98-
"""Delete the specified record from the key-value store.
120+
async def iterate_keys(
121+
self,
122+
*,
123+
exclusive_start_key: str | None = None,
124+
limit: int | None = None,
125+
) -> AsyncIterator[KeyValueStoreRecordMetadata]:
126+
"""Iterate over all the existing keys in the key-value store.
99127
100-
Args:
101-
key: The key of the record which to delete.
128+
The backend method for the `KeyValueStore.iterate_keys` call.
102129
"""
130+
# This syntax is to make mypy properly work with abstract AsyncIterator.
131+
# https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
132+
raise NotImplementedError
133+
if False: # type: ignore[unreachable]
134+
yield 0
103135

104136
@abstractmethod
105-
async def get_public_url(self, key: str) -> str:
137+
async def get_public_url(self, *, key: str) -> str:
106138
"""Get the public URL for the given key.
107139
108-
Args:
109-
key: Key of the record for which URL is required.
110-
111-
Returns:
112-
The public URL for the given key.
113-
114-
Raises:
115-
ValueError: If the key does not exist.
140+
The backend method for the `KeyValueStore.get_public_url` call.
116141
"""

Diff for: src/crawlee/storage_clients/_file_system/_dataset_client.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@
1010
from pydantic import ValidationError
1111
from typing_extensions import override
1212

13-
from crawlee._consts import METADATA_FILENAME
1413
from crawlee._utils.crypto import crypto_random_object_id
1514
from crawlee.storage_clients._base import DatasetClient
1615
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
1716

18-
from ._utils import json_dumps
17+
from ._utils import METADATA_FILENAME, json_dumps
1918

2019
if TYPE_CHECKING:
2120
from collections.abc import AsyncIterator
@@ -129,6 +128,11 @@ async def open(
129128
Returns:
130129
A new instance of the file system dataset client.
131130
"""
131+
if id:
132+
raise ValueError(
133+
'Opening a dataset by "id" is not supported for file system storage client, use "name" instead.'
134+
)
135+
132136
name = name or cls._DEFAULT_NAME
133137
dataset_path = storage_dir / cls._STORAGE_SUBDIR / name
134138
metadata_path = dataset_path / METADATA_FILENAME

Diff for: src/crawlee/storage_clients/_file_system/_key_value_store.py

-11
This file was deleted.

0 commit comments

Comments
 (0)