14
14
from crawlee ._utils .crypto import crypto_random_object_id
15
15
from crawlee .storage_clients ._base import KeyValueStoreClient
16
16
from crawlee .storage_clients .models import (
17
- KeyValueStoreKeyInfo ,
18
17
KeyValueStoreListKeysPage ,
19
18
KeyValueStoreMetadata ,
20
19
KeyValueStoreRecord ,
@@ -245,11 +244,15 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
245
244
# Update the metadata to record access
246
245
await self ._update_metadata (update_accessed_at = True )
247
246
247
+ # Calculate the size of the value in bytes
248
+ size = len (value_bytes )
249
+
248
250
return KeyValueStoreRecord (
249
251
key = metadata .key ,
250
252
value = value ,
251
253
content_type = metadata .content_type ,
252
254
filename = filename ,
255
+ size = size ,
253
256
)
254
257
255
258
@override
@@ -271,7 +274,9 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No
271
274
record_path = self ._path_to_kvs / filename
272
275
273
276
# Get the metadata.
274
- record_metadata = KeyValueStoreRecordMetadata (key = key , content_type = content_type )
277
+ # Calculate the size of the value in bytes
278
+ size = len (value_bytes )
279
+ record_metadata = KeyValueStoreRecordMetadata (key = key , content_type = content_type , size = size )
275
280
record_metadata_filepath = record_path .with_name (f'{ record_path .name } .{ METADATA_FILENAME } ' )
276
281
record_metadata_content = await json_dumps (record_metadata .model_dump ())
277
282
@@ -330,61 +335,87 @@ async def delete_value(self, *, key: str) -> None:
330
335
if deleted :
331
336
await self ._update_metadata (update_accessed_at = True , update_modified_at = True )
332
337
338
+ @override
333
339
async def iterate_keys (
334
340
self ,
335
341
* ,
336
342
exclusive_start_key : str | None = None ,
337
343
limit : int = 1000 ,
338
- ) -> AsyncIterator [KeyValueStoreKeyInfo ]:
339
- keys = []
340
- has_next = False
341
-
344
+ ) -> AsyncIterator [KeyValueStoreRecordMetadata ]:
342
345
# Check if the KVS directory exists
343
346
if not self ._path_to_kvs .exists ():
344
347
return
345
348
349
+ count = 0
346
350
async with self ._lock :
347
351
# Get all files in the KVS directory
348
- files = await asyncio .to_thread (self ._path_to_kvs .glob , '*' )
352
+ files = sorted ( await asyncio .to_thread (list , self ._path_to_kvs .glob ( '*' )) )
349
353
350
- # Filter out metadata files and get unique key names
351
- key_files = {}
352
354
for file_path in files :
355
+ # Skip the main metadata file
353
356
if file_path .name == METADATA_FILENAME :
354
357
continue
355
358
356
- # Skip metadata files for records
357
- if file_path .name .endswith (f'.{ METADATA_FILENAME } ' ):
359
+ # Only process metadata files for records
360
+ if not file_path .name .endswith (f'.{ METADATA_FILENAME } ' ):
358
361
continue
359
362
360
- # Extract the base key name
361
- key = file_path .name
362
- key_files [key ] = file_path
363
+ # Extract the base key name from the metadata filename
364
+ key_name = file_path .name [: - len (f'.{ METADATA_FILENAME } ' )]
363
365
364
- # Sort keys for consistent ordering
365
- all_keys = sorted (key_files .keys ())
366
+ # Apply exclusive_start_key filter if provided
367
+ if exclusive_start_key is not None and key_name <= exclusive_start_key :
368
+ continue
366
369
367
- # Apply exclusive_start_key if provided
368
- if exclusive_start_key is not None :
369
- start_idx = 0
370
- for idx , key in enumerate (all_keys ):
371
- if key > exclusive_start_key : # exclusive start
372
- start_idx = idx
373
- break
374
- all_keys = all_keys [start_idx :]
370
+ # Try to read and parse the metadata file
371
+ try :
372
+ metadata_content = await asyncio .to_thread (file_path .read_text , encoding = 'utf-8' )
373
+ metadata_dict = json .loads (metadata_content )
374
+ record_metadata = KeyValueStoreRecordMetadata (** metadata_dict )
375
375
376
- # Apply limit
377
- if len ( all_keys ) > limit :
378
- keys = all_keys [: limit ]
379
- has_next = True
380
- else :
381
- keys = all_keys
382
- has_next = False
376
+ yield record_metadata
377
+
378
+ count += 1
379
+ if count >= limit :
380
+ break
381
+ except ( json . JSONDecodeError , ValidationError ) as e :
382
+ logger . warning ( f'Failed to parse metadata file { file_path } : { e } ' )
383
383
384
384
# Update accessed_at timestamp
385
385
await self ._update_metadata (update_accessed_at = True )
386
386
387
- return KeyValueStoreListKeysPage (keys = keys , has_next = has_next )
387
+ @override
388
+ async def list_keys (
389
+ self ,
390
+ * ,
391
+ exclusive_start_key : str | None = None ,
392
+ limit : int = 1000 ,
393
+ ) -> KeyValueStoreListKeysPage :
394
+ keys = []
395
+ had_more = False
396
+ next_exclusive_start_key = None
397
+
398
+ # Use the iterate_keys method to get all keys
399
+ async for metadata in self .iterate_keys (exclusive_start_key = exclusive_start_key , limit = limit + 1 ):
400
+ keys .append (metadata .key )
401
+ # If we've collected more than the limit, we know there are more keys
402
+ if len (keys ) > limit :
403
+ had_more = True
404
+ next_exclusive_start_key = metadata .key
405
+ keys .pop () # Remove the extra key
406
+ break
407
+
408
+ # Update the accessed_at timestamp is already handled by iterate_keys
409
+
410
+ return KeyValueStoreListKeysPage (
411
+ count = len (keys ),
412
+ items = keys ,
413
+ had_more = had_more ,
414
+ is_truncated = had_more ,
415
+ limit = limit ,
416
+ exclusive_start_key = exclusive_start_key ,
417
+ next_exclusive_start_key = next_exclusive_start_key ,
418
+ )
388
419
389
420
@override
390
421
async def get_public_url (self , * , key : str ) -> str :
0 commit comments