Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

refactor!: Introduce new storage clients #1107

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ ignore = [
"PLR0911", # Too many return statements
"PLR0913", # Too many arguments in function definition
"PLR0915", # Too many statements
"PTH", # flake8-use-pathlib
"PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
"PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
"S102", # Use of `exec` detected
Expand Down
14 changes: 3 additions & 11 deletions src/crawlee/_service_locator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from crawlee._utils.docs import docs_group
from crawlee.configuration import Configuration
from crawlee.errors import ServiceConflictError
from crawlee.events import EventManager
from crawlee.storage_clients import StorageClient
from crawlee.events import EventManager, LocalEventManager
from crawlee.storage_clients import FileSystemStorageClient, StorageClient


@docs_group('Classes')
Expand Down Expand Up @@ -49,8 +49,6 @@ def set_configuration(self, configuration: Configuration) -> None:
def get_event_manager(self) -> EventManager:
"""Get the event manager."""
if self._event_manager is None:
from crawlee.events import LocalEventManager

self._event_manager = (
LocalEventManager().from_config(config=self._configuration)
if self._configuration
Expand All @@ -77,13 +75,7 @@ def set_event_manager(self, event_manager: EventManager) -> None:
def get_storage_client(self) -> StorageClient:
"""Get the storage client."""
if self._storage_client is None:
from crawlee.storage_clients import MemoryStorageClient

self._storage_client = (
MemoryStorageClient.from_config(config=self._configuration)
if self._configuration
else MemoryStorageClient.from_config()
)
self._storage_client = FileSystemStorageClient()

self._storage_client_was_retrieved = True
return self._storage_client
Expand Down
4 changes: 0 additions & 4 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,6 @@ async def push_data(
**kwargs: Unpack[PushDataKwargs],
) -> None:
"""Track a call to the `push_data` context helper."""
from crawlee.storages._dataset import Dataset

await Dataset.check_and_serialize(data)

self.push_data_calls.append(
PushDataFunctionCall(
data=data,
Expand Down
16 changes: 0 additions & 16 deletions src/crawlee/_utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import asyncio
import contextlib
import io
import json
import mimetypes
import os
Expand Down Expand Up @@ -83,21 +82,6 @@ def determine_file_extension(content_type: str) -> str | None:
return ext[1:] if ext is not None else ext


def is_file_or_bytes(value: Any) -> bool:
"""Determine if the input value is a file-like object or bytes.

This function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like).
The method is simplified for common use cases and may not cover all edge cases.

Args:
value: The value to be checked.

Returns:
True if the value is either a file-like object or bytes, False otherwise.
"""
return isinstance(value, (bytes, bytearray, io.IOBase))


async def json_dumps(obj: Any) -> str:
"""Serialize an object to a JSON-formatted string with specific settings.

Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from __future__ import annotations

import os.path
from collections.abc import Iterable
from copy import deepcopy
from functools import reduce
from operator import or_
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

from browserforge.bayesian_network import extract_json
Expand Down Expand Up @@ -253,9 +253,9 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str,

def get_available_header_network() -> dict:
"""Get header network that contains possible header values."""
if os.path.isfile(DATA_DIR / 'header-network.zip'):
if Path(DATA_DIR / 'header-network.zip').is_file():
return extract_json(DATA_DIR / 'header-network.zip')
if os.path.isfile(DATA_DIR / 'header-network-definition.zip'):
if Path(DATA_DIR / 'header-network-definition.zip').is_file():
return extract_json(DATA_DIR / 'header-network-definition.zip')
raise FileNotFoundError('Missing header-network file.')

Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/storage_clients/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from ._base import StorageClient
from ._file_system import FileSystemStorageClient
from ._memory import MemoryStorageClient

__all__ = ['MemoryStorageClient', 'StorageClient']
__all__ = [
'FileSystemStorageClient',
'MemoryStorageClient',
'StorageClient',
]
9 changes: 0 additions & 9 deletions src/crawlee/storage_clients/_base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,11 @@
from ._dataset_client import DatasetClient
from ._dataset_collection_client import DatasetCollectionClient
from ._key_value_store_client import KeyValueStoreClient
from ._key_value_store_collection_client import KeyValueStoreCollectionClient
from ._request_queue_client import RequestQueueClient
from ._request_queue_collection_client import RequestQueueCollectionClient
from ._storage_client import StorageClient
from ._types import ResourceClient, ResourceCollectionClient

__all__ = [
'DatasetClient',
'DatasetCollectionClient',
'KeyValueStoreClient',
'KeyValueStoreCollectionClient',
'RequestQueueClient',
'RequestQueueCollectionClient',
'ResourceClient',
'ResourceCollectionClient',
'StorageClient',
]
Loading
Loading