
Commit c8747d6

Fix cycle imports, tests and everything
1 parent 0408130 commit c8747d6

19 files changed (+285, -317 lines)

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 6 additions & 2 deletions
@@ -20,6 +20,8 @@
 if TYPE_CHECKING:
     from types import TracebackType
 
+    from crawlee.events import EventManager
+
 logger = getLogger(__name__)
 
 T = TypeVar('T')
@@ -38,6 +40,7 @@ class Snapshotter:
     def __init__(
         self,
         *,
+        event_manager: EventManager | None = None,
         event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
         client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
         max_used_cpu_ratio: float = 0.95,
@@ -54,6 +57,8 @@ def __init__(
         """A default constructor.
 
         Args:
+            event_manager: The event manager used to emit system info events. The CPU and memory usage
+                are read from the data these events provide.
             event_loop_snapshot_interval: The interval at which the event loop is sampled.
             client_snapshot_interval: The interval at which the client is sampled.
             max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
@@ -79,8 +84,7 @@ def __init__(
         if available_memory_ratio is None and max_memory_size is None:
             raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
 
-        self._event_manager = service_container.get_event_manager()
-
+        self._event_manager = event_manager or service_container.get_event_manager()
         self._event_loop_snapshot_interval = event_loop_snapshot_interval
         self._client_snapshot_interval = client_snapshot_interval
         self._max_event_loop_delay = max_event_loop_delay
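
In effect, `Snapshotter` can now be handed its event manager explicitly, falling back to the service container only when the argument is omitted. A minimal usage sketch (the `LocalEventManager` class and the exact import paths are assumptions based on crawlee's public API, not shown in this diff):

from crawlee._autoscaling import Snapshotter
from crawlee.events import LocalEventManager

# Illustrative only: inject a custom event manager instead of relying on
# the global service container.
event_manager = LocalEventManager()
snapshotter = Snapshotter(
    event_manager=event_manager,  # omitted -> service_container.get_event_manager()
    available_memory_ratio=0.25,  # at least one of the two memory bounds is required
)

This keeps the container as the default wiring while making the dependency injectable, e.g. for tests.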

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 71 additions & 51 deletions
@@ -50,6 +50,8 @@
 
     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
+    from crawlee.configuration import Configuration
+    from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
@@ -93,6 +95,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""
 
+    configuration: NotRequired[Configuration]
+    """Crawler configuration."""
+
     request_handler_timeout: NotRequired[timedelta]
     """Maximum duration allowed for a single request handler to run."""
 
@@ -111,6 +116,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     statistics: NotRequired[Statistics[StatisticsState]]
     """A custom `Statistics` instance, allowing the use of non-default configuration."""
 
+    event_manager: NotRequired[EventManager]
+    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
@@ -157,126 +165,136 @@ class BasicCrawler(Generic[TCrawlingContext]):
     def __init__(
         self,
         *,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
         request_provider: RequestProvider | None = None,
-        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
+        session_pool: SessionPool | None = None,
+        proxy_configuration: ProxyConfiguration | None = None,
         http_client: BaseHttpClient | None = None,
-        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
         max_request_retries: int = 3,
         max_requests_per_crawl: int | None = None,
         max_session_rotations: int = 10,
-        request_handler_timeout: timedelta = timedelta(minutes=1),
-        session_pool: SessionPool | None = None,
+        max_crawl_depth: int | None = None,
         use_session_pool: bool = True,
         retry_on_blocked: bool = True,
-        proxy_configuration: ProxyConfiguration | None = None,
+        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         configure_logging: bool = True,
-        max_crawl_depth: int | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,
     ) -> None:
         """A default constructor.
 
         Args:
+            configuration: The configuration object. Some of its properties are used as defaults for the crawler.
+            event_manager: The event manager for managing events for the crawler and all its components.
             request_provider: Provider for requests to be processed by the crawler.
-            request_handler: A callable responsible for handling requests.
+            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
+            proxy_configuration: HTTP proxy configuration used when making requests.
             http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
-            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler: A callable responsible for handling requests.
             max_request_retries: Maximum number of attempts to process a single request.
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            request_handler_timeout: Maximum duration allowed for a single request handler to run.
+            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
-            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
             retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
-            proxy_configuration: HTTP proxy configuration used when making requests.
+            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
            configure_logging: If True, the crawler will set up logging infrastructure automatically.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
         """
-        self._router: Router[TCrawlingContext] | None = None
+        if configuration:
+            service_container.set_configuration(configuration)
+        if event_manager:
+            service_container.set_event_manager(event_manager)
+
+        config = service_container.get_configuration()
+
+        # Core components
+        self._request_provider = request_provider
+        self._session_pool = session_pool or SessionPool()
+        self._proxy_configuration = proxy_configuration
+        self._http_client = http_client or HttpxHttpClient()
 
+        # Request router setup
+        self._router: Router[TCrawlingContext] | None = None
         if isinstance(cast(Router, request_handler), Router):
             self._router = cast(Router[TCrawlingContext], request_handler)
         elif request_handler is not None:
             self._router = None
             self.router.default_handler(request_handler)
 
-        self._http_client = http_client or HttpxHttpClient()
-
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
-
+        # Error & failed request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
 
+        # Context pipeline
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+
+        # Crawl settings
         self._max_request_retries = max_request_retries
         self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
+        self._max_crawl_depth = max_crawl_depth
 
-        self._request_provider = request_provider
-
-        config = service_container.get_configuration()
-
+        # Timeouts
         self._request_handler_timeout = request_handler_timeout
         self._internal_timeout = (
             config.internal_timeout
             if config.internal_timeout is not None
             else max(2 * request_handler_timeout, timedelta(minutes=5))
         )
 
-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
-
-        self._event_manager = service_container.get_event_manager()
-        self._snapshotter = Snapshotter(
-            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
-            available_memory_ratio=config.available_memory_ratio,
-        )
-        self._pool = AutoscaledPool(
-            system_status=SystemStatus(self._snapshotter),
-            is_finished_function=self.__is_finished_function,
-            is_task_ready_function=self.__is_task_ready_function,
-            run_task_function=self.__run_task_function,
-            concurrency_settings=concurrency_settings,
-        )
-
+        # Retry and session settings
         self._use_session_pool = use_session_pool
-        self._session_pool = session_pool or SessionPool()
-
         self._retry_on_blocked = retry_on_blocked
 
+        # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
             configure_logger(root_logger, remove_old_handlers=True)
-
-        # Silence HTTPX logger
-        httpx_logger = logging.getLogger('httpx')
+        httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
         httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+        self._logger = _logger or logging.getLogger(__name__)
 
-        if not _logger:
-            _logger = logging.getLogger(__name__)
-
-        self._logger = _logger
-
-        self._proxy_configuration = proxy_configuration
+        # Statistics
         self._statistics = statistics or Statistics(
-            event_manager=self._event_manager,
             periodic_message_logger=self._logger,
             log_message='Current request statistics:',
         )
+
+        # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
 
+        # Internal, not explicitly configurable components
+        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
+        self._snapshotter = Snapshotter(
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
+        )
+        self._autoscaled_pool = AutoscaledPool(
+            system_status=SystemStatus(self._snapshotter),
+            is_finished_function=self.__is_finished_function,
+            is_task_ready_function=self.__is_task_ready_function,
+            run_task_function=self.__run_task_function,
+            concurrency_settings=concurrency_settings,
+        )
+
+        # State flags
        self._running = False
        self._has_finished_before = False
-        self._max_crawl_depth = max_crawl_depth
 
     @property
     def log(self) -> logging.Logger:
@@ -459,8 +477,10 @@ def sigint_handler() -> None:
         return final_statistics
 
     async def _run_crawler(self) -> None:
+        event_manager = service_container.get_event_manager()
+
         async with AsyncExitStack() as exit_stack:
-            await exit_stack.enter_async_context(self._event_manager)
+            await exit_stack.enter_async_context(event_manager)
             await exit_stack.enter_async_context(self._snapshotter)
             await exit_stack.enter_async_context(self._statistics)
 
@@ -470,7 +490,7 @@ async def _run_crawler(self) -> None:
             for context_manager in self._additional_context_managers:
                 await exit_stack.enter_async_context(context_manager)
 
-            await self._pool.run()
+            await self._autoscaled_pool.run()
 
     async def add_requests(
         self,
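
Taken together, the crawler now does its dependency wiring through the service container: `configuration` and `event_manager` passed to the constructor are registered globally, `_run_crawler` resolves the event manager from the container at run time, and `self._pool` is renamed to the more descriptive `self._autoscaled_pool`. A sketch of how a caller might use the new keyword arguments (import paths and the `run()` call follow crawlee's public API as commonly documented; treat them as assumptions rather than part of this commit):

import asyncio
from datetime import timedelta

from crawlee.basic_crawler import BasicCrawler
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager

async def main() -> None:
    crawler = BasicCrawler(
        configuration=Configuration(),      # stored via service_container.set_configuration()
        event_manager=LocalEventManager(),  # stored via service_container.set_event_manager()
        request_handler_timeout=timedelta(minutes=2),
    )

    @crawler.router.default_handler
    async def handler(context) -> None:
        crawler.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])

asyncio.run(main())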

src/crawlee/configuration.py

Lines changed: 7 additions & 7 deletions
@@ -5,9 +5,7 @@
 
 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing_extensions import Self
 
-from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms
 
@@ -236,11 +234,13 @@ class Configuration(BaseSettings):
     def get_global_configuration(cls) -> Self:
         """Retrieve the global instance of the configuration.
 
-        Mostly for the backward compatibility.
+        TODO: Can we remove this?
         """
-        config = service_container.get_configuration()
+        from crawlee.service_container import get_configuration
 
-        if not isinstance(config, cls):
-            raise TypeError(f'Requested configuration of type {cls}, but got {config.__class__} instead.')
+        cfg = get_configuration()
 
-        return config
+        if not isinstance(cfg, cls):
+            raise TypeError(f'Requested global configuration object of type {cls}, but {cfg.__class__} was found')
+
+        return cfg
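
This hunk is the heart of the cyclic-import fix in `configuration.py`: the module-level `from crawlee import service_container` import is gone (the service container itself needs `Configuration`, which completed the cycle), and the import now happens lazily inside the method body. The same pattern in isolation, with hypothetical module names:

# settings.py (hypothetical) -- container.py imports this module at its top
# level, so a top-level import in the other direction would create a cycle.
class Settings:
    @classmethod
    def get_global(cls) -> 'Settings':
        # Deferred import: resolved at call time, after container.py has
        # finished loading, so the circular dependency never materializes.
        from container import get_settings

        settings = get_settings()
        if not isinstance(settings, cls):
            raise TypeError(f'Expected {cls}, but {settings.__class__} was found')
        return settings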

src/crawlee/events/_event_manager.py

Lines changed: 30 additions & 9 deletions
@@ -80,9 +80,21 @@ def __init__(
             delay=self._persist_state_interval,
         )
 
+        self._initialized = False
+
+    @property
+    def initialized(self) -> bool:
+        """Indicates whether the event manager has been initialized."""
+        return self._initialized
+
     async def __aenter__(self) -> EventManager:
         """Initializes the event manager upon entering the async context."""
-        self._emit_persist_state_event_rec_task.start()
+        if not self._initialized:
+            self._emit_persist_state_event_rec_task.start()
+            self._initialized = True
+        else:
+            logger.warning('Calling EventManager.__aenter__(), but event manager was already initialized. Skipping...')
+
         return self
 
     async def __aexit__(
@@ -95,11 +107,15 @@ async def __aexit__(
 
         This will stop listening for the events, and it will wait for all the event listeners to finish.
         """
-        await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
-        self._event_emitter.remove_all_listeners()
-        self._listener_tasks.clear()
-        self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
+        if self._initialized:
+            await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
+            self._event_emitter.remove_all_listeners()
+            self._listener_tasks.clear()
+            self._listeners_to_wrappers.clear()
+            await self._emit_persist_state_event_rec_task.stop()
+            self._initialized = False
+        else:
+            logger.warning('Calling EventManager.__aexit__(), but event manager was not yet initialized. Skipping...')
 
     def on(self, *, event: Event, listener: Listener) -> None:
         """Add an event listener to the event manager.
@@ -125,17 +141,17 @@ async def listener_wrapper(event_data: EventData) -> None:
             self._listener_tasks.add(listener_task)
 
             try:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Awaiting listener task...')
+                logger.debug('EventManager.on.listener_wrapper(): Awaiting listener task...')
                 await listener_task
-                logger.debug('LocalEventManager.on.listener_wrapper(): Listener task completed.')
+                logger.debug('EventManager.on.listener_wrapper(): Listener task completed.')
             except Exception:
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
                     extra={'event_name': event.value, 'listener_name': listener.__name__},
                 )
             finally:
-                logger.debug('LocalEventManager.on.listener_wrapper(): Removing listener task from the set...')
+                logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
                 self._listener_tasks.remove(listener_task)
 
         self._listeners_to_wrappers[event][listener].append(listener_wrapper)
@@ -164,6 +180,9 @@ def emit(self, *, event: Event, event_data: EventData) -> None:
             event: The event which will be emitted.
             event_data: The data which will be passed to the event listeners.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within async context manager.')
+
         self._event_emitter.emit(event.value, event_data)
 
     async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None = None) -> None:
@@ -173,6 +192,8 @@ async def wait_for_all_listeners_to_complete(self, *, timeout: timedelta | None
             timeout: The maximum time to wait for the event listeners to finish. If they do not complete within
                 the specified timeout, they will be canceled.
         """
+        if not self._initialized:
+            raise RuntimeError('EventManager is not initialized. Please use it within async context manager.')
 
         async def wait_for_listeners() -> None:
             """Gathers all listener tasks and awaits their completion, logging any exceptions encountered."""
