from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
from crawlee.base_storage_client._models import DatasetItemsListPage
+from crawlee.configuration import Configuration
+from crawlee.events._event_manager import EventManager
from crawlee.http_clients import BaseHttpClient, HttpResponse
from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
from crawlee.sessions import Session
@@ -93,6 +95,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
    """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
    or if the website blocks the request."""

+    configuration: NotRequired[Configuration]
+    """Crawler configuration."""
+
    request_handler_timeout: NotRequired[timedelta]
    """Maximum duration allowed for a single request handler to run."""

@@ -111,6 +116,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
    statistics: NotRequired[Statistics[StatisticsState]]
    """A custom `Statistics` instance, allowing the use of non-default configuration."""

+    event_manager: NotRequired[EventManager]
+    """A custom `EventManager` instance, allowing the use of non-default configuration."""
+
    configure_logging: NotRequired[bool]
    """If True, the crawler will set up logging infrastructure automatically."""

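For reference, a minimal sketch of how the two new options might be supplied. This assumes `BasicCrawlerOptions` is importable from `crawlee.basic_crawler` and that `LocalEventManager` is the concrete `EventManager` shipped in `crawlee.events`; both import paths are assumptions, not confirmed by this diff:

```python
from crawlee.basic_crawler import BasicCrawlerOptions  # assumed import path
from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager  # assumed concrete EventManager

# The first two fields are added by this diff; `configure_logging` predates it.
options: BasicCrawlerOptions = {
    'configuration': Configuration(),
    'event_manager': LocalEventManager(),
    'configure_logging': False,
}
```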
@@ -157,126 +165,136 @@ class BasicCrawler(Generic[TCrawlingContext]):
    def __init__(
        self,
        *,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
        request_provider: RequestProvider | None = None,
-        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
+        session_pool: SessionPool | None = None,
+        proxy_configuration: ProxyConfiguration | None = None,
        http_client: BaseHttpClient | None = None,
-        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None,
        max_request_retries: int = 3,
        max_requests_per_crawl: int | None = None,
        max_session_rotations: int = 10,
-        request_handler_timeout: timedelta = timedelta(minutes=1),
-        session_pool: SessionPool | None = None,
+        max_crawl_depth: int | None = None,
        use_session_pool: bool = True,
        retry_on_blocked: bool = True,
-        proxy_configuration: ProxyConfiguration | None = None,
+        concurrency_settings: ConcurrencySettings | None = None,
+        request_handler_timeout: timedelta = timedelta(minutes=1),
        statistics: Statistics | None = None,
        configure_logging: bool = True,
-        max_crawl_depth: int | None = None,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
        _additional_context_managers: Sequence[AsyncContextManager] | None = None,
        _logger: logging.Logger | None = None,
    ) -> None:
        """A default constructor.

        Args:
+            configuration: The configuration object. Some of its properties are used as defaults for the crawler.
+            event_manager: The event manager for managing events for the crawler and all its components.
            request_provider: Provider for requests to be processed by the crawler.
-            request_handler: A callable responsible for handling requests.
+            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
+            proxy_configuration: HTTP proxy configuration used when making requests.
            http_client: HTTP client used by `BasicCrawlingContext.send_request` and the HTTP-based crawling.
-            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler: A callable responsible for handling requests.
            max_request_retries: Maximum number of attempts to process a single request.
            max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
                this value.
            max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                if a proxy error occurs or if the website blocks the request.
-            request_handler_timeout: Maximum duration allowed for a single request handler to run.
+            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
            use_session_pool: Enable the use of a session pool for managing sessions during crawling.
-            session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
            retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
-            proxy_configuration: HTTP proxy configuration used when making requests.
+            concurrency_settings: Settings to fine-tune concurrency levels.
+            request_handler_timeout: Maximum duration allowed for a single request handler to run.
            statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
            configure_logging: If True, the crawler will set up logging infrastructure automatically.
-            max_crawl_depth: Maximum crawl depth. If set, the crawler will stop crawling after reaching this depth.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
        """
-        self._router: Router[TCrawlingContext] | None = None
+        if configuration:
+            service_container.set_configuration(configuration)
+        if event_manager:
+            service_container.set_event_manager(event_manager)
+
+        config = service_container.get_configuration()
+
+        # Core components
+        self._request_provider = request_provider
+        self._session_pool = session_pool or SessionPool()
+        self._proxy_configuration = proxy_configuration
+        self._http_client = http_client or HttpxHttpClient()

+        # Request router setup
+        self._router: Router[TCrawlingContext] | None = None
        if isinstance(cast(Router, request_handler), Router):
            self._router = cast(Router[TCrawlingContext], request_handler)
        elif request_handler is not None:
            self._router = None
            self.router.default_handler(request_handler)

-        self._http_client = http_client or HttpxHttpClient()
-
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
-
+        # Error & failed request handlers
        self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
        self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None

+        # Context pipeline
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+
+        # Crawl settings
        self._max_request_retries = max_request_retries
        self._max_requests_per_crawl = max_requests_per_crawl
        self._max_session_rotations = max_session_rotations
+        self._max_crawl_depth = max_crawl_depth

-        self._request_provider = request_provider
-
-        config = service_container.get_configuration()
-
+        # Timeouts
        self._request_handler_timeout = request_handler_timeout
        self._internal_timeout = (
            config.internal_timeout
            if config.internal_timeout is not None
            else max(2 * request_handler_timeout, timedelta(minutes=5))
        )

-        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
-
-        self._event_manager = service_container.get_event_manager()
-        self._snapshotter = Snapshotter(
-            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
-            available_memory_ratio=config.available_memory_ratio,
-        )
-        self._pool = AutoscaledPool(
-            system_status=SystemStatus(self._snapshotter),
-            is_finished_function=self.__is_finished_function,
-            is_task_ready_function=self.__is_task_ready_function,
-            run_task_function=self.__run_task_function,
-            concurrency_settings=concurrency_settings,
-        )
-
+        # Retry and session settings
        self._use_session_pool = use_session_pool
-        self._session_pool = session_pool or SessionPool()
-
        self._retry_on_blocked = retry_on_blocked

+        # Logging setup
        if configure_logging:
            root_logger = logging.getLogger()
            configure_logger(root_logger, remove_old_handlers=True)
-
-            # Silence HTTPX logger
-            httpx_logger = logging.getLogger('httpx')
+            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+        self._logger = _logger or logging.getLogger(__name__)

-        if not _logger:
-            _logger = logging.getLogger(__name__)
-
-        self._logger = _logger
-
-        self._proxy_configuration = proxy_configuration
+        # Statistics
        self._statistics = statistics or Statistics(
-            event_manager=self._event_manager,
            periodic_message_logger=self._logger,
            log_message='Current request statistics:',
        )
+
+        # Additional context managers to enter and exit
        self._additional_context_managers = _additional_context_managers or []

+        # Internal, not explicitly configurable components
+        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
+        self._snapshotter = Snapshotter(
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
+        )
+        self._autoscaled_pool = AutoscaledPool(
+            system_status=SystemStatus(self._snapshotter),
+            is_finished_function=self.__is_finished_function,
+            is_task_ready_function=self.__is_task_ready_function,
+            run_task_function=self.__run_task_function,
+            concurrency_settings=concurrency_settings,
+        )
+
+        # State flags
        self._running = False
        self._has_finished_before = False
-        self._max_crawl_depth = max_crawl_depth

    @property
    def log(self) -> logging.Logger:
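The reordered constructor above now registers caller-supplied services globally before reading defaults back out of the container. A sketch of the resulting behavior, assuming `service_container` is importable as `crawlee.service_container` and exposes the getter/setter pair used in the diff:

```python
from crawlee import service_container  # assumed import path
from crawlee.basic_crawler import BasicCrawler  # assumed import path
from crawlee.configuration import Configuration

config = Configuration()
crawler = BasicCrawler(configuration=config)

# The constructor called service_container.set_configuration(config),
# so any component that resolves the configuration gets the same object.
assert service_container.get_configuration() is config
```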
@@ -459,8 +477,10 @@ def sigint_handler() -> None:
        return final_statistics

    async def _run_crawler(self) -> None:
+        event_manager = service_container.get_event_manager()
+
        async with AsyncExitStack() as exit_stack:
-            await exit_stack.enter_async_context(self._event_manager)
+            await exit_stack.enter_async_context(event_manager)
            await exit_stack.enter_async_context(self._snapshotter)
            await exit_stack.enter_async_context(self._statistics)
@@ -470,7 +490,7 @@ async def _run_crawler(self) -> None:
            for context_manager in self._additional_context_managers:
                await exit_stack.enter_async_context(context_manager)

-            await self._pool.run()
+            await self._autoscaled_pool.run()

    async def add_requests(
        self,
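Because `_run_crawler` now resolves the event manager from `service_container` at run time instead of caching it in `__init__`, a manager registered after the crawler object is constructed still takes effect. A sketch under the same import-path assumptions as above; whether replacing the manager this late is permitted by the container is also an assumption:

```python
import asyncio

from crawlee import service_container  # assumed import path
from crawlee.basic_crawler import BasicCrawler  # assumed import path
from crawlee.events import LocalEventManager  # assumed concrete EventManager

async def main() -> None:
    crawler = BasicCrawler()
    # Registered after construction, but still picked up, because
    # _run_crawler() calls service_container.get_event_manager() lazily.
    service_container.set_event_manager(LocalEventManager())
    await crawler.run([])

asyncio.run(main())
```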