Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,23 +130,29 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
async def _execute_pre_navigation_hooks(
    self, context: BasicCrawlingContext
) -> AsyncGenerator[BasicCrawlingContext, None]:
    """Run all pre-navigation hooks under a freshly registered shared navigation timeout.

    A `SharedTimeout` holding the full navigation budget is registered under
    ``id(context.request)`` so that later pipeline steps handling the same
    request (the HTTP request itself, post-navigation hooks) can draw from the
    same remaining-time budget. The entry is always removed once the pipeline
    step completes, even on error.

    Args:
        context: The basic crawling context whose request keys the shared timeout.

    Yields:
        The same context, after all pre-navigation hooks have run.
    """
    # Keyed by the request object's identity: every step of this pipeline run
    # sees the same request instance, so id() is a stable key for its lifetime.
    request_id = id(context.request)
    self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)

    try:
        for hook in self._pre_navigation_hooks:
            # Each hook call consumes part of the shared navigation budget.
            async with self._shared_navigation_timeouts[request_id]:
                await hook(context)

        yield context
    finally:
        # Remove the entry so a recycled id() for a future request cannot
        # pick up a stale timeout; pop(..., None) tolerates double cleanup.
        self._shared_navigation_timeouts.pop(request_id, None)

async def _execute_post_navigation_hooks(
self, context: HttpCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
request_id = id(context.request)

for hook in self._post_navigation_hooks:
await hook(context)
if request_id in self._shared_navigation_timeouts:
async with self._shared_navigation_timeouts[request_id]:
await hook(context)
else:
await hook(context)

yield context

Expand Down Expand Up @@ -262,7 +268,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
Yields:
The original crawling context enhanced by HTTP response.
"""
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
result = await self._http_client.crawl(
request=context.request,
session=context.session,
Expand Down
19 changes: 13 additions & 6 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,21 +242,21 @@ async def _open_page(
goto_options=GotoOptions(**self._goto_options),
)

context_id = id(pre_navigation_context)
self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
request_id = id(pre_navigation_context.request)
self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)

try:
# Only use the page context manager here — it sets the current page in a context variable,
# making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
async with browser_page_context(crawlee_page.page):
for hook in self._pre_navigation_hooks:
async with self._shared_navigation_timeouts[context_id]:
async with self._shared_navigation_timeouts[request_id]:
await hook(pre_navigation_context)

# Yield should be inside the browser_page_context.
yield pre_navigation_context
finally:
self._shared_navigation_timeouts.pop(context_id, None)
self._shared_navigation_timeouts.pop(request_id, None)

def _prepare_request_interceptor(
self,
Expand Down Expand Up @@ -329,7 +329,7 @@ async def _navigate(
await context.page.route(context.request.url, route_handler)

try:
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
response = await context.page.goto(
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
)
Expand Down Expand Up @@ -496,8 +496,15 @@ async def _handle_blocked_request_by_content(
async def _execute_post_navigation_hooks(
self, context: PlaywrightPostNavCrawlingContext
) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
request_id = id(context.request)

for hook in self._post_navigation_hooks:
await hook(context)
if request_id in self._shared_navigation_timeouts:
async with self._shared_navigation_timeouts[request_id]:
await hook(context)
else:
await hook(context)

yield context

async def _create_crawling_context(
Expand Down
Loading