Skip to content

Commit f3b1b25

Browse files
authored
Return proper status and headers for downloads (#293)
* Return proper status and headers for downloads
* Test headers for downloads
1 parent ab103fe commit f3b1b25

File tree

2 files changed

+38
-23
lines changed

2 files changed

+38
-23
lines changed

scrapy_playwright/handler.py

Lines changed: 36 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,15 @@
22
import logging
33
import platform
44
from contextlib import suppress
5-
from dataclasses import dataclass
5+
from dataclasses import dataclass, field as dataclass_field
66
from ipaddress import ip_address
77
from time import time
88
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
99

1010
from playwright.async_api import (
1111
BrowserContext,
1212
BrowserType,
13-
Download,
13+
Download as PlaywrightDownload,
1414
Error as PlaywrightError,
1515
Page,
1616
Playwright as AsyncPlaywright,
@@ -66,6 +66,19 @@ class BrowserContextWrapper:
6666
persistent: bool
6767

6868

69+
@dataclass
70+
class Download:
71+
body: bytes = b""
72+
url: str = ""
73+
suggested_filename: str = ""
74+
exception: Optional[Exception] = None
75+
response_status: int = 200
76+
headers: dict = dataclass_field(default_factory=dict)
77+
78+
def __bool__(self) -> bool:
79+
return bool(self.body) or bool(self.exception)
80+
81+
6982
@dataclass
7083
class Config:
7184
cdp_url: Optional[str]
@@ -397,7 +410,7 @@ async def _download_request_with_page(
397410
await _set_redirect_meta(request=request, response=response)
398411
headers = Headers(await response.all_headers())
399412
headers.pop("Content-Encoding", None)
400-
elif not download.get("bytes"):
413+
elif not download:
401414
logger.warning(
402415
"Navigating to %s returned None, the response"
403416
" will have empty headers and status 200",
@@ -428,20 +441,21 @@ async def _download_request_with_page(
428441
server_addr = await response.server_addr()
429442
server_ip_address = ip_address(server_addr["ipAddress"])
430443

431-
if download.get("exception"):
432-
raise download["exception"]
444+
if download and download.exception:
445+
raise download.exception
433446

434447
if not request.meta.get("playwright_include_page"):
435448
await page.close()
436449
self.stats.inc_value("playwright/page_count/closed")
437450

438-
if download.get("bytes"):
439-
request.meta["playwright_suggested_filename"] = download.get("suggested_filename")
440-
respcls = responsetypes.from_args(url=download["url"], body=download["bytes"])
451+
if download:
452+
request.meta["playwright_suggested_filename"] = download.suggested_filename
453+
respcls = responsetypes.from_args(url=download.url, body=download.body)
441454
return respcls(
442-
url=download["url"],
443-
status=200,
444-
body=download["bytes"],
455+
url=download.url,
456+
status=download.response_status,
457+
headers=Headers(download.headers),
458+
body=download.body,
445459
request=request,
446460
flags=["playwright"],
447461
)
@@ -461,29 +475,29 @@ async def _download_request_with_page(
461475

462476
async def _get_response_and_download(
463477
self, request: Request, page: Page, spider: Spider
464-
) -> Tuple[Optional[PlaywrightResponse], dict]:
478+
) -> Tuple[Optional[PlaywrightResponse], Optional[Download]]:
465479
response: Optional[PlaywrightResponse] = None
466-
download: dict = {} # updated in-place in _handle_download
480+
download: Download = Download() # updated in-place in _handle_download
467481
download_started = asyncio.Event()
468482
download_ready = asyncio.Event()
469483

470-
async def _handle_download(dwnld: Download) -> None:
484+
async def _handle_download(dwnld: PlaywrightDownload) -> None:
471485
download_started.set()
472486
self.stats.inc_value("playwright/download_count")
473487
try:
474488
if failure := await dwnld.failure():
475489
raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
476-
download_path = await dwnld.path()
477-
download["bytes"] = download_path.read_bytes()
478-
download["url"] = dwnld.url
479-
download["suggested_filename"] = dwnld.suggested_filename
490+
download.body = (await dwnld.path()).read_bytes()
491+
download.url = dwnld.url
492+
download.suggested_filename = dwnld.suggested_filename
480493
except Exception as ex:
481-
download["exception"] = ex
494+
download.exception = ex
482495
finally:
483496
download_ready.set()
484497

485498
async def _handle_response(response: PlaywrightResponse) -> None:
486-
download["response_status"] = response.status
499+
download.response_status = response.status
500+
download.headers = await response.all_headers()
487501
download_started.set()
488502

489503
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
@@ -513,7 +527,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
513527
)
514528
await download_started.wait()
515529

516-
if download.get("response_status") == 204:
530+
if download.response_status == 204:
517531
raise err
518532

519533
logger.debug(
@@ -531,7 +545,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
531545
page.remove_listener("download", _handle_download)
532546
page.remove_listener("response", _handle_response)
533547

534-
return response, download
548+
return response, download if download else None
535549

536550
async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
537551
context_name = request.meta.get("playwright_context")

tests/tests_asyncio/test_playwright_requests.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -386,7 +386,7 @@ async def test_logging_record_spider(self):
386386
assert any(getattr(rec, "spider", None) is spider for rec in self._caplog.records)
387387

388388
@allow_windows
389-
async def test_download_file(self):
389+
async def test_download_file_ok(self):
390390
settings_dict = {
391391
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
392392
}
@@ -399,6 +399,7 @@ async def test_download_file(self):
399399
response = await handler._download_request(request, Spider("foo"))
400400
assert response.meta["playwright_suggested_filename"] == "mancha.pdf"
401401
assert response.body.startswith(b"%PDF-1.5")
402+
assert response.headers.get("Content-Type") == b"application/pdf"
402403
assert handler.stats.get_value("playwright/download_count") == 1
403404

404405
@allow_windows

0 commit comments

Comments
 (0)