2
2
import logging
3
3
import platform
4
4
from contextlib import suppress
5
- from dataclasses import dataclass
5
+ from dataclasses import dataclass , field as dataclass_field
6
6
from ipaddress import ip_address
7
7
from time import time
8
8
from typing import Awaitable , Callable , Dict , Optional , Tuple , Type , TypeVar , Union
9
9
10
10
from playwright .async_api import (
11
11
BrowserContext ,
12
12
BrowserType ,
13
- Download ,
13
+ Download as PlaywrightDownload ,
14
14
Error as PlaywrightError ,
15
15
Page ,
16
16
Playwright as AsyncPlaywright ,
@@ -66,6 +66,19 @@ class BrowserContextWrapper:
66
66
persistent : bool
67
67
68
68
69
@dataclass
class Download:
    """Mutable record of a browser-initiated download.

    An instance is created with defaults and then filled in place by the
    page event handlers (``_handle_download`` / ``_handle_response``).

    Truthiness: an instance is truthy only when it carries a non-empty
    ``body`` or a captured ``exception``; otherwise it is falsy, which
    callers use to mean "no download took place".
    """

    # Raw content of the downloaded file; empty until the download is read.
    body: bytes = b""
    # URL reported by the download event.
    url: str = ""
    # File name suggested for the download.
    suggested_filename: str = ""
    # Error captured while handling the download, re-raised later by the caller.
    exception: Optional[Exception] = None
    # Status of the response observed on the page; 200 unless overwritten.
    response_status: int = 200
    # Response headers; default_factory gives every instance its own dict.
    headers: dict = dataclass_field(default_factory=dict)

    def __bool__(self) -> bool:
        """Return True when there is a non-empty body or a stored exception."""
        return bool(self.body) or bool(self.exception)
80
+
81
+
69
82
@dataclass
70
83
class Config :
71
84
cdp_url : Optional [str ]
@@ -397,7 +410,7 @@ async def _download_request_with_page(
397
410
await _set_redirect_meta (request = request , response = response )
398
411
headers = Headers (await response .all_headers ())
399
412
headers .pop ("Content-Encoding" , None )
400
- elif not download . get ( "bytes" ) :
413
+ elif not download :
401
414
logger .warning (
402
415
"Navigating to %s returned None, the response"
403
416
" will have empty headers and status 200" ,
@@ -428,20 +441,21 @@ async def _download_request_with_page(
428
441
server_addr = await response .server_addr ()
429
442
server_ip_address = ip_address (server_addr ["ipAddress" ])
430
443
431
- if download . get ( " exception" ) :
432
- raise download [ " exception" ]
444
+ if download and download . exception :
445
+ raise download . exception
433
446
434
447
if not request .meta .get ("playwright_include_page" ):
435
448
await page .close ()
436
449
self .stats .inc_value ("playwright/page_count/closed" )
437
450
438
- if download . get ( "bytes" ) :
439
- request .meta ["playwright_suggested_filename" ] = download .get ( " suggested_filename" )
440
- respcls = responsetypes .from_args (url = download [ " url" ] , body = download [ "bytes" ] )
451
+ if download :
452
+ request .meta ["playwright_suggested_filename" ] = download .suggested_filename
453
+ respcls = responsetypes .from_args (url = download . url , body = download . body )
441
454
return respcls (
442
- url = download ["url" ],
443
- status = 200 ,
444
- body = download ["bytes" ],
455
+ url = download .url ,
456
+ status = download .response_status ,
457
+ headers = Headers (download .headers ),
458
+ body = download .body ,
445
459
request = request ,
446
460
flags = ["playwright" ],
447
461
)
@@ -461,29 +475,29 @@ async def _download_request_with_page(
461
475
462
476
async def _get_response_and_download (
463
477
self , request : Request , page : Page , spider : Spider
464
- ) -> Tuple [Optional [PlaywrightResponse ], dict ]:
478
+ ) -> Tuple [Optional [PlaywrightResponse ], Optional [ Download ] ]:
465
479
response : Optional [PlaywrightResponse ] = None
466
- download : dict = {} # updated in-place in _handle_download
480
+ download : Download = Download () # updated in-place in _handle_download
467
481
download_started = asyncio .Event ()
468
482
download_ready = asyncio .Event ()
469
483
470
- async def _handle_download (dwnld : Download ) -> None :
484
+ async def _handle_download (dwnld : PlaywrightDownload ) -> None :
471
485
download_started .set ()
472
486
self .stats .inc_value ("playwright/download_count" )
473
487
try :
474
488
if failure := await dwnld .failure ():
475
489
raise RuntimeError (f"Failed to download { dwnld .url } : { failure } " )
476
- download_path = await dwnld .path ()
477
- download ["bytes" ] = download_path .read_bytes ()
478
- download ["url" ] = dwnld .url
479
- download ["suggested_filename" ] = dwnld .suggested_filename
490
+ download .body = (await dwnld .path ()).read_bytes ()
491
+ download .url = dwnld .url
492
+ download .suggested_filename = dwnld .suggested_filename
480
493
except Exception as ex :
481
- download [ " exception" ] = ex
494
+ download . exception = ex
482
495
finally :
483
496
download_ready .set ()
484
497
485
498
async def _handle_response (response : PlaywrightResponse ) -> None :
486
- download ["response_status" ] = response .status
499
+ download .response_status = response .status
500
+ download .headers = await response .all_headers ()
487
501
download_started .set ()
488
502
489
503
page_goto_kwargs = request .meta .get ("playwright_page_goto_kwargs" ) or {}
@@ -513,7 +527,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
513
527
)
514
528
await download_started .wait ()
515
529
516
- if download .get ( " response_status" ) == 204 :
530
+ if download .response_status == 204 :
517
531
raise err
518
532
519
533
logger .debug (
@@ -531,7 +545,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
531
545
page .remove_listener ("download" , _handle_download )
532
546
page .remove_listener ("response" , _handle_response )
533
547
534
- return response , download
548
+ return response , download if download else None
535
549
536
550
async def _apply_page_methods (self , page : Page , request : Request , spider : Spider ) -> None :
537
551
context_name = request .meta .get ("playwright_context" )
0 commit comments