Skip to content

Commit e26d3c1

Browse files
authored
Restart on browser crash (#295)
* Browser disconnected callback * PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER setting * Document PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER * Readme adjustment * Rename tests * Test browser restart * Simplify * Less lines * Remove browser name from logs
1 parent f3b1b25 commit e26d3c1

File tree

5 files changed

+84
-11
lines changed

5 files changed

+84
-11
lines changed

README.md

+11
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,17 @@ def custom_headers(
324324
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers
325325
```
326326

327+
### `PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER`
328+
Type `bool`, default `True`
329+
330+
Whether the browser will be restarted if it gets disconnected, for instance if the local
331+
browser crashes or a remote connection times out.
332+
This is implemented by listening to the
333+
[`disconnected` Browser event](https://playwright.dev/python/docs/api/class-browser#browser-event-disconnected);
334+
for this reason, it does not apply to persistent contexts, since
335+
[BrowserType.launch_persistent_context](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context)
336+
returns the context directly.
337+
327338
### `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT`
328339
Type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting
329340

scrapy_playwright/handler.py

+15
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ class Config:
9191
max_contexts: Optional[int]
9292
startup_context_kwargs: dict
9393
navigation_timeout: Optional[float]
94+
restart_disconnected_browser: bool
9495

9596
@classmethod
9697
def from_settings(cls, settings: Settings) -> "Config":
@@ -111,6 +112,9 @@ def from_settings(cls, settings: Settings) -> "Config":
111112
navigation_timeout=_get_float_setting(
112113
settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"
113114
),
115+
restart_disconnected_browser=settings.getbool(
116+
"PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True
117+
),
114118
)
115119
cfg.cdp_kwargs.pop("endpoint_url", None)
116120
cfg.connect_kwargs.pop("ws_endpoint", None)
@@ -188,6 +192,7 @@ async def _maybe_launch_browser(self) -> None:
188192
logger.info("Launching browser %s", self.browser_type.name)
189193
self.browser = await self.browser_type.launch(**self.config.launch_options)
190194
logger.info("Browser %s launched", self.browser_type.name)
195+
self.browser.on("disconnected", self._browser_disconnected_callback)
191196

192197
async def _maybe_connect_remote_devtools(self) -> None:
193198
async with self.browser_launch_lock:
@@ -197,6 +202,7 @@ async def _maybe_connect_remote_devtools(self) -> None:
197202
self.config.cdp_url, **self.config.cdp_kwargs
198203
)
199204
logger.info("Connected using CDP: %s", self.config.cdp_url)
205+
self.browser.on("disconnected", self._browser_disconnected_callback)
200206

201207
async def _maybe_connect_remote(self) -> None:
202208
async with self.browser_launch_lock:
@@ -206,6 +212,7 @@ async def _maybe_connect_remote(self) -> None:
206212
self.config.connect_url, **self.config.connect_kwargs
207213
)
208214
logger.info("Connected to remote Playwright")
215+
self.browser.on("disconnected", self._browser_disconnected_callback)
209216

210217
async def _create_browser_context(
211218
self,
@@ -599,6 +606,14 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
599606
self.stats.inc_value(f"{stats_prefix}/resource_type/{response.request.resource_type}")
600607
self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}")
601608

609+
async def _browser_disconnected_callback(self) -> None:
610+
await asyncio.gather(
611+
*[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()]
612+
)
613+
logger.debug("Browser disconnected")
614+
if self.config.restart_disconnected_browser:
615+
del self.browser
616+
602617
def _make_close_page_callback(self, context_name: str) -> Callable:
603618
def close_page_callback() -> None:
604619
if context_name in self.context_wrappers:

tests/__init__.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import platform
55
from contextlib import asynccontextmanager
66
from functools import wraps
7+
from typing import Optional
78

89
from scrapy import Request
910
from scrapy.http.response.html import HtmlResponse
@@ -37,12 +38,13 @@ def allow_windows(test_method):
3738

3839

3940
@asynccontextmanager
40-
async def make_handler(settings_dict: dict):
41+
async def make_handler(settings_dict: Optional[dict] = None):
4142
"""Convenience function to obtain an initialized handler and close it gracefully"""
4243
from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
4344

44-
settings_dict.setdefault("TELNETCONSOLE_ENABLED", False)
45-
crawler = get_crawler(settings_dict=settings_dict)
45+
settings: dict = settings_dict or {}
46+
settings.setdefault("TELNETCONSOLE_ENABLED", False)
47+
crawler = get_crawler(settings_dict=settings)
4648
handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
4749
try:
4850
await handler._launch()

tests/launch_browser_server.js renamed to tests/launch_chromium_server.js

+1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'.
1010
port: process.argv[2],
1111
wsPath: process.argv[3]
1212
});
13+
console.log(browserServer.wsEndpoint())
1314
})();

tests/tests_asyncio/test_remote.py renamed to tests/tests_asyncio/test_browser.py

+52-8
Original file line numberDiff line numberDiff line change
@@ -42,29 +42,29 @@ async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]:
4242
return proc, devtools_url
4343

4444

45-
def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]:
45+
def _run_chromium_browser_server() -> Tuple[subprocess.Popen, str]:
4646
"""Start a Playwright server in a separate process, return the process
4747
object and a string with its websocket endpoint.
4848
Pass fixed port and ws path as arguments instead of allowing Playwright
4949
to choose, for some reason I was unable to capture stdout/stderr :shrug:
5050
"""
5151
port = str(random.randint(60_000, 63_000))
5252
ws_path = str(uuid.uuid4())
53-
launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js")
53+
launch_server_script_path = str(Path(__file__).parent.parent / "launch_chromium_server.js")
5454
command = ["node", launch_server_script_path, port, ws_path]
5555
proc = subprocess.Popen(command) # pylint: disable=consider-using-with
5656
return proc, f"ws://localhost:{port}/{ws_path}"
5757

5858

5959
@asynccontextmanager
60-
async def remote_browser(is_chrome_devtools_protocol: bool = True):
60+
async def remote_chromium(with_devtools_protocol: bool = True):
6161
"""Launch a remote browser that lasts while in the context."""
6262
proc = url = None
6363
try:
64-
if is_chrome_devtools_protocol:
64+
if with_devtools_protocol:
6565
proc, url = await _run_chromium_devtools()
6666
else:
67-
proc, url = _run_playwright_browser_server()
67+
proc, url = _run_chromium_browser_server()
6868
await asyncio.sleep(1) # allow some time for the browser to start
6969
except Exception:
7070
pass
@@ -77,15 +77,15 @@ async def remote_browser(is_chrome_devtools_protocol: bool = True):
7777
proc.communicate()
7878

7979

80-
class TestRemote(IsolatedAsyncioTestCase):
80+
class TestRemoteBrowser(IsolatedAsyncioTestCase):
8181
@pytest.fixture(autouse=True)
8282
def inject_fixtures(self, caplog):
8383
caplog.set_level(logging.DEBUG)
8484
self._caplog = caplog
8585

8686
@allow_windows
8787
async def test_connect_devtools(self):
88-
async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url:
88+
async with remote_chromium(with_devtools_protocol=True) as devtools_url:
8989
settings_dict = {
9090
"PLAYWRIGHT_CDP_URL": devtools_url,
9191
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
@@ -103,7 +103,7 @@ async def test_connect_devtools(self):
103103

104104
@allow_windows
105105
async def test_connect(self):
106-
async with remote_browser(is_chrome_devtools_protocol=False) as browser_url:
106+
async with remote_chromium(with_devtools_protocol=False) as browser_url:
107107
settings_dict = {
108108
"PLAYWRIGHT_CONNECT_URL": browser_url,
109109
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
@@ -128,3 +128,47 @@ async def test_connect(self):
128128
logging.WARNING,
129129
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
130130
) in self._caplog.record_tuples
131+
132+
133+
class TestBrowserReconnect(IsolatedAsyncioTestCase):
134+
@pytest.fixture(autouse=True)
135+
def inject_fixtures(self, caplog):
136+
caplog.set_level(logging.DEBUG)
137+
self._caplog = caplog
138+
139+
@allow_windows
140+
async def test_restart_browser(self):
141+
spider = Spider("foo")
142+
async with make_handler() as handler:
143+
with StaticMockServer() as server:
144+
req1 = Request(
145+
server.urljoin("/index.html"),
146+
meta={"playwright": True, "playwright_include_page": True},
147+
)
148+
resp1 = await handler._download_request(req1, spider)
149+
page = resp1.meta["playwright_page"]
150+
await page.context.browser.close()
151+
req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
152+
resp2 = await handler._download_request(req2, spider)
153+
assert_correct_response(resp1, req1)
154+
assert_correct_response(resp2, req2)
155+
assert (
156+
self._caplog.record_tuples.count(
157+
(
158+
"scrapy-playwright",
159+
logging.DEBUG,
160+
"Browser disconnected",
161+
)
162+
)
163+
== 2 # one mid-crawl after calling Browser.close() manually, one at the end
164+
)
165+
assert (
166+
self._caplog.record_tuples.count(
167+
(
168+
"scrapy-playwright",
169+
logging.INFO,
170+
"Launching browser chromium",
171+
)
172+
)
173+
== 2 # one at the beginning, one after calling Browser.close() manually
174+
)

0 commit comments

Comments
 (0)