From 4e35cdea4e042092ad6804ce7abe29dd82b83eb2 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Fri, 22 Feb 2019 16:32:59 +0800 Subject: [PATCH 1/3] Add better proxy support for the middleware. More specifically, the user can configure the proxy for each of their requests. If the proxy has not been used yet, a new driver will be created. --- README.md | 17 ++++++++-- requirements/requirements.txt | 1 + scrapy_selenium/middlewares.py | 61 ++++++++++++++++++++++++++-------- tests/test_cases.py | 3 +- tests/test_middlewares.py | 37 +++++++++++++++++++++ 5 files changed, 103 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index e949156..f7be891 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Scrapy with selenium [![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability) -Scrapy middleware to handle javascript pages using selenium. +Scrapy middleware to handle javascript pages using selenium with better proxy support. ## Installation ``` @@ -18,6 +18,7 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium SELENIUM_DRIVER_NAME = 'firefox' SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver') SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox + SELENIUM_DRIVER_MAX_CONCURRENT=8 # maximal number of drivers to run concurrently. By default, it is 8. 
``` Optionally, set the path to the browser executable: @@ -36,7 +37,7 @@ Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Reques ```python from scrapy_selenium import SeleniumRequest -yield SeleniumRequest(url, self.parse_result) +yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'some proxy url here'}) ``` The request will be handled by selenium, and the request will have an additional `meta` key, named `driver` containing the selenium driver with the request processed. ```python @@ -92,3 +93,15 @@ yield SeleniumRequest( script='window.scrollTo(0, document.body.scrollHeight);', ) ``` + +#### `proxy` + +```python +from scrapy_selenium import SeleniumRequest + +yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'your proxy url here'}) +``` + +Here you can provide your proxy for the request. If the proxy has already been used, the existing Selenium driver with that proxy will be reused (if not yet evicted from the driver list); otherwise, a new driver with that proxy will be created instead. + +Internally, we use an LRU cache to track the active drivers. 
diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e6e2710..26d954e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,3 @@ scrapy>=1.0.0 selenium>=3.9.0 +pylru==1.0.3p \ No newline at end of file diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 3761ca5..62b9832 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -1,20 +1,35 @@ """This module contains the ``SeleniumMiddleware`` scrapy middleware""" from importlib import import_module +from typing import Iterable +from pylru import lrucache from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.http import HtmlResponse +from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.support.ui import WebDriverWait from .http import SeleniumRequest +def on_driver_removed(proxy: str, driver: WebDriver): + """ + Closes the webdriver when evicted from the cache. + + :param proxy: the proxy, not used + :param driver: the driver being evicted + """ + driver.quit() + + class SeleniumMiddleware: """Scrapy middleware handling the requests using selenium""" + # default proxy, which is no proxy at all + default_proxy = '' - def __init__(self, driver_name, driver_executable_path, driver_arguments, - browser_executable_path): + def __init__(self, driver_name: str, driver_executable_path: str, driver_arguments: Iterable[str], + browser_executable_path: str, max_concurrent_driver: int=8): """Initialize the selenium webdriver Parameters @@ -27,6 +42,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, A list of arguments to initialize the driver browser_executable_path: str The path of the executable binary of the browser + max_concurrent_driver: int + The maximal number of concurrent drivers to be held """ webdriver_base_path = f'selenium.webdriver.{driver_name}' @@ -48,7 +65,18 @@ def __init__(self, driver_name, 
driver_executable_path, driver_arguments, f'{driver_name}_options': driver_options } - self.driver = driver_klass(**driver_kwargs) + def create_driver(proxy: str) -> WebDriver: + """ + Creates a new driver, with optional proxy + + :param proxy: the proxy, which should be something like http://... or https://... or socks:// + :return: a webdriver created with the provided proxy + """ + if proxy is not None and isinstance(proxy, str) and len(proxy) > 0: + driver_kwargs[f'{driver_name}_options'].add_argument("--proxy-server={}".format(proxy)) + return driver_klass(**driver_kwargs) + self.create_driver = create_driver + self.drivers = lrucache(max_concurrent_driver, on_driver_removed) @classmethod def from_crawler(cls, crawler): @@ -58,6 +86,7 @@ def from_crawler(cls, crawler): driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH') driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') + max_concurrent_driver = crawler.settings.get('SELENIUM_DRIVER_MAX_CONCURRENT') if not driver_name or not driver_executable_path: raise NotConfigured( @@ -68,7 +97,8 @@ def from_crawler(cls, crawler): driver_name=driver_name, driver_executable_path=driver_executable_path, driver_arguments=driver_arguments, - browser_executable_path=browser_executable_path + browser_executable_path=browser_executable_path, + max_concurrent_driver=max_concurrent_driver ) crawler.signals.connect(middleware.spider_closed, signals.spider_closed) @@ -81,10 +111,15 @@ def process_request(self, request, spider): if not isinstance(request, SeleniumRequest): return None - self.driver.get(request.url) + # request a proxy: + if request.meta.get('proxy', self.default_proxy) not in self.drivers: + # this proxy is new, create a driver with this proxy + driver = self.create_driver(request.meta.get('proxy', self.default_proxy)) + self.drivers[request.meta.get('proxy', self.default_proxy)] = driver + 
return self.drivers[request.meta.get('proxy', self.default_proxy)] for cookie_name, cookie_value in request.cookies.items(): - self.driver.add_cookie( + driver.add_cookie( { 'name': cookie_name, 'value': cookie_value @@ -92,23 +127,23 @@ def process_request(self, request, spider): ) if request.wait_until: - WebDriverWait(self.driver, request.wait_time).until( + WebDriverWait(driver, request.wait_time).until( request.wait_until ) if request.screenshot: - request.meta['screenshot'] = self.driver.get_screenshot_as_png() + request.meta['screenshot'] = driver.get_screenshot_as_png() if request.script: - self.driver.execute_script(request.script) + driver.execute_script(request.script) - body = str.encode(self.driver.page_source) + body = str.encode(driver.page_source) # Expose the driver via the "meta" attribute - request.meta.update({'driver': self.driver}) + request.meta.update({'driver': driver}) return HtmlResponse( - self.driver.current_url, + driver.current_url, body=body, encoding='utf-8', request=request @@ -117,5 +152,5 @@ def process_request(self, request, spider): def spider_closed(self): """Shutdown the driver when spider is closed""" - self.driver.quit() + self.drivers.clear() diff --git a/tests/test_cases.py b/tests/test_cases.py index fbb992b..074c964 100644 --- a/tests/test_cases.py +++ b/tests/test_cases.py @@ -24,6 +24,7 @@ def setUpClass(cls): cls.settings = { 'SELENIUM_DRIVER_NAME': 'firefox', 'SELENIUM_DRIVER_EXECUTABLE_PATH': which('geckodriver'), - 'SELENIUM_DRIVER_ARGUMENTS': ['-headless'] + 'SELENIUM_DRIVER_ARGUMENTS': ['-headless'], + 'SELENIUM_DRIVER_MAX_CONCURRENT': 2, } cls.spider_klass = cls.SimpleSpider diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index fe365e4..419d6d7 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -135,3 +135,40 @@ def test_process_request_should_execute_script_if_script_option(self): html_response.selector.xpath('//title/text()').extract_first(), 'scrapy_selenium' ) + + 
def test_max_concurrent_driver(self): + """Test that up to max_concurrent_driver should be alive. Evicted driver should be closed.""" + SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + ) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] + SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.2'} + ) + self.assertEqual(len(self.selenium_middleware.drivers), 2) + SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.3'} + ) + # one of the driver is evicted + self.assertEqual(len(self.selenium_middleware.drivers), 2) + # when driver quites, the session id will be None + self.assertEqual(driver1.session_id, None) + + def test_same_proxy_should_reuse_driver(self): + SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + ) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] + SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + ) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver2 = self.selenium_middleware.drivers['http://1.1.1.1'] + self.assertEqual(driver1.session_id, driver2.session_id) From 8badad1476aafa0ea3a7f79200aa3ca80599b2b0 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Sat, 23 Feb 2019 13:43:02 +0800 Subject: [PATCH 2/3] Fix the dependency cannot found error. 
--- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 26d954e..01fd23e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,3 +1,3 @@ scrapy>=1.0.0 selenium>=3.9.0 -pylru==1.0.3p \ No newline at end of file +pylru>=1.1.0 From 50c6d85c424a50ebae7025676835b366e58eb23a Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Mon, 25 Feb 2019 15:14:51 +0800 Subject: [PATCH 3/3] Update the tests. --- tests/test_middlewares.py | 48 ++++++++++++--------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 419d6d7..3bcc929 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -33,26 +33,7 @@ def tearDownClass(cls): super().tearDownClass() - cls.selenium_middleware.driver.quit() - - def test_from_crawler_method_should_initialize_the_driver(self): - """Test that the ``from_crawler`` method should initialize the selenium driver""" - - crawler = Crawler( - spidercls=self.spider_klass, - settings=self.settings - ) - - selenium_middleware = SeleniumMiddleware.from_crawler(crawler) - - # The driver must be initialized - self.assertIsNotNone(selenium_middleware.driver) - - # We can now use the driver - selenium_middleware.driver.get('http://www.python.org') - self.assertIn('Python', selenium_middleware.driver.title) - - selenium_middleware.driver.close() + cls.selenium_middleware.spider_closed() def test_spider_closed_should_close_the_driver(self): """Test that the ``spider_closed`` method should close the driver""" @@ -64,10 +45,9 @@ def test_spider_closed_should_close_the_driver(self): selenium_middleware = SeleniumMiddleware.from_crawler(crawler) - with patch.object(selenium_middleware.driver, 'quit') as mocked_quit: - selenium_middleware.spider_closed() - - mocked_quit.assert_called_once() + mocked_quit = [patch.object(driver, 'quit') 
for driver in selenium_middleware.drivers] + for q in mocked_quit: + q.assert_called_once() def test_process_request_should_return_none_if_not_selenium_request(self): """Test that the ``process_request`` should return none if not selenium request""" @@ -138,37 +118,37 @@ def test_process_request_should_execute_script_if_script_option(self): def test_max_concurrent_driver(self): """Test that up to max_concurrent_driver should be alive. Evicted driver should be closed.""" - SeleniumRequest( + self.selenium_middleware.process_request(SeleniumRequest( url='http://www.python.org', meta={'proxy': 'http://1.1.1.1'} - ) + )) self.assertEqual(len(self.selenium_middleware.drivers), 1) driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] - SeleniumRequest( + self.selenium_middleware.process_request(SeleniumRequest( url='http://www.python.org', meta={'proxy': 'http://1.1.1.2'} - ) + )) self.assertEqual(len(self.selenium_middleware.drivers), 2) - SeleniumRequest( + self.selenium_middleware.process_request(SeleniumRequest( url='http://www.python.org', meta={'proxy': 'http://1.1.1.3'} - ) + )) # one of the driver is evicted self.assertEqual(len(self.selenium_middleware.drivers), 2) # when driver quites, the session id will be None self.assertEqual(driver1.session_id, None) def test_same_proxy_should_reuse_driver(self): - SeleniumRequest( + self.selenium_middleware.process_request(SeleniumRequest( url='http://www.python.org', meta={'proxy': 'http://1.1.1.1'} - ) + )) self.assertEqual(len(self.selenium_middleware.drivers), 1) driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] - SeleniumRequest( + self.selenium_middleware.process_request(SeleniumRequest( url='http://www.python.org', meta={'proxy': 'http://1.1.1.1'} - ) + )) self.assertEqual(len(self.selenium_middleware.drivers), 1) driver2 = self.selenium_middleware.drivers['http://1.1.1.1'] self.assertEqual(driver1.session_id, driver2.session_id)