diff --git a/README.md b/README.md index e949156..f7be891 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Scrapy with selenium [![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability) -Scrapy middleware to handle javascript pages using selenium. +Scrapy middleware to handle javascript pages using selenium with better proxy support. ## Installation ``` @@ -18,6 +18,7 @@ You will also need one of the Selenium [compatible browsers](http://www.selenium SELENIUM_DRIVER_NAME = 'firefox' SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver') SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox + SELENIUM_DRIVER_MAX_CONCURRENT=8 # maximal number of drivers to be running concurrently. By default, it is 8. ``` Optionally, set the path to the browser executable: @@ -36,7 +37,7 @@ Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Reques ```python from scrapy_selenium import SeleniumRequest -yield SeleniumRequest(url, self.parse_result) +yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'some proxy url here'}) ``` The request will be handled by selenium, and the request will have an additional `meta` key, named `driver` containing the selenium driver with the request processed. 
```python @@ -92,3 +93,15 @@ yield SeleniumRequest( script='window.scrollTo(0, document.body.scrollHeight);', ) ``` + +#### `proxy` + +```python +from scrapy_selenium import SeleniumRequest + +yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'your proxy url here'}) +``` + +Here you can provide your proxy to the request. If the proxy has already been used, the existing Selenium driver with that proxy will be used (if not yet evicted from the driver list); otherwise, a new driver with that proxy will be created instead. + +Internally, we use an LRU cache to track the active drivers. diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e6e2710..01fd23e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,3 @@ scrapy>=1.0.0 selenium>=3.9.0 +pylru>=1.1.0 diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 3761ca5..62b9832 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -1,20 +1,35 @@ """This module contains the ``SeleniumMiddleware`` scrapy middleware""" from importlib import import_module +from typing import Iterable +from pylru import lrucache from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.http import HtmlResponse +from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.support.ui import WebDriverWait from .http import SeleniumRequest +def on_driver_removed(proxy: str, driver: WebDriver): + """ + Closes the webdriver when evicted from the cache. 
+ + :param proxy: the proxy, not used + :param driver: the driver being evicted + """ + driver.quit() + + class SeleniumMiddleware: """Scrapy middleware handling the requests using selenium""" + # default proxy which is no proxy at all + default_proxy = '' - def __init__(self, driver_name, driver_executable_path, driver_arguments, - browser_executable_path): + def __init__(self, driver_name: str, driver_executable_path: str, driver_arguments: Iterable[str], + browser_executable_path: str, max_concurrent_driver: int=8): """Initialize the selenium webdriver Parameters @@ -27,6 +42,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, A list of arguments to initialize the driver browser_executable_path: str The path of the executable binary of the browser + max_concurrent_driver: int + The maximal number of concurrent drivers to be held """ webdriver_base_path = f'selenium.webdriver.{driver_name}' @@ -48,7 +65,18 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments, f'{driver_name}_options': driver_options } - self.driver = driver_klass(**driver_kwargs) + def create_driver(proxy: str) -> WebDriver: + """ + Creates a new driver, with optional proxy + + :param proxy: the proxy, which should be something like http://... or https://... 
+ or socks:// + :return: a webdriver created with the provided proxy + """ + if proxy is not None and isinstance(proxy, str) and len(proxy) > 0: + driver_kwargs[f'{driver_name}_options'].add_argument("--proxy-server={}".format(proxy)) + return driver_klass(**driver_kwargs) + self.create_driver = create_driver + self.drivers = lrucache(max_concurrent_driver, on_driver_removed) @@ -58,7 +86,7 @@ def from_crawler(cls, crawler): driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH') driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') + max_concurrent_driver = crawler.settings.get('SELENIUM_DRIVER_MAX_CONCURRENT') if not driver_name or not driver_executable_path: raise NotConfigured( @@ -68,7 +97,8 @@ def from_crawler(cls, crawler): driver_name=driver_name, driver_executable_path=driver_executable_path, driver_arguments=driver_arguments, - browser_executable_path=browser_executable_path + browser_executable_path=browser_executable_path, + max_concurrent_driver=max_concurrent_driver ) crawler.signals.connect(middleware.spider_closed, signals.spider_closed) @@ -81,10 +111,15 @@ def process_request(self, request, spider): if not isinstance(request, SeleniumRequest): return None - self.driver.get(request.url) + # look up (or lazily create) the LRU-cached driver for this request's proxy + proxy = request.meta.get('proxy', self.default_proxy) + if proxy not in self.drivers: + self.drivers[proxy] = self.create_driver(proxy) + driver = self.drivers[proxy] + driver.get(request.url) for cookie_name, cookie_value in request.cookies.items(): - self.driver.add_cookie( + driver.add_cookie( { 'name': cookie_name, 'value': cookie_value @@ -92,23 +127,23 @@ ) if request.wait_until: - 
WebDriverWait(self.driver, request.wait_time).until( + WebDriverWait(driver, request.wait_time).until( request.wait_until ) if request.screenshot: - request.meta['screenshot'] = self.driver.get_screenshot_as_png() + request.meta['screenshot'] = driver.get_screenshot_as_png() if request.script: - self.driver.execute_script(request.script) + driver.execute_script(request.script) - body = str.encode(self.driver.page_source) + body = str.encode(driver.page_source) # Expose the driver via the "meta" attribute - request.meta.update({'driver': self.driver}) + request.meta.update({'driver': driver}) return HtmlResponse( - self.driver.current_url, + driver.current_url, body=body, encoding='utf-8', request=request @@ -117,5 +152,10 @@ def process_request(self, request, spider): def spider_closed(self): """Shutdown the driver when spider is closed""" - self.driver.quit() + # NOTE(review): pylru invokes the eviction callback on capacity eviction; + # quit every cached driver explicitly so shutdown does not leak + # browser processes if clear() skips the callback — confirm pylru semantics. + for proxy in list(self.drivers.keys()): + self.drivers[proxy].quit() + self.drivers.clear() diff --git a/tests/test_cases.py b/tests/test_cases.py index fbb992b..074c964 100644 --- a/tests/test_cases.py +++ b/tests/test_cases.py @@ -24,6 +24,7 @@ def setUpClass(cls): cls.settings = { 'SELENIUM_DRIVER_NAME': 'firefox', 'SELENIUM_DRIVER_EXECUTABLE_PATH': which('geckodriver'), - 'SELENIUM_DRIVER_ARGUMENTS': ['-headless'] + 'SELENIUM_DRIVER_ARGUMENTS': ['-headless'], + 'SELENIUM_DRIVER_MAX_CONCURRENT': 2, } cls.spider_klass = cls.SimpleSpider diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index fe365e4..3bcc929 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -33,26 +33,7 @@ def tearDownClass(cls): super().tearDownClass() - cls.selenium_middleware.driver.quit() - - def test_from_crawler_method_should_initialize_the_driver(self): - """Test that the ``from_crawler`` method should initialize the selenium driver""" - - crawler = Crawler( - spidercls=self.spider_klass, - settings=self.settings - ) - - selenium_middleware = SeleniumMiddleware.from_crawler(crawler) - - # The driver must be initialized - 
self.assertIsNotNone(selenium_middleware.driver) - - # We can now use the driver - selenium_middleware.driver.get('http://www.python.org') - self.assertIn('Python', selenium_middleware.driver.title) - - selenium_middleware.driver.close() + cls.selenium_middleware.spider_closed() def test_spider_closed_should_close_the_driver(self): """Test that the ``spider_closed`` method should close the driver""" @@ -64,10 +45,10 @@ def test_spider_closed_should_close_the_driver(self): selenium_middleware = SeleniumMiddleware.from_crawler(crawler) - with patch.object(selenium_middleware.driver, 'quit') as mocked_quit: - selenium_middleware.spider_closed() - - mocked_quit.assert_called_once() + selenium_middleware.process_request(SeleniumRequest(url='http://www.python.org')) + self.assertEqual(len(selenium_middleware.drivers), 1) + selenium_middleware.spider_closed() + self.assertEqual(len(selenium_middleware.drivers), 0) def test_process_request_should_return_none_if_not_selenium_request(self): """Test that the ``process_request`` should return none if not selenium request""" @@ -135,3 +116,40 @@ def test_process_request_should_execute_script_if_script_option(self): html_response.selector.xpath('//title/text()').extract_first(), 'scrapy_selenium' ) + + def test_max_concurrent_driver(self): + """Test that up to max_concurrent_driver should be alive. 
Evicted driver should be closed.""" + self.selenium_middleware.process_request(SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + )) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] + self.selenium_middleware.process_request(SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.2'} + )) + self.assertEqual(len(self.selenium_middleware.drivers), 2) + self.selenium_middleware.process_request(SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.3'} + )) + # one of the drivers is evicted + self.assertEqual(len(self.selenium_middleware.drivers), 2) + # when a driver quits, the session id will be None + self.assertEqual(driver1.session_id, None) + + def test_same_proxy_should_reuse_driver(self): + self.selenium_middleware.process_request(SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + )) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver1 = self.selenium_middleware.drivers['http://1.1.1.1'] + self.selenium_middleware.process_request(SeleniumRequest( + url='http://www.python.org', + meta={'proxy': 'http://1.1.1.1'} + )) + self.assertEqual(len(self.selenium_middleware.drivers), 1) + driver2 = self.selenium_middleware.drivers['http://1.1.1.1'] + self.assertEqual(driver1.session_id, driver2.session_id)