Add better proxy support for the middleware. #33

Open · wants to merge 4 commits into develop
17 changes: 15 additions & 2 deletions README.md
@@ -1,7 +1,7 @@
# Scrapy with selenium
[![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability)

Scrapy middleware to handle javascript pages using selenium.
Scrapy middleware to handle javascript pages using selenium with better proxy support.

## Installation
```
@@ -18,6 +18,7 @@ You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/).
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_MAX_CONCURRENT=8 # maximum number of drivers running concurrently (defaults to 8)
```

Optionally, set the path to the browser executable:
@@ -36,7 +37,7 @@ Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Request` like below:
```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url, self.parse_result)
yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'some proxy url here'})
```
The request will be handled by selenium, and the request will have an additional `meta` key named `driver`, containing the selenium driver that processed the request.
```python
def parse_result(self, response):
    print(response.request.meta['driver'].title)
```

@@ -92,3 +93,15 @@ yield SeleniumRequest(
script='window.scrollTo(0, document.body.scrollHeight);',
)
```

#### `proxy`

```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'your proxy url here'})
```

Here you can provide your proxy for the request. If the proxy has already been used, the existing Selenium driver with that proxy will be reused (provided it has not yet been evicted from the driver cache); otherwise, a new driver with that proxy will be created.

Internally, we use an LRU cache to track the active drivers.
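
For illustration, here is a minimal sketch of the eviction mechanism (not part of this change set; placeholder strings stand in for real webdriver instances):

```python
from pylru import lrucache

def on_evict(proxy, driver):
    # pylru invokes this callback when an entry is pushed out of the
    # cache; the middleware uses the same hook to call driver.quit().
    print(f'evicting driver for proxy {proxy!r}')

# The capacity mirrors SELENIUM_DRIVER_MAX_CONCURRENT.
drivers = lrucache(2, on_evict)
drivers['http://proxy-a'] = 'driver-a'
drivers['http://proxy-b'] = 'driver-b'
drivers['http://proxy-c'] = 'driver-c'  # evicts 'http://proxy-a', the least recently used entry
```
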
1 change: 1 addition & 0 deletions requirements/requirements.txt
@@ -1,2 +1,3 @@
scrapy>=1.0.0
selenium>=3.9.0
pylru>=1.1.0
61 changes: 48 additions & 13 deletions scrapy_selenium/middlewares.py
@@ -1,20 +1,35 @@
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""

from importlib import import_module
from typing import Iterable

from pylru import lrucache
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.remote.webdriver import WebDriver  # browser-agnostic base class, for type hints
from selenium.webdriver.support.ui import WebDriverWait

from .http import SeleniumRequest


def on_driver_removed(proxy: str, driver: WebDriver):
"""
Closes the webdriver when evicted from the cache.

:param proxy: the proxy, not used
:param driver: the driver being evicted
"""
driver.quit()


class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""
# default proxy, which is no proxy at all
default_proxy = ''

def __init__(self, driver_name, driver_executable_path, driver_arguments,
browser_executable_path):
def __init__(self, driver_name: str, driver_executable_path: str, driver_arguments: Iterable[str],
browser_executable_path: str, max_concurrent_driver: int = 8):
"""Initialize the selenium webdriver

Parameters
@@ -27,6 +42,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments,
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
max_concurrent_driver: int
The maximum number of concurrent drivers to keep alive
"""

webdriver_base_path = f'selenium.webdriver.{driver_name}'
@@ -48,7 +65,18 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments,
f'{driver_name}_options': driver_options
}

self.driver = driver_klass(**driver_kwargs)
def create_driver(proxy: str) -> WebDriver:
"""
Creates a new driver, with optional proxy

:param proxy: the proxy, which should be something like http://... or https://... or socks://
:return: a webdriver created with the provided proxy
"""
if proxy:
    # note: this appends to the shared options object created in
    # __init__, so the argument persists for later create_driver calls
    driver_kwargs[f'{driver_name}_options'].add_argument(f'--proxy-server={proxy}')
return driver_klass(**driver_kwargs)
self.create_driver = create_driver
self.drivers = lrucache(max_concurrent_driver, on_driver_removed)

@classmethod
def from_crawler(cls, crawler):
@@ -58,6 +86,7 @@ def from_crawler(cls, crawler):
driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
max_concurrent_driver = crawler.settings.getint('SELENIUM_DRIVER_MAX_CONCURRENT', 8)

if not driver_name or not driver_executable_path:
raise NotConfigured(
@@ -68,7 +97,8 @@
driver_name=driver_name,
driver_executable_path=driver_executable_path,
driver_arguments=driver_arguments,
browser_executable_path=browser_executable_path
browser_executable_path=browser_executable_path,
max_concurrent_driver=max_concurrent_driver
)

crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
@@ -81,34 +111,39 @@ def process_request(self, request, spider):
if not isinstance(request, SeleniumRequest):
return None

# request a proxy; reuse the cached driver for it when possible
proxy = request.meta.get('proxy', self.default_proxy)
if proxy not in self.drivers:
    # this proxy is new, create a driver with this proxy
    self.drivers[proxy] = self.create_driver(proxy)
driver = self.drivers[proxy]

driver.get(request.url)

for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie(
driver.add_cookie(
{
'name': cookie_name,
'value': cookie_value
}
)

if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(
WebDriverWait(driver, request.wait_time).until(
request.wait_until
)

if request.screenshot:
request.meta['screenshot'] = self.driver.get_screenshot_as_png()
request.meta['screenshot'] = driver.get_screenshot_as_png()

if request.script:
self.driver.execute_script(request.script)
driver.execute_script(request.script)

body = str.encode(self.driver.page_source)
body = str.encode(driver.page_source)

# Expose the driver via the "meta" attribute
request.meta.update({'driver': self.driver})
request.meta.update({'driver': driver})

return HtmlResponse(
self.driver.current_url,
driver.current_url,
body=body,
encoding='utf-8',
request=request
Expand All @@ -117,5 +152,5 @@ def process_request(self, request, spider):
def spider_closed(self):
"""Shutdown the driver when spider is closed"""

self.driver.quit()
self.drivers.clear()
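
To tie the pieces together, here is a hypothetical spider sketch (not part of this change set; the spider name, proxy URLs, and target page are placeholders) showing how the per-proxy drivers would be exercised:

```python
import scrapy
from scrapy_selenium import SeleniumRequest

class ProxiedSpider(scrapy.Spider):
    name = 'proxied'

    def start_requests(self):
        # Requests that share a proxy reuse one cached driver; distinct
        # proxies get their own drivers, up to SELENIUM_DRIVER_MAX_CONCURRENT.
        for proxy in ('http://1.1.1.1:8080', 'http://1.1.1.2:8080'):
            yield SeleniumRequest(
                url='http://www.python.org',
                callback=self.parse_result,
                meta={'proxy': proxy},
            )

    def parse_result(self, response):
        yield {'title': response.selector.xpath('//title/text()').extract_first()}
```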

3 changes: 2 additions & 1 deletion tests/test_cases.py
@@ -24,6 +24,7 @@ def setUpClass(cls):
cls.settings = {
'SELENIUM_DRIVER_NAME': 'firefox',
'SELENIUM_DRIVER_EXECUTABLE_PATH': which('geckodriver'),
'SELENIUM_DRIVER_ARGUMENTS': ['-headless']
'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],
'SELENIUM_DRIVER_MAX_CONCURRENT': 2,
}
cls.spider_klass = cls.SimpleSpider
65 changes: 41 additions & 24 deletions tests/test_middlewares.py
@@ -33,26 +33,7 @@ def tearDownClass(cls):

super().tearDownClass()

cls.selenium_middleware.driver.quit()

def test_from_crawler_method_should_initialize_the_driver(self):
"""Test that the ``from_crawler`` method should initialize the selenium driver"""

crawler = Crawler(
spidercls=self.spider_klass,
settings=self.settings
)

selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

# The driver must be initialized
self.assertIsNotNone(selenium_middleware.driver)

# We can now use the driver
selenium_middleware.driver.get('http://www.python.org')
self.assertIn('Python', selenium_middleware.driver.title)

selenium_middleware.driver.close()
cls.selenium_middleware.spider_closed()

def test_spider_closed_should_close_the_driver(self):
"""Test that the ``spider_closed`` method should close the driver"""
@@ -64,10 +45,9 @@ def test_spider_closed_should_close_the_driver(self):

selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

with patch.object(selenium_middleware.driver, 'quit') as mocked_quit:
selenium_middleware.spider_closed()

mocked_quit.assert_called_once()
# patch each cached driver's ``quit``, then close the spider
mocked_quits = [patch.object(driver, 'quit').start() for driver in selenium_middleware.drivers.values()]
selenium_middleware.spider_closed()
for mocked_quit in mocked_quits:
    mocked_quit.assert_called_once()

def test_process_request_should_return_none_if_not_selenium_request(self):
"""Test that the ``process_request`` should return none if not selenium request"""
@@ -135,3 +115,40 @@ def test_process_request_should_execute_script_if_script_option(self):
html_response.selector.xpath('//title/text()').extract_first(),
'scrapy_selenium'
)

def test_max_concurrent_driver(self):
"""Test that up to max_concurrent_driver should be alive. Evicted driver should be closed."""
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver1 = self.selenium_middleware.drivers['http://1.1.1.1']
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.2'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 2)
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.3'}
), spider=None)
# one of the drivers is evicted
self.assertEqual(len(self.selenium_middleware.drivers), 2)
# when a driver quits, its session id becomes None
self.assertIsNone(driver1.session_id)

def test_same_proxy_should_reuse_driver(self):
    """Test that requests with the same proxy reuse the same driver."""
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver1 = self.selenium_middleware.drivers['http://1.1.1.1']
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver2 = self.selenium_middleware.drivers['http://1.1.1.1']
self.assertEqual(driver1.session_id, driver2.session_id)