Add better proxy support for the middleware. #33

Open · wants to merge 4 commits into develop
17 changes: 15 additions & 2 deletions README.md
@@ -1,7 +1,7 @@
# Scrapy with selenium
[![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability)

Scrapy middleware to handle javascript pages using selenium.
Scrapy middleware to handle javascript pages using selenium with better proxy support.

## Installation
```
@@ -18,6 +18,7 @@ You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/).
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_MAX_CONCURRENT=8 # maximum number of drivers running concurrently (defaults to 8)
```

Optionally, set the path to the browser executable:
@@ -36,7 +37,7 @@ Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Request` like below:
```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url, self.parse_result)
yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'some proxy url here'})
```
The request will be handled by selenium, and the request will have an additional `meta` key named `driver`, containing the selenium driver that processed the request.
```python
def parse_result(self, response):
    print(response.request.meta['driver'].title)
```

@@ -92,3 +93,15 @@ yield SeleniumRequest(
script='window.scrollTo(0, document.body.scrollHeight);',
)
```

#### `proxy`

```python
from scrapy_selenium import SeleniumRequest

yield SeleniumRequest(url, self.parse_result, meta={'proxy': 'your proxy url here'})
```

Here you can provide your proxy for the request. If the proxy has already been used, the existing Selenium driver with that proxy will be reused (provided it has not yet been evicted from the driver cache); otherwise, a new driver with that proxy will be created.

Internally, we use an LRU cache to track the active drivers.
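
For illustration, here is a minimal sketch of the eviction mechanism (not part of this change set; placeholder strings stand in for real webdriver instances):

```python
from pylru import lrucache

def on_evict(proxy, driver):
    # pylru invokes this callback when an entry is pushed out of the
    # cache; the middleware uses the same hook to call driver.quit().
    print(f'evicting driver for proxy {proxy!r}')

# The capacity mirrors SELENIUM_DRIVER_MAX_CONCURRENT.
drivers = lrucache(2, on_evict)
drivers['http://proxy-a'] = 'driver-a'
drivers['http://proxy-b'] = 'driver-b'
drivers['http://proxy-c'] = 'driver-c'  # evicts 'http://proxy-a', the least recently used entry
```
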
1 change: 1 addition & 0 deletions requirements/requirements.txt
@@ -1,2 +1,3 @@
scrapy>=1.0.0
selenium>=3.9.0
pylru>=1.1.0
61 changes: 48 additions & 13 deletions scrapy_selenium/middlewares.py
@@ -1,20 +1,35 @@
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""

from importlib import import_module
from typing import Iterable

from pylru import lrucache
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.remote.webdriver import WebDriver  # browser-agnostic base class, for type hints
from selenium.webdriver.support.ui import WebDriverWait

from .http import SeleniumRequest


def on_driver_removed(proxy: str, driver: WebDriver):
"""
Closes the webdriver when evicted from the cache.

:param proxy: the proxy, not used
:param driver: the driver being evicted
"""
driver.quit()


class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""
# default proxy, which is no proxy at all
default_proxy = ''

def __init__(self, driver_name, driver_executable_path, driver_arguments,
browser_executable_path):
def __init__(self, driver_name: str, driver_executable_path: str, driver_arguments: Iterable[str],
browser_executable_path: str, max_concurrent_driver: int = 8):
"""Initialize the selenium webdriver

Parameters
@@ -27,6 +42,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments,
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
max_concurrent_driver: int
The maximum number of concurrent drivers to keep alive
"""

webdriver_base_path = f'selenium.webdriver.{driver_name}'
@@ -48,7 +65,18 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments,
f'{driver_name}_options': driver_options
}

self.driver = driver_klass(**driver_kwargs)
def create_driver(proxy: str) -> WebDriver:
"""
Creates a new driver, with optional proxy

:param proxy: the proxy, which should be something like http://... or https://... or socks://
:return: a webdriver created with the provided proxy
"""
if proxy:
    # note: this appends to the shared options object created in
    # __init__, so the argument persists for later create_driver calls
    driver_kwargs[f'{driver_name}_options'].add_argument(f'--proxy-server={proxy}')
return driver_klass(**driver_kwargs)
self.create_driver = create_driver
self.drivers = lrucache(max_concurrent_driver, on_driver_removed)

@classmethod
def from_crawler(cls, crawler):
@@ -58,6 +86,7 @@ def from_crawler(cls, crawler):
driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
max_concurrent_driver = crawler.settings.getint('SELENIUM_DRIVER_MAX_CONCURRENT', 8)

if not driver_name or not driver_executable_path:
raise NotConfigured(
@@ -68,7 +97,8 @@
driver_name=driver_name,
driver_executable_path=driver_executable_path,
driver_arguments=driver_arguments,
browser_executable_path=browser_executable_path
browser_executable_path=browser_executable_path,
max_concurrent_driver=max_concurrent_driver
)

crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
@@ -81,34 +111,39 @@ def process_request(self, request, spider):
if not isinstance(request, SeleniumRequest):
return None

# request a proxy; reuse the cached driver for it when possible
proxy = request.meta.get('proxy', self.default_proxy)
if proxy not in self.drivers:
    # this proxy is new, create a driver with this proxy
    self.drivers[proxy] = self.create_driver(proxy)
driver = self.drivers[proxy]

driver.get(request.url)

for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie(
driver.add_cookie(
{
'name': cookie_name,
'value': cookie_value
}
)

if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(
WebDriverWait(driver, request.wait_time).until(
request.wait_until
)

if request.screenshot:
request.meta['screenshot'] = self.driver.get_screenshot_as_png()
request.meta['screenshot'] = driver.get_screenshot_as_png()

if request.script:
self.driver.execute_script(request.script)
driver.execute_script(request.script)

body = str.encode(self.driver.page_source)
body = str.encode(driver.page_source)

# Expose the driver via the "meta" attribute
request.meta.update({'driver': self.driver})
request.meta.update({'driver': driver})

return HtmlResponse(
self.driver.current_url,
driver.current_url,
body=body,
encoding='utf-8',
request=request
Expand All @@ -117,5 +152,5 @@ def process_request(self, request, spider):
def spider_closed(self):
"""Shutdown the driver when spider is closed"""

self.driver.quit()
self.drivers.clear()
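
To tie the pieces together, here is a hypothetical spider sketch (not part of this change set; the spider name, proxy URLs, and target page are placeholders) showing how the per-proxy drivers would be exercised:

```python
import scrapy
from scrapy_selenium import SeleniumRequest

class ProxiedSpider(scrapy.Spider):
    name = 'proxied'

    def start_requests(self):
        # Requests that share a proxy reuse one cached driver; distinct
        # proxies get their own drivers, up to SELENIUM_DRIVER_MAX_CONCURRENT.
        for proxy in ('http://1.1.1.1:8080', 'http://1.1.1.2:8080'):
            yield SeleniumRequest(
                url='http://www.python.org',
                callback=self.parse_result,
                meta={'proxy': proxy},
            )

    def parse_result(self, response):
        yield {'title': response.selector.xpath('//title/text()').extract_first()}
```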

3 changes: 2 additions & 1 deletion tests/test_cases.py
@@ -24,6 +24,7 @@ def setUpClass(cls):
cls.settings = {
'SELENIUM_DRIVER_NAME': 'firefox',
'SELENIUM_DRIVER_EXECUTABLE_PATH': which('geckodriver'),
'SELENIUM_DRIVER_ARGUMENTS': ['-headless']
'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],
'SELENIUM_DRIVER_MAX_CONCURRENT': 2,
}
cls.spider_klass = cls.SimpleSpider
65 changes: 41 additions & 24 deletions tests/test_middlewares.py
@@ -33,26 +33,7 @@ def tearDownClass(cls):

super().tearDownClass()

cls.selenium_middleware.driver.quit()

def test_from_crawler_method_should_initialize_the_driver(self):
"""Test that the ``from_crawler`` method should initialize the selenium driver"""

crawler = Crawler(
spidercls=self.spider_klass,
settings=self.settings
)

selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

# The driver must be initialized
self.assertIsNotNone(selenium_middleware.driver)

# We can now use the driver
selenium_middleware.driver.get('http://www.python.org')
self.assertIn('Python', selenium_middleware.driver.title)

selenium_middleware.driver.close()
cls.selenium_middleware.spider_closed()

def test_spider_closed_should_close_the_driver(self):
"""Test that the ``spider_closed`` method should close the driver"""
@@ -64,10 +45,9 @@ def test_spider_closed_should_close_the_driver(self):

selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

with patch.object(selenium_middleware.driver, 'quit') as mocked_quit:
selenium_middleware.spider_closed()

mocked_quit.assert_called_once()
# patch each cached driver's ``quit``, then close the spider
mocked_quits = [patch.object(driver, 'quit').start() for driver in selenium_middleware.drivers.values()]
selenium_middleware.spider_closed()
for mocked_quit in mocked_quits:
    mocked_quit.assert_called_once()

def test_process_request_should_return_none_if_not_selenium_request(self):
"""Test that the ``process_request`` should return none if not selenium request"""
@@ -135,3 +115,40 @@ def test_process_request_should_execute_script_if_script_option(self):
html_response.selector.xpath('//title/text()').extract_first(),
'scrapy_selenium'
)

def test_max_concurrent_driver(self):
"""Test that up to max_concurrent_driver should be alive. Evicted driver should be closed."""
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver1 = self.selenium_middleware.drivers['http://1.1.1.1']
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.2'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 2)
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.3'}
), spider=None)
# one of the drivers is evicted
self.assertEqual(len(self.selenium_middleware.drivers), 2)
# when a driver quits, its session id becomes None
self.assertIsNone(driver1.session_id)

def test_same_proxy_should_reuse_driver(self):
    """Test that requests with the same proxy reuse the same driver."""
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver1 = self.selenium_middleware.drivers['http://1.1.1.1']
self.selenium_middleware.process_request(SeleniumRequest(
url='http://www.python.org',
meta={'proxy': 'http://1.1.1.1'}
), spider=None)
self.assertEqual(len(self.selenium_middleware.drivers), 1)
driver2 = self.selenium_middleware.drivers['http://1.1.1.1']
self.assertEqual(driver1.session_id, driver2.session_id)