Skip to content

Commit

Permalink
Merge branch 'release/0.0.6'
Browse files Browse the repository at this point in the history
  • Loading branch information
clemfromspace committed Jan 3, 2019
2 parents fa34af2 + aa34ef3 commit 3045ad0
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 8 deletions.
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,24 @@ Scrapy middleware to handle javascript pages using selenium.
```
$ pip install scrapy-selenium
```

This package requires **Python 3.6 or later**.
You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/about/platforms.jsp).

## Configuration
1. Add the browser to use, the path to the executable, and the arguments to pass to the executable to the scrapy settings:
1. In the Scrapy settings, set the browser to use, the path to the driver executable, and the arguments to pass to that executable:
```python
from shutil import which

SELENIUM_DRIVER_NAME='firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH=which('geckodriver')
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
```

Optionally, set the path to the browser executable:
```python
SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox')
```

2. Add the `SeleniumMiddleware` to the downloader middlewares:
```python
DOWNLOADER_MIDDLEWARES = {
Expand Down
12 changes: 9 additions & 3 deletions scrapy_selenium/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""

def __init__(self, driver_name, driver_executable_path, driver_arguments):
def __init__(self, driver_name, driver_executable_path, driver_arguments,
browser_executable_path):
"""Initialize the selenium webdriver
Parameters
Expand All @@ -24,7 +25,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments):
The path of the executable binary of the driver
driver_arguments: list
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
"""

webdriver_base_path = f'selenium.webdriver.{driver_name}'
Expand All @@ -36,6 +38,8 @@ def __init__(self, driver_name, driver_executable_path, driver_arguments):
driver_options_klass = getattr(driver_options_module, 'Options')

driver_options = driver_options_klass()
if browser_executable_path:
driver_options.binary_location = browser_executable_path
for argument in driver_arguments:
driver_options.add_argument(argument)

Expand All @@ -52,6 +56,7 @@ def from_crawler(cls, crawler):

driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')

if not driver_name or not driver_executable_path:
Expand All @@ -62,7 +67,8 @@ def from_crawler(cls, crawler):
middleware = cls(
driver_name=driver_name,
driver_executable_path=driver_executable_path,
driver_arguments=driver_arguments
driver_arguments=driver_arguments,
browser_executable_path=browser_executable_path
)

crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scrapy-selenium
version = 0.0.5
version = 0.0.6
url = https://github.com/clemfromspace/scrapy-selenium
licence = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
description = Scrapy with selenium
Expand Down

0 comments on commit 3045ad0

Please sign in to comment.