Skip to content

Commit 9cb9d40

Browse files
committed
Initial commit
0 parents  commit 9cb9d40

16 files changed

+581
-0
lines changed

.coveragerc

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[run]
2+
omit =
3+
tests/*
4+
*__init__.py*

.gitignore

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Created by .ignore support plugin (hsz.mobi)
2+
### Python template
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[cod]
6+
*$py.class
7+
8+
# C extensions
9+
*.so
10+
11+
# Distribution / packaging
12+
.Python
13+
env/
14+
build/
15+
develop-eggs/
16+
dist/
17+
downloads/
18+
eggs/
19+
.eggs/
20+
lib/
21+
lib64/
22+
parts/
23+
sdist/
24+
var/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.coverage
43+
.coverage.*
44+
.cache
45+
nosetests.xml
46+
coverage.xml
47+
*,cover
48+
.hypothesis/
49+
reports/
50+
.pytest_cache
51+
52+
# Translations
53+
*.mo
54+
*.pot
55+
56+
# Django stuff:
57+
*.log
58+
59+
# Flask instance folder
60+
instance/
61+
62+
# Scrapy stuff:
63+
.scrapy
64+
65+
# Sphinx documentation
66+
docs/_build/
67+
.tmpdocs/
68+
69+
# PyBuilder
70+
target/
71+
72+
# IPython Notebook
73+
.ipynb_checkpoints
74+
75+
# pyenv
76+
.python-version
77+
78+
# celery beat schedule file
79+
celerybeat-schedule
80+
81+
# dotenv
82+
.env
83+
84+
# virtualenv
85+
.venv
86+
venv/
87+
venv-jenkins*/
88+
ENV/
89+
90+
# Spyder project settings
91+
.spyderproject
92+
93+
# Rope project settings
94+
.ropeproject
95+
96+
# .idea is the directory for pycharm project files
97+
.idea
98+
99+
# MACOS stuff
100+
.DS_Store

.travis.yml

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
env:
2+
global:
3+
- CODECLIMATE_REPO_TOKEN=a47935830d841ad61a6e960be8a3b6a5e557146ac010dafa993e61bf82898472
4+
5+
language: python
6+
7+
python:
8+
- 3.6
9+
10+
addons:
11+
firefox: "49.0.2"
12+
13+
before_install:
14+
- wget https://github.com/mozilla/geckodriver/releases/download/v0.11.1/geckodriver-v0.11.1-linux64.tar.gz
15+
- mkdir geckodriver
16+
- tar -xzf geckodriver-v0.11.1-linux64.tar.gz -C geckodriver
17+
- export PATH=$PATH:$PWD/geckodriver
18+
19+
install:
20+
- pip install -r requirements/requirements-test.txt
21+
22+
before_script:
23+
- "export DISPLAY=:99.0"
24+
- "sh -e /etc/init.d/xvfb start"
25+
- sleep 3
26+
27+
script:
28+
- pytest --cov-config .coveragerc --cov=scrapy_selenium tests/
29+
- codeclimate-test-reporter
30+
31+
deploy:
32+
provider: pypi
33+
user: clemfromspace
34+
password:
35+
secure: "TNv6olOrZXQU5uXOv2pXCHn0knCxCvyoIQJCNPVn7kwqrVhsUK+A9Tp0xBWkQugdtN30KQ9dPu7VNRfizyWvjBMXxnVmBwbjG/qUdPsa2jz4cgTNvfScBoTeESE8PkFu91xBmP9KXV0XYWVahEL6IK2klFqnFRhkpnDbeRgzSB+UUBzltb0CwIBs7r1BxI1Fcz4HkvEtoqOi/jB1GV7k2F2RIaXHNwnQ4b4Et3FzOX7y5ONUhlwtlgHfIsr3mtQkmQ0cRhzV6Sub9dwC0RckDjqRGd/cV81uWr444KK1F+XSxLU4M5+8am6zO3PDApkyYblfq54FzfbrmgrNaZ2VREVoS7SryW2cxmPPTQbBaaKAu8AZ6HIDgYzDGk54Q8W8XvK0UdAj9fPvFNHuOTJw/1HPGUcLcIDJebBSdZzg5q9hPAOv2MK+fyqfyTx5AcMJnbvitSncT5qie+OX6ZPZrXphxBv29PUPNv94f4czMk1gTvuxVyOPwP3qkyDMA2thRu/SXtE+EW/q1M9lQCXAxBU+wi+QDydxCbYs8rmF0V+dCaOdZEcEtE03l73BK8/MczX4sP3HkcoAsttkD8oXoCdo8I2nxeVqx2YlI6928ayxospLzMQQlaCy4zfAYrYyE5VqEDoS84fxJkO4aHJJDRSFJ90U0BwwLkBVRsa4t8U="
36+
on:
37+
tags: true
38+
distributions: sdist bdist_wheel
39+
repo: clemfromspace/scrapy-selenium
40+
41+
notifications:
42+
email: false

LICENCE

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
2+
Version 2, December 2004
3+
4+
Copyright (C) 2018 Clément Denoix <[email protected]>
5+
6+
Everyone is permitted to copy and distribute verbatim or modified
7+
copies of this license document, and changing it is allowed as long
8+
as the name is changed.
9+
10+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12+
13+
0. You just DO WHAT THE FUCK YOU WANT TO.

MANIFEST.in

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include requirements/requirements.txt

README.md

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Scrapy with selenium
2+
[![PyPI](https://img.shields.io/pypi/v/scrapy-selenium.svg)](https://pypi.python.org/pypi/scrapy-selenium) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-selenium.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-selenium) [![Test Coverage](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/5c737098dc38a835ff96/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-selenium/maintainability)
3+
4+
Scrapy middleware to handle javascript pages using selenium.
5+
6+
## Installation
7+
```
8+
$ pip install scrapy-selenium
9+
```
10+
11+
You will also need one of the Selenium [compatible browsers](http://www.seleniumhq.org/about/platforms.jsp).
12+
13+
## Configuration
14+
1. Add the browser to use, the path to the executable, and the arguments to pass to the executable to the scrapy settings:
15+
```python
16+
from shutil import which
17+
18+
SELENIUM_DRIVER_NAME='firefox'
19+
SELENIUM_DRIVER_EXECUTABLE_PATH=which('geckodriver')
20+
SELENIUM_DRIVER_ARGUMENTS=['-headless'] # '--headless' if using chrome instead of firefox
21+
```
22+
23+
2. Add the `SeleniumMiddleware` to the downloader middlewares:
24+
```python
25+
DOWNLOADER_MIDDLEWARES = {
26+
'scrapy_selenium.SeleniumMiddleware': 800
27+
}
28+
```
29+
## Usage
30+
Use the `scrapy_selenium.SeleniumRequest` instead of the scrapy built-in `Request` like below:
31+
```python
32+
from scrapy_selenium import SeleniumRequest
33+
34+
yield SeleniumRequest(url, self.parse_result)
35+
```
36+
The request will be handled by selenium, and the response will have an additional `meta` key, named `driver` containing the selenium driver with the request processed.
37+
```python
38+
def parse_result(self, response):
39+
print(response.meta['driver'].title)
40+
```
41+
For more information about the available driver methods and attributes, refer to the [selenium python documentation](http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver)
42+
43+
The `selector` response attribute works as usual (but contains the html processed by the selenium driver).
44+
```python
45+
def parse_result(self, response):
46+
print(response.selector.xpath('//title/@text'))
47+
```
48+
49+
### Additional arguments
50+
The `scrapy_selenium.SeleniumRequest` accepts 3 additional arguments:
51+
52+
#### `wait_time` / `wait_until`
53+
54+
When used, selenium will perform an [Explicit wait](http://selenium-python.readthedocs.io/waits.html#explicit-waits) before returning the response to the spider.
55+
```python
56+
from selenium.webdriver.common.by import By
57+
from selenium.webdriver.support import expected_conditions as EC
58+
59+
yield SeleniumRequest(
60+
url,
61+
self.parse_result,
62+
wait_time=10,
63+
wait_until=EC.element_to_be_clickable((By.ID, 'someid'))
64+
)
65+
```
66+
67+
#### `screenshot`
68+
When used, selenium will take a screenshot of the page and the binary data of the .png captured will be added to the response `meta`:
69+
```python
70+
yield SeleniumRequest(
71+
url,
72+
self.parse_result,
73+
screenshot=True
74+
)
75+
76+
def parse_result(self, response):
77+
with open('image.png', 'wb') as image_file:
78+
image_file.write(response.meta['screenshot'])
79+
```
80+

requirements/requirements-test.txt

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-r requirements.txt
2+
3+
pytest==3.4.0
4+
coverage<4.4
5+
pytest-cov==2.4.0
6+
codeclimate-test-reporter==0.2.3

requirements/requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
scrapy>=1.0.0
2+
selenium>=3.9.0

scrapy_selenium/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .http import SeleniumRequest
2+
from .middlewares import SeleniumMiddleware

scrapy_selenium/http.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
"""This module contains the ``SeleniumRequest`` class"""

from scrapy import Request


class SeleniumRequest(Request):
    """Scrapy ``Request`` subclass providing additional selenium arguments.

    NOTE: because ``wait_time``, ``wait_until`` and ``screenshot`` come before
    ``*args``, the standard ``Request`` arguments (``url``, ``callback``, ...)
    should be passed as keywords, otherwise ``url`` would be captured as
    ``wait_time``.
    """

    def __init__(self, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Initialize a new selenium request

        Parameters
        ----------
        wait_time: int
            The number of seconds to wait.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions". The response
            won't be returned until the given condition is fulfilled.
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of the screenshot
            will be returned in the response "meta" attribute.

        """

        # Stored on the request so the downloader middleware can read them
        # when it processes this request with the selenium driver.
        self.wait_time = wait_time
        self.wait_until = wait_until
        self.screenshot = screenshot

        super().__init__(*args, **kwargs)

0 commit comments

Comments
 (0)