diff --git a/README.md b/README.md index b642de9..4c9716a 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ def parse_result(self, response): ``` ### Additional arguments -The `scrapy_selenium.SeleniumRequest` accept 4 additional arguments: +The `scrapy_selenium.SeleniumRequest` accepts 6 additional arguments: #### `wait_time` / `wait_until` @@ -97,3 +97,27 @@ yield SeleniumRequest( script='window.scrollTo(0, document.body.scrollHeight);', ) ``` + +#### `cb_selenium` / `cb_selenium_kwargs` +When used, this callback is called instead of `webdriver.get(request.url)`. It gives you more +control, letting you bring the webpage to the exact state you expect. +```python +def cb_selenium(url, webdriver, arg1): + wait = WebDriverWait(webdriver, timeout=10) + webdriver.get(url) + + btn = wait.until( + EC.element_to_be_clickable((By.XPATH, "//button[@class='button']")) + ) + btn.click() + + wait.until(EC.visibility_of_element_located((By.ID, arg1))) + + +yield SeleniumRequest( + url=url, + callback=self.parse_result, + cb_selenium=cb_selenium, + cb_selenium_kwargs={"arg1": "123456"}, +) +``` diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 2addc1f..256d4d5 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,7 +1,7 @@ -r requirements.txt -pytest==3.4.0 coverage<4.4 -pytest-cov==2.4.0 +pytest>=3.4.0 +pytest-cov>=2.4.0 codeclimate-test-reporter==0.2.3 -attrs>=17.4.0 +attrs>=19.2.0 diff --git a/scrapy_selenium/http.py b/scrapy_selenium/http.py index cddf7bf..8c170a5 100644 --- a/scrapy_selenium/http.py +++ b/scrapy_selenium/http.py @@ -6,7 +6,8 @@ class SeleniumRequest(Request): """Scrapy ``Request`` subclass providing additional arguments""" - def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=None, *args, **kwargs): + def __init__(self, wait_time=None, wait_until=None, screenshot=False, + script=None, cb_selenium=None, cb_selenium_kwargs=None, *args, **kwargs): 
"""Initialize a new selenium request Parameters @@ -21,6 +22,12 @@ def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=Non will be returned in the response "meta" attribute. script: str JavaScript code to execute. + cb_selenium: method + Selenium handler which contains webdriver actions leading to the expected + state of the web page. The handler takes url, webdriver and custom arguments if needed + `cb_selenium(url, webdriver, arg1, arg2)`. + cb_selenium_kwargs: dict + Keywords arguments for the selenium callback `cb_selenium`. """ @@ -28,5 +35,7 @@ def __init__(self, wait_time=None, wait_until=None, screenshot=False, script=Non self.wait_until = wait_until self.screenshot = screenshot self.script = script + self.cb_selenium = cb_selenium + self.cb_selenium_kwargs = cb_selenium_kwargs super().__init__(*args, **kwargs) diff --git a/scrapy_selenium/middlewares.py b/scrapy_selenium/middlewares.py index 201db2c..a58be66 100644 --- a/scrapy_selenium/middlewares.py +++ b/scrapy_selenium/middlewares.py @@ -73,7 +73,7 @@ def from_crawler(cls, crawler): driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH') browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH') command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR') - driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS') + driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS', []) if driver_name is None: raise NotConfigured('SELENIUM_DRIVER_NAME must be set') @@ -100,7 +100,11 @@ def process_request(self, request, spider): if not isinstance(request, SeleniumRequest): return None - self.driver.get(request.url) + if callable(request.cb_selenium): + kwargs = request.cb_selenium_kwargs if request.cb_selenium_kwargs else {} + request.cb_selenium(request.url, self.driver, **kwargs) + else: + self.driver.get(request.url) for cookie_name, cookie_value in request.cookies.items(): self.driver.add_cookie( diff 
--git a/setup.py b/setup.py index 16fd185..639de1b 100644 --- a/setup.py +++ b/setup.py @@ -1,34 +1,8 @@ -"""This module contains the packaging routine for the pybook package""" - from setuptools import setup, find_packages -try: - from pip.download import PipSession - from pip.req import parse_requirements -except ImportError: - # It is quick hack to support pip 10 that has changed its internal - # structure of the modules. - from pip._internal.download import PipSession - from pip._internal.req.req_file import parse_requirements - - -def get_requirements(source): - """Get the requirements from the given ``source`` - - Parameters - ---------- - source: str - The filename containing the requirements - - """ - - install_reqs = parse_requirements(filename=source, session=PipSession()) - - return [str(ir.req) for ir in install_reqs] - setup( packages=find_packages(), - install_requires=get_requirements('requirements/requirements.txt') + install_requires=[l for l in open('requirements/requirements.txt').readlines()] ) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index fe365e4..1c5bfcf 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -5,6 +5,11 @@ from scrapy import Request from scrapy.crawler import Crawler +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as ec +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys + from scrapy_selenium.http import SeleniumRequest from scrapy_selenium.middlewares import SeleniumMiddleware @@ -135,3 +140,35 @@ def test_process_request_should_execute_script_if_script_option(self): html_response.selector.xpath('//title/text()').extract_first(), 'scrapy_selenium' ) + + def test_process_request_should_execute_cb_selenium(self): + """Test that the ``process_request`` should execute cb_selenium and return a response""" + + def cb_selenium(url, webdriver, query): + wait = 
WebDriverWait(webdriver, timeout=10) + + webdriver.get(url) + + elt = wait.until(ec.visibility_of_element_located((By.ID, "id-search-field"))) + elt.send_keys(query + Keys.ENTER) + + wait.until(ec.visibility_of_element_located( + (By.XPATH, "//ul[@class='list-recent-events menu']") + )) + + selenium_request = SeleniumRequest( + url='http://www.python.org', + cb_selenium=cb_selenium, + cb_selenium_kwargs={"query": "python"} + ) + + html_response = self.selenium_middleware.process_request( + request=selenium_request, + spider=None + ) + + titles_xpath = "//ul[@class='list-recent-events menu']/li/h3/a/text()" + self.assertIn( + "python", + html_response.selector.xpath(titles_xpath).extract_first().lower() + )