From 139886a1bc57ce10bdf684d8ca1b6f2bd48ff4f9 Mon Sep 17 00:00:00 2001
From: nramirezuy
Date: Fri, 28 Feb 2014 15:12:54 -0200
Subject: [PATCH] autoschedule extension added

---
 scrapylib/autoschedule.py  | 69 ++++++++++++++++++++++++++++++++++
 tests/test_autoschedule.py | 84 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 scrapylib/autoschedule.py
 create mode 100644 tests/test_autoschedule.py

diff --git a/scrapylib/autoschedule.py b/scrapylib/autoschedule.py
new file mode 100644
index 0000000..647aeed
--- /dev/null
+++ b/scrapylib/autoschedule.py
@@ -0,0 +1,69 @@
+import os
+
+import scrapinghub
+from scrapy import log
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
+
+
+class AutoSchedule(object):
+    '''Schedule follow-up Scrapinghub jobs when a spider closes.
+
+    AUTOSCHEDULE_SETTINGS format:
+        {close_reason: {spider_name: [{param: value, ...}, ...]}}
+
+    String parameter values are resolved against attributes of the
+    closing spider; non-string values, missing attributes and callable
+    attributes are passed through unchanged.
+    '''
+
+    def __init__(self, settings):
+        self.settings = settings.getdict('AUTOSCHEDULE_SETTINGS')
+        self.apikey = settings.get('AUTOSCHEDULE_APIKEY')
+        if not self.settings or not self.apikey:
+            raise NotConfigured
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        if not settings.getbool('AUTOSCHEDULE_ENABLED'):
+            raise NotConfigured
+        o = cls(settings)
+        crawler.signals.connect(o.spider_closed,
+                                signal=signals.spider_closed)
+        return o
+
+    def _compile_params(self, settings_params, spider):
+        params = {}
+        for k, v in settings_params.items():
+            if isinstance(v, basestring):
+                # Resolve string values against spider attributes; keep
+                # the original string when the attribute is callable or
+                # does not exist.
+                new_v = getattr(spider, v, v)
+                params[k] = v if callable(new_v) else new_v
+            else:
+                params[k] = v
+        return params
+
+    def spider_closed(self, spider, reason):
+        settings = self.settings.get(reason, {}).get(spider.name)
+        if not settings:
+            return
+
+        compiled_params_list = [self._compile_params(params, spider)
+                                for params in settings]
+
+        conn = scrapinghub.Connection(self.apikey)
+        project = conn[os.environ.get('SCRAPY_PROJECT_ID')]
+        current_job_key = os.environ.get('SCRAPY_JOB')
+        for settings_params in compiled_params_list:
+            params = {'parent_job_key': current_job_key}
+            params.update(settings_params)
+            self._schedule(project, params)
+
+    def _schedule(self, project, params):
+        job = project.schedule(**params)
+        log.msg('Scheduled {spider} spider, job {job}, params {params}'
+                .format(spider=params.get('spider'), job=job, params=params),
+                level=log.INFO)
diff --git a/tests/test_autoschedule.py b/tests/test_autoschedule.py
new file mode 100644
index 0000000..fca0a32
--- /dev/null
+++ b/tests/test_autoschedule.py
@@ -0,0 +1,84 @@
+import unittest
+
+from scrapy.utils.test import get_crawler
+from scrapy.exceptions import NotConfigured
+from scrapy.spider import Spider
+
+from scrapylib.autoschedule import AutoSchedule
+
+
+class AutoScheduleTestCase(unittest.TestCase):
+
+    ext_cls = AutoSchedule
+
+    def _mock_crawler(self, settings=None):
+        class MockedDownloader(object):
+            slots = {}
+
+        class MockedEngine(object):
+            downloader = MockedDownloader()
+            fake_spider_closed_result = None
+
+            def close_spider(self, spider, reason):
+                self.fake_spider_closed_result = (spider, reason)
+
+        crawler = get_crawler(settings)
+        crawler.engine = MockedEngine()
+        return crawler
+
+    def test_enabled(self):
+        settings = {'AUTOSCHEDULE_ENABLED': True}
+        crawler = self._mock_crawler(settings)
+        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
+        settings['AUTOSCHEDULE_APIKEY'] = '123'
+        crawler = self._mock_crawler(settings)
+        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
+        settings['AUTOSCHEDULE_SETTINGS'] = {}
+        crawler = self._mock_crawler(settings)
+        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
+        # finally enabled
+        settings['AUTOSCHEDULE_SETTINGS'] = {'finished': {'foo': []}}
+        crawler = self._mock_crawler(settings)
+        self.ext_cls.from_crawler(crawler)
+
+    def test_compile_params(self):
+        spider = Spider('foo', pstr='string', pint=1, pcall=iter, plist=[])
+        jobs = [{
+            'spider': 'bar',
+            'string': 'pstr',
+            'int': 'pint',
+            'call': 'pcall',
+            'constant': 'constant1',
+        }, {
+            'spider': 'foo_bar',
+            'string': 'pstr',
+            'call': 'pcall',
+            'list': 'plist',
+            'constant': 'constant2',
+        }]
+        settings = {
+            'AUTOSCHEDULE_APIKEY': '123',
+            'AUTOSCHEDULE_ENABLED': True,
+            'AUTOSCHEDULE_SETTINGS': {
+                'finished': {
+                    'foo': jobs
+                }
+            }
+        }
+        crawler = self._mock_crawler(settings)
+        ext = self.ext_cls.from_crawler(crawler)
+        scheduled = []
+        ext._schedule = lambda project, params: scheduled.append(params)
+        ext.spider_closed(spider, 'finished')
+        for j, p in zip(jobs, scheduled):
+            for k, v in p.items():
+                if k == 'parent_job_key':
+                    continue
+
+                sv = getattr(spider, j[k], v)
+                if not hasattr(sv, '__call__'):
+                    self.assertEqual(sv, v)
+                else:
+                    # Callable spider attributes are not resolved; the
+                    # compiled param must keep the original settings string.
+                    self.assertEqual(v, j[k])
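
For reviewers, a minimal sketch of how a project might wire this up in its
Scrapy settings. The extension path and the AUTOSCHEDULE_* keys come from the
patch above; the spider names and the 'batch'/'mode' parameters are
hypothetical placeholders, not part of the patch:

    # settings.py -- hypothetical example configuration
    EXTENSIONS = {
        'scrapylib.autoschedule.AutoSchedule': 500,
    }

    AUTOSCHEDULE_ENABLED = True
    AUTOSCHEDULE_APIKEY = 'your-scrapinghub-apikey'  # placeholder, not a real key

    # When the spider 'foo' closes with reason 'finished', schedule the
    # spider 'bar' twice. String values ('batch_id' here) are looked up as
    # attributes on the closing 'foo' spider; strings without a matching
    # attribute (such as 'full') are passed through verbatim.
    AUTOSCHEDULE_SETTINGS = {
        'finished': {
            'foo': [
                {'spider': 'bar', 'batch': 'batch_id', 'mode': 'full'},
                {'spider': 'bar', 'batch': 'batch_id', 'mode': 'delta'},
            ],
        },
    }

Each scheduled job also receives a parent_job_key parameter taken from the
SCRAPY_JOB environment variable, so child jobs can be traced back to the run
that spawned them.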