diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py
index ded0843..622ae4f 100644
--- a/scrapy_deltafetch/middleware.py
+++ b/scrapy_deltafetch/middleware.py
@@ -14,6 +14,15 @@
 logger = logging.getLogger(__name__)
 
 
+class DeltaFetchPseudoItem(dict):
+    """
+    A pseudo item class to be used when:
+    - no actual item should be generated from a page, and
+    - the page should be skipped in future runs.
+    """
+    pass
+
+
 class DeltaFetch(object):
     """
     This is a spider middleware to ignore requests to pages containing items
@@ -86,7 +95,11 @@ def process_spider_output(self, response, result, spider):
                 self.db[key] = str(time.time())
                 if self.stats:
                     self.stats.inc_value('deltafetch/stored', spider=spider)
-            yield r
+                    if isinstance(r, DeltaFetchPseudoItem):
+                        reason = r.get('reason', 'pseudo_item')
+                        self.stats.inc_value('deltafetch/stored/%s' % reason, spider=spider)
+            if not isinstance(r, DeltaFetchPseudoItem):
+                yield r
 
     def _get_key(self, request):
         key = request.meta.get('deltafetch_key') or request_fingerprint(request)
diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py
index d3c3289..cad2cef 100644
--- a/tests/test_deltafetch.py
+++ b/tests/test_deltafetch.py
@@ -13,7 +13,7 @@
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.test import get_crawler
 
-from scrapy_deltafetch.middleware import DeltaFetch
+from scrapy_deltafetch.middleware import DeltaFetch, DeltaFetchPseudoItem
 
 dbmodule = None
 
@@ -201,6 +201,22 @@ def test_process_spider_output(self):
                                   b'test_key_2']))
         assert mw.db[b'key']
 
+    def test_process_spider_output_pseudo_item(self):
+        self._create_test_db()
+        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        mw.spider_opened(self.spider)
+        response = mock.Mock()
+        response.request = Request('http://url',
+                                   meta={'deltafetch_key': 'key'})
+        result = [DeltaFetchPseudoItem(reason='skip')]
+        self.assertEqual(list(mw.process_spider_output(
+            response, result, self.spider)), [])
+        self.assertEqual(set(mw.db.keys()),
+                         set([b'key',
+                              b'test_key_1',
+                              b'test_key_2']))
+        assert mw.db[b'key']
+
     def test_process_spider_output_dict(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
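
Usage sketch (assumes the patch above is applied): a spider callback can record a page as seen, without emitting anything downstream, by yielding a DeltaFetchPseudoItem. The spider name, URL, CSS selectors, and the 'empty_page' reason below are illustrative placeholders, not part of this change.

    import scrapy

    from scrapy_deltafetch.middleware import DeltaFetchPseudoItem


    class ExampleSpider(scrapy.Spider):
        # Hypothetical spider: name, start_urls and selectors are placeholders.
        name = "example"
        start_urls = ["http://example.com/listing"]

        def parse(self, response):
            products = response.css(".product")
            if not products:
                # Nothing to extract, but store the page's deltafetch key anyway
                # so future runs skip it. The middleware counts it under
                # deltafetch/stored/empty_page and does not pass it downstream.
                yield DeltaFetchPseudoItem(reason="empty_page")
                return
            for product in products:
                yield {"name": product.css("::text").extract_first()}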