@@ -37,6 +37,7 @@ def __init__(
37
37
flush_on_start = False ,
38
38
queue_key = defaults .SCHEDULER_QUEUE_KEY ,
39
39
queue_cls = defaults .SCHEDULER_QUEUE_CLASS ,
40
+ dupefilter = None ,
40
41
dupefilter_key = defaults .SCHEDULER_DUPEFILTER_KEY ,
41
42
dupefilter_cls = defaults .SCHEDULER_DUPEFILTER_CLASS ,
42
43
idle_before_close = 0 ,
@@ -56,6 +57,8 @@ def __init__(
56
57
Requests queue key.
57
58
queue_cls : str
58
59
Importable path to the queue class.
60
+ dupefilter: Dupefilter
61
+ Custom dupefilter instance.
59
62
dupefilter_key : str
60
63
Duplicates filter key.
61
64
dupefilter_cls : str
@@ -72,6 +75,7 @@ def __init__(
72
75
self .flush_on_start = flush_on_start
73
76
self .queue_key = queue_key
74
77
self .queue_cls = queue_cls
78
+ self .df = dupefilter
75
79
self .dupefilter_cls = dupefilter_cls
76
80
self .dupefilter_key = dupefilter_key
77
81
self .idle_before_close = idle_before_close
@@ -105,6 +109,10 @@ def from_settings(cls, settings):
105
109
if val :
106
110
kwargs [name ] = val
107
111
112
+ dupefilter_cls = load_object (kwargs ["dupefilter_cls" ])
113
+ if not hasattr (dupefilter_cls , "from_spider" ):
114
+ kwargs ["dupefilter" ] = dupefilter_cls .from_settings (settings )
115
+
108
116
# Support serializer as a path to a module.
109
117
if isinstance (kwargs .get ("serializer" ), str ):
110
118
kwargs ["serializer" ] = importlib .import_module (kwargs ["serializer" ])
@@ -137,7 +145,8 @@ def open(self, spider):
137
145
f"Failed to instantiate queue class '{ self .queue_cls } ': { e } "
138
146
)
139
147
140
- self .df = load_object (self .dupefilter_cls ).from_spider (spider )
148
+ if not self .df :
149
+ self .df = load_object (self .dupefilter_cls ).from_spider (spider )
141
150
142
151
if self .flush_on_start :
143
152
self .flush ()
0 commit comments