Skip to content

Commit 01aec7e

Browse files
committed
extension point of geturl #118
1 parent ec1c2e8 commit 01aec7e

File tree

2 files changed

+11
-3
lines changed

2 files changed

+11
-3
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/BloomFilterDuplicateRemover.java

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,14 +46,18 @@ protected BloomFilter<CharSequence> rebuildBloomFilter() {
4646

4747
@Override
4848
public boolean isDuplicate(Request request, Task task) {
49-
boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
49+
boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
5050
if (!isDuplicate) {
51-
bloomFilter.put(request.getUrl());
51+
bloomFilter.put(getUrl(request));
5252
counter.incrementAndGet();
5353
}
5454
return isDuplicate;
5555
}
5656

57+
protected String getUrl(Request request) {
58+
return request.getUrl();
59+
}
60+
5761
@Override
5862
public void resetDuplicateCheck(Task task) {
5963
rebuildBloomFilter();

webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,11 @@ public class HashSetDuplicateRemover implements DuplicateRemover {
1616

1717
@Override
1818
public boolean isDuplicate(Request request, Task task) {
19-
return !urls.add(request.getUrl());
19+
return !urls.add(getUrl(request));
20+
}
21+
22+
protected String getUrl(Request request) {
23+
return request.getUrl();
2024
}
2125

2226
@Override

0 commit comments

Comments
 (0)