Skip to content

Commit cfae008

Browse files
committed
Merge branch 'release/1.0.3'
2 parents 1cd199b + f4a8825 commit cfae008

File tree

15 files changed

+30
-12
lines changed

15 files changed

+30
-12
lines changed

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on
176176
the same "printed page" as the copyright notice for easier identification within
177177
third-party archives.
178178

179-
Copyright 2013 code4craft
179+
Copyright 2025 code4craft
180180

181181
Licensed under the Apache License, Version 2.0 (the "License");
182182
you may not use this file except in compliance with the License.

pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
<version>2.2.1</version>
1313
</parent>
1414
<groupId>us.codecraft</groupId>
15-
<version>1.0.2</version>
15+
<version>1.0.3</version>
1616
<packaging>pom</packaging>
1717
<properties>
1818
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

webmagic-core/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
*/
3737
public class HttpClientGenerator {
3838

39-
private transient Logger logger = LoggerFactory.getLogger(getClass());
39+
private Logger logger = LoggerFactory.getLogger(getClass());
4040

4141
private PoolingHttpClientConnectionManager connectionManager;
4242

webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java

+5
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ public Selectable smartContent() {
3131
return select(smartContentSelector, getSourceTexts());
3232
}
3333

34+
public Selectable smartContent(int threshold) {
35+
SmartContentSelector smartContentSelector = Selectors.smartContent(threshold);
36+
return select(smartContentSelector, getSourceTexts());
37+
}
38+
3439
@Override
3540
public Selectable links() {
3641
return selectElements(new LinksSelector());

webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java

+4
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() {
2020
return new SmartContentSelector();
2121
}
2222

23+
public static SmartContentSelector smartContent(int threshold) {
24+
return new SmartContentSelector(threshold);
25+
}
26+
2327
public static CssSelector $(String expr) {
2428
return new CssSelector(expr);
2529
}

webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,15 @@
1616
@Experimental
1717
public class SmartContentSelector implements Selector {
1818

19+
private int threshold = 86;
20+
1921
public SmartContentSelector() {
2022
}
2123

24+
public SmartContentSelector(int threshold) {
25+
this.threshold = threshold;
26+
}
27+
2228
@Override
2329
public String select(String html) {
2430
html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
@@ -29,7 +35,6 @@ public String select(String html) {
2935
html = html.replaceAll("(?is)<.*?>", "");
3036
List<String> lines;
3137
int blocksWidth =3;
32-
int threshold =86;
3338
int start;
3439
int end;
3540
StringBuilder text = new StringBuilder();

webmagic-coverage/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
<parent>
1111
<groupId>us.codecraft</groupId>
1212
<artifactId>webmagic</artifactId>
13-
<version>1.0.2</version>
13+
<version>1.0.3</version>
1414
</parent>
1515

1616
<artifactId>webmagic-coverage</artifactId>

webmagic-extension/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import us.codecraft.webmagic.Request;
77
import us.codecraft.webmagic.Task;
88
import us.codecraft.webmagic.selector.PlainText;
9+
import us.codecraft.webmagic.utils.HttpConstant;
910

1011
import java.io.*;
1112

@@ -96,7 +97,7 @@ public Page download(Request request, Task task) {
9697
page.setRawText(content);
9798
page.setUrl(new PlainText(request.getUrl()));
9899
page.setRequest(request);
99-
page.setStatusCode(200);
100+
page.setStatusCode(HttpConstant.StatusCode.CODE_200);
100101
}
101102
onSuccess(page, task);
102103
} catch (Exception e) {

webmagic-samples/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-saxon/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-scripts/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-selenium/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic</artifactId>
11-
<version>1.0.2</version>
11+
<version>1.0.3</version>
1212
</parent>
1313
<modelVersion>4.0.0</modelVersion>
1414

webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java

+3
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
import us.codecraft.webmagic.downloader.AbstractDownloader;
1515
import us.codecraft.webmagic.selector.Html;
1616
import us.codecraft.webmagic.selector.PlainText;
17+
import us.codecraft.webmagic.utils.HttpConstant;
1718

1819
import java.io.Closeable;
1920
import java.io.IOException;
21+
import java.net.http.HttpRequest;
2022
import java.util.Map;
2123

2224
/**
@@ -111,6 +113,7 @@ public Page download(Request request, Task task) {
111113
page.setHtml(new Html(content, request.getUrl()));
112114
page.setUrl(new PlainText(request.getUrl()));
113115
page.setRequest(request);
116+
page.setStatusCode(HttpConstant.StatusCode.CODE_200);
114117
onSuccess(page, task);
115118
} catch (Exception e) {
116119
logger.warn("download page {} error", request.getUrl(), e);

0 commit comments

Comments
 (0)