Skip to content

Commit fd4a136

Browse files
committed
Merge branch 'release/0.9.0'
2 parents 9e59b37 + 3688226 commit fd4a136

File tree

25 files changed

+418
-187
lines changed

25 files changed

+418
-187
lines changed

README-zh.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w
3939
<dependency>
4040
<groupId>us.codecraft</groupId>
4141
<artifactId>webmagic-core</artifactId>
42-
<version>0.7.5</version>
42+
<version>${webmagic.version}</version>
4343
</dependency>
4444
<dependency>
4545
<groupId>us.codecraft</groupId>
4646
<artifactId>webmagic-extension</artifactId>
47-
<version>0.7.5</version>
47+
<version>${webmagic.version}</version>
4848
</dependency>
4949
```
5050

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ Add dependencies to your pom.xml:
2525
<dependency>
2626
<groupId>us.codecraft</groupId>
2727
<artifactId>webmagic-core</artifactId>
28-
<version>0.7.5</version>
28+
<version>${webmagic.version}</version>
2929
</dependency>
3030
<dependency>
3131
<groupId>us.codecraft</groupId>
3232
<artifactId>webmagic-extension</artifactId>
33-
<version>0.7.5</version>
33+
<version>${webmagic.version}</version>
3434
</dependency>
3535
```
3636

pom.xml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
33
<groupId>us.codecraft</groupId>
4-
<version>0.8.0</version>
4+
<version>0.9.0</version>
55
<modelVersion>4.0.0</modelVersion>
66
<packaging>pom</packaging>
77
<properties>
@@ -124,7 +124,7 @@
124124
<dependency>
125125
<groupId>us.codecraft</groupId>
126126
<artifactId>xsoup</artifactId>
127-
<version>0.3.6</version>
127+
<version>0.3.7</version>
128128
</dependency>
129129
<dependency>
130130
<groupId>com.alibaba</groupId>

webmagic-core/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.8.0</version>
6+
<version>0.9.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

+6-5
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
* {@link #getHtml()} get content of current page <br>
2121
* {@link #putField(String, Object)} save extracted result <br>
2222
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
23-
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br>
23+
* {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch <br>
2424
*
2525
* @author [email protected] <br>
2626
* @see us.codecraft.webmagic.downloader.Downloader
@@ -52,7 +52,7 @@ public class Page {
5252
private List<Request> targetRequests = new ArrayList<Request>();
5353

5454
private String charset;
55-
55+
5656
public Page() {
5757
}
5858

@@ -108,7 +108,8 @@ public Json getJson() {
108108
* @deprecated since 0.4.0
109109
* The html is parsed only on the first call to {@link #getHtml()}, so use {@link #setRawText(String)} instead.
110110
*/
111-
public void setHtml(Html html) {
111+
@Deprecated
112+
public void setHtml(Html html) {
112113
this.html = html;
113114
}
114115

@@ -121,7 +122,7 @@ public List<Request> getTargetRequests() {
121122
*
122123
* @param requests requests
123124
*/
124-
public void addTargetRequests(List<String> requests) {
125+
public void addTargetRequests(Iterable<String> requests) {
125126
for (String s : requests) {
126127
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
127128
continue;
@@ -137,7 +138,7 @@ public void addTargetRequests(List<String> requests) {
137138
* @param requests requests
138139
* @param priority priority
139140
*/
140-
public void addTargetRequests(List<String> requests, long priority) {
141+
public void addTargetRequests(Iterable<String> requests, long priority) {
141142
for (String s : requests) {
142143
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
143144
continue;

webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

+26
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ public class Site {
2828

2929
private String charset;
3030

31+
private String defaultCharset;
32+
3133
private int sleepTime = 5000;
3234

3335
private int retryTimes = 0;
@@ -168,6 +170,30 @@ public String getCharset() {
168170
return charset;
169171
}
170172

173+
/**
174+
* Set default charset of page.
175+
*
176+
* When charset detection fails, this default charset is used.
177+
*
178+
* @param defaultCharset the default charset
179+
* @return this
180+
* @since 0.9.0
181+
*/
182+
public Site setDefaultCharset(String defaultCharset) {
183+
this.defaultCharset = defaultCharset;
184+
return this;
185+
}
186+
187+
/**
188+
* The default charset used if charset detection fails.
189+
*
190+
* @return the default charset
191+
* @since 0.9.0
192+
*/
193+
public String getDefaultCharset() {
194+
return defaultCharset;
195+
}
196+
171197
public int getTimeOut() {
172198
return timeOut;
173199
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

+6-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import java.nio.charset.Charset;
55
import java.util.HashMap;
66
import java.util.Map;
7+
import java.util.Optional;
78

89
import org.apache.commons.io.IOUtils;
910
import org.apache.http.HttpResponse;
@@ -76,7 +77,7 @@ public Page download(Request request, Task task) {
7677
}
7778
CloseableHttpResponse httpResponse = null;
7879
CloseableHttpClient httpClient = getHttpClient(task.getSite());
79-
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
80+
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
8081
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
8182
Page page = Page.fail();
8283
try {
@@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
116117
page.setBytes(bytes);
117118
if (!request.isBinaryContent()) {
118119
if (charset == null) {
119-
charset = getHtmlCharset(contentType, bytes);
120+
charset = getHtmlCharset(contentType, bytes, task);
120121
}
121122
page.setCharset(charset);
122123
page.setRawText(new String(bytes, charset));
@@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
131132
return page;
132133
}
133134

134-
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
135+
private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException {
135136
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
136137
if (charset == null) {
137-
charset = Charset.defaultCharset().name();
138-
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
138+
charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name);
139+
logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset());
139140
}
140141
return charset;
141142
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java

+20-28
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11
package us.codecraft.webmagic.downloader;
22

3-
import java.io.IOException;
4-
import java.security.KeyManagementException;
5-
import java.security.NoSuchAlgorithmException;
6-
import java.security.cert.CertificateException;
7-
import java.security.cert.X509Certificate;
8-
import java.util.Map;
9-
10-
import javax.net.ssl.SSLContext;
11-
import javax.net.ssl.TrustManager;
12-
import javax.net.ssl.X509TrustManager;
13-
143
import org.apache.commons.lang3.JavaVersion;
154
import org.apache.commons.lang3.SystemUtils;
165
import org.apache.http.HttpException;
@@ -22,28 +11,32 @@
2211
import org.apache.http.config.SocketConfig;
2312
import org.apache.http.conn.socket.ConnectionSocketFactory;
2413
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
25-
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
2614
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
27-
import org.apache.http.impl.client.BasicCookieStore;
28-
import org.apache.http.impl.client.CloseableHttpClient;
29-
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
30-
import org.apache.http.impl.client.HttpClientBuilder;
31-
import org.apache.http.impl.client.HttpClients;
15+
import org.apache.http.impl.client.*;
3216
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
3317
import org.apache.http.impl.cookie.BasicClientCookie;
3418
import org.apache.http.protocol.HttpContext;
3519
import org.slf4j.Logger;
3620
import org.slf4j.LoggerFactory;
37-
3821
import us.codecraft.webmagic.Site;
3922

23+
import javax.net.ssl.SSLContext;
24+
import javax.net.ssl.TrustManager;
25+
import javax.net.ssl.X509TrustManager;
26+
import java.io.IOException;
27+
import java.security.KeyManagementException;
28+
import java.security.NoSuchAlgorithmException;
29+
import java.security.cert.CertificateException;
30+
import java.security.cert.X509Certificate;
31+
import java.util.Map;
32+
4033
/**
4134
* @author [email protected] <br>
4235
* @since 0.4.0
4336
*/
4437
public class HttpClientGenerator {
4538

46-
private transient Logger logger = LoggerFactory.getLogger(getClass());
39+
private transient Logger logger = LoggerFactory.getLogger(getClass());
4740

4841
private PoolingHttpClientConnectionManager connectionManager;
4942

@@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
6154
SSLContext sslContext = createIgnoreVerifySSL();
6255
String[] supportedProtocols;
6356
if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) {
64-
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" };
57+
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"};
6558
} else {
66-
supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" };
59+
supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"};
6760
}
6861
logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols));
6962
return new SSLConnectionSocketFactory(sslContext, supportedProtocols,
7063
null,
71-
new DefaultHostnameVerifier()); // 优先绕过安全证书
72-
} catch (KeyManagementException e) {
73-
logger.error("ssl connection fail", e);
74-
} catch (NoSuchAlgorithmException e) {
64+
//不进行主机校验
65+
(host, sslSession) -> true); // 优先绕过安全证书
66+
} catch (KeyManagementException | NoSuchAlgorithmException e) {
7567
logger.error("ssl connection fail", e);
7668
}
7769
return SSLConnectionSocketFactory.getSocketFactory();
78-
}
70+
}
7971

8072
private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
8173
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
@@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() {
9789
};
9890

9991
SSLContext sc = SSLContext.getInstance("TLS");
100-
sc.init(null, new TrustManager[] { trustManager }, null);
92+
sc.init(null, new TrustManager[]{trustManager}, null);
10193
return sc;
102-
}
94+
}
10395

10496
public HttpClientGenerator setPoolSize(int poolSize) {
10597
connectionManager.setMaxTotal(poolSize);

webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package us.codecraft.webmagic.proxy;
22

33
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Request;
45
import us.codecraft.webmagic.Task;
56

67
/**
@@ -23,7 +24,23 @@ public interface ProxyProvider {
2324
* Get a proxy for task by some strategy.
2425
* @param task the download task
2526
* @return proxy
27+
* @deprecated Use {@link #getProxy(Request, Task)} instead.
2628
*/
27-
Proxy getProxy(Task task);
29+
@Deprecated
30+
default Proxy getProxy(Task task) {
31+
throw new UnsupportedOperationException();
32+
}
33+
34+
/**
35+
* Returns a proxy for the request.
36+
*
37+
* @param request the request
38+
* @param task the download task
39+
* @return proxy
40+
* @since 0.9.0
41+
*/
42+
default Proxy getProxy(Request request, Task task) {
43+
return this.getProxy(task);
44+
}
2845

2946
}

webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package us.codecraft.webmagic.proxy;
22

33
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Request;
45
import us.codecraft.webmagic.Task;
56

67
import java.util.ArrayList;
@@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) {
4445
}
4546

4647
@Override
47-
public Proxy getProxy(Task task) {
48+
public Proxy getProxy(Request request, Task task) {
4849
return proxies.get(incrForLoop());
4950
}
5051

webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java

+2-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import org.jsoup.Jsoup;
44
import org.jsoup.nodes.Document;
55
import org.jsoup.nodes.Element;
6+
import us.codecraft.webmagic.utils.BaseSelectorUtils;
67

78
import java.util.ArrayList;
89
import java.util.List;
@@ -13,16 +14,9 @@
1314
*/
1415
public abstract class BaseElementSelector implements Selector, ElementSelector {
1516
private Document parse(String text) {
16-
if (text == null) {
17-
return null;
18-
}
19-
2017
// Jsoup could not parse <tr></tr> or <td></td> tag directly
2118
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
22-
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
23-
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
24-
text = "<table>" + text + "</table>";
25-
}
19+
text = BaseSelectorUtils.preParse(text);
2620
return Jsoup.parse(text);
2721
}
2822

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package us.codecraft.webmagic.utils;
2+
3+
/**
4+
* @author hooy
5+
*/
6+
public class BaseSelectorUtils {
7+
8+
/**
9+
* Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly
10+
* https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
11+
*
12+
* @param text - the html string
13+
* @return text
14+
*/
15+
public static String preParse(String text) {
16+
if (((text.startsWith("<tr>") || text.startsWith("<tr ")) && text.endsWith("</tr>"))
17+
|| ((text.startsWith("<td>") || text.startsWith("<td ")) && text.endsWith("</td>"))) {
18+
text = "<table>" + text + "</table>";
19+
}
20+
return text;
21+
}
22+
23+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package us.codecraft.webmagic;
2+
3+
import static org.junit.Assert.assertEquals;
4+
5+
import java.nio.charset.StandardCharsets;
6+
7+
import org.junit.Test;
8+
9+
public class SiteTest {
10+
11+
@Test
12+
public void test() {
13+
Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
14+
assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
15+
}
16+
17+
}

0 commit comments

Comments
 (0)