Skip to content

Commit dc7218e

Browse files
committed
Merge branch 'release/0.7.6'
2 parents 31c4e48 + 838c47f commit dc7218e

File tree

31 files changed

+496
-347
lines changed

31 files changed

+496
-347
lines changed

pom.xml

Lines changed: 74 additions & 50 deletions
Large diffs are not rendered by default.

webmagic-core/pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.7.5</version>
6+
<version>0.7.6</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

@@ -52,8 +52,8 @@
5252
</dependency>
5353

5454
<dependency>
55-
<groupId>commons-collections</groupId>
56-
<artifactId>commons-collections</artifactId>
55+
<groupId>org.apache.commons</groupId>
56+
<artifactId>commons-collections4</artifactId>
5757
</dependency>
5858

5959
<dependency>

webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java

Lines changed: 77 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
package us.codecraft.webmagic;
22

3-
import org.apache.commons.collections.CollectionUtils;
3+
4+
import java.io.Closeable;
5+
import java.io.IOException;
6+
import java.util.ArrayList;
7+
import java.util.Collection;
8+
import java.util.Date;
9+
import java.util.List;
10+
import java.util.UUID;
11+
import java.util.concurrent.ExecutorService;
12+
import java.util.concurrent.TimeUnit;
13+
import java.util.concurrent.atomic.AtomicInteger;
14+
import java.util.concurrent.atomic.AtomicLong;
15+
import java.util.concurrent.locks.Condition;
16+
import java.util.concurrent.locks.ReentrantLock;
17+
import org.apache.commons.collections4.CollectionUtils;
418
import org.apache.commons.lang3.SerializationUtils;
519
import org.slf4j.Logger;
620
import org.slf4j.LoggerFactory;
@@ -17,16 +31,6 @@
1731
import us.codecraft.webmagic.utils.UrlUtils;
1832
import us.codecraft.webmagic.utils.WMCollections;
1933

20-
import java.io.Closeable;
21-
import java.io.IOException;
22-
import java.util.*;
23-
import java.util.concurrent.ExecutorService;
24-
import java.util.concurrent.TimeUnit;
25-
import java.util.concurrent.atomic.AtomicInteger;
26-
import java.util.concurrent.atomic.AtomicLong;
27-
import java.util.concurrent.locks.Condition;
28-
import java.util.concurrent.locks.ReentrantLock;
29-
3034
/**
3135
* Entrance of a crawler.<br>
3236
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
@@ -106,7 +110,7 @@ public class Spider implements Runnable, Task {
106110

107111
private Date startTime;
108112

109-
private int emptySleepTime = 30000;
113+
private long emptySleepTime = 30000;
110114

111115
/**
112116
* create a spider with pageProcessor.
@@ -305,32 +309,53 @@ protected void initComponent() {
305309
public void run() {
306310
checkRunningStat();
307311
initComponent();
308-
logger.info("Spider {} started!",getUUID());
312+
logger.info("Spider {} started!", getUUID());
313+
// an interrupt will not necessarily be detected here
309314
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
310-
final Request request = scheduler.poll(this);
311-
if (request == null) {
312-
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
313-
break;
314-
}
315-
// wait until new url added
316-
waitNewUrl();
317-
} else {
318-
threadPool.execute(new Runnable() {
319-
@Override
320-
public void run() {
321-
try {
322-
processRequest(request);
323-
onSuccess(request);
324-
} catch (Exception e) {
325-
onError(request, e);
326-
logger.error("process request " + request + " error", e);
327-
} finally {
328-
pageCount.incrementAndGet();
329-
signalNewUrl();
315+
Request poll = scheduler.poll(this);
316+
if (poll == null) {
317+
if (threadPool.getThreadAlive() == 0) {
318+
// no alive thread anymore, try again
319+
poll = scheduler.poll(this);
320+
if (poll == null) {
321+
if (exitWhenComplete) {
322+
break;
323+
} else {
324+
// wait
325+
try {
326+
Thread.sleep(emptySleepTime);
327+
continue;
328+
} catch (InterruptedException e) {
329+
Thread.currentThread().interrupt();
330+
break;
331+
}
330332
}
331333
}
332-
});
334+
} else {
335+
// wait until new url added,
336+
if (waitNewUrl())
337+
//if interrupted
338+
break;
339+
continue;
340+
}
333341
}
342+
final Request request = poll;
343+
//this may swallow the interruption
344+
threadPool.execute(new Runnable() {
345+
@Override
346+
public void run() {
347+
try {
348+
processRequest(request);
349+
onSuccess(request);
350+
} catch (Exception e) {
351+
onError(request, e);
352+
logger.error("process request " + request + " error", e);
353+
} finally {
354+
pageCount.incrementAndGet();
355+
signalNewUrl();
356+
}
357+
}
358+
});
334359
}
335360
stat.set(STAT_STOPPED);
336361
// release some resources
@@ -469,6 +494,7 @@ protected void sleep(int time) {
469494
Thread.sleep(time);
470495
} catch (InterruptedException e) {
471496
logger.error("Thread interrupted when sleep",e);
497+
Thread.currentThread().interrupt();
472498
}
473499
}
474500

@@ -565,16 +591,24 @@ public Spider addRequest(Request... requests) {
565591
return this;
566592
}
567593

568-
private void waitNewUrl() {
594+
/**
595+
*
596+
* @return {@code true} if the wait was interrupted, {@code false} otherwise
597+
*/
598+
private boolean waitNewUrl() {
599+
// now there may not be any thread live
569600
newUrlLock.lock();
570601
try {
571-
//double check
572-
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
573-
return;
602+
// double check; unnecessary unless concurrency is very fast
603+
if (threadPool.getThreadAlive() == 0) {
604+
return false;
574605
}
606+
//wait for amount of time
575607
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
608+
return false;
576609
} catch (InterruptedException e) {
577-
logger.warn("waitNewUrl - interrupted, error {}", e);
610+
// logger.warn("waitNewUrl - interrupted, error {}", e);
611+
return true;
578612
} finally {
579613
newUrlLock.unlock();
580614
}
@@ -772,7 +806,10 @@ public Scheduler getScheduler() {
772806
*
773807
* @param emptySleepTime In MILLISECONDS.
774808
*/
775-
public void setEmptySleepTime(int emptySleepTime) {
809+
public void setEmptySleepTime(long emptySleepTime) {
810+
if(emptySleepTime<=0){
811+
throw new IllegalArgumentException("emptySleepTime should be more than zero!");
812+
}
776813
this.emptySleepTime = emptySleepTime;
777814
}
778815
}

webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
*/
99
public interface SpiderListener {
1010

11-
public void onSuccess(Request request);
11+
void onSuccess(Request request);
1212

1313
/**
1414
* @deprecated Use {@link #onError(Request, Exception)} instead.
1515
*/
1616
@Deprecated
17-
public void onError(Request request);
17+
default void onError(Request request) {
18+
}
1819

1920
default void onError(Request request, Exception e) {
2021
this.onError(request);

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import us.codecraft.webmagic.Page;
44
import us.codecraft.webmagic.Request;
55
import us.codecraft.webmagic.Site;
6+
import us.codecraft.webmagic.Task;
67
import us.codecraft.webmagic.selector.Html;
78

89
/**
@@ -26,7 +27,7 @@ public Html download(String url) {
2627
/**
2728
* A simple method to download a url.
2829
*
29-
* @param url url
30+
* @param url url
3031
* @param charset charset
3132
* @return html
3233
*/
@@ -35,10 +36,26 @@ public Html download(String url, String charset) {
3536
return (Html) page.getHtml();
3637
}
3738

39+
@Deprecated
3840
protected void onSuccess(Request request) {
3941
}
4042

43+
/**
44+
* @since 0.7.6
45+
*/
46+
protected void onSuccess(Request request, Task task) {
47+
this.onSuccess(request);
48+
}
49+
50+
@Deprecated
4151
protected void onError(Request request) {
4252
}
4353

54+
/**
55+
* @since 0.7.6
56+
*/
57+
protected void onError(Request request, Task task, Throwable e) {
58+
this.onError(request);
59+
}
60+
4461
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,12 @@ public Page download(Request request, Task task) {
8282
try {
8383
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
8484
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
85-
onSuccess(request);
85+
onSuccess(request, task);
8686
logger.info("downloading page success {}", request.getUrl());
8787
return page;
8888
} catch (IOException e) {
8989
logger.warn("download page {} error", request.getUrl(), e);
90-
onError(request);
90+
onError(request, task, e);
9191
return page;
9292
} finally {
9393
if (httpResponse != null) {
@@ -110,7 +110,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http
110110
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
111111
Page page = new Page();
112112
page.setBytes(bytes);
113-
if (!request.isBinaryContent()){
113+
if (!request.isBinaryContent()) {
114114
if (charset == null) {
115115
charset = getHtmlCharset(contentType, bytes);
116116
}

webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@
44
import us.codecraft.webmagic.Site;
55

66
/**
7-
* Interface to be implemented to customize a crawler.<br>
8-
* <br>
7+
* Interface to be implemented to customize a crawler.
8+
*
9+
* <p>
910
* In PageProcessor, you can customize:
10-
* <br>
11-
* start urls and other settings in {@link Site}<br>
12-
* how the urls to fetch are detected <br>
13-
* how the data are extracted and stored <br>
11+
* </p>
12+
* <ul>
13+
* <li>start URLs and other settings in {@link Site}</li>
14+
* <li>how the URLs to fetch are detected</li>
15+
* <li>how the data are extracted and stored</li>
16+
* </ul>
1417
*
1518
* @author code4crafter@gmail.com <br>
1619
* @see Site
@@ -20,17 +23,20 @@
2023
public interface PageProcessor {
2124

2225
/**
23-
* process the page, extract urls to fetch, extract the data and store
26+
* Processes the page, extracts URLs to fetch, extracts the data and stores it.
2427
*
2528
* @param page page
2629
*/
27-
public void process(Page page);
30+
void process(Page page);
2831

2932
/**
30-
* get the site settings
33+
* Returns the site settings.
3134
*
3235
* @return site
3336
* @see Site
3437
*/
35-
public Site getSite();
38+
default Site getSite() {
39+
return Site.me();
40+
}
41+
3642
}

webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package us.codecraft.webmagic.selector;
22

3-
import org.apache.commons.collections.CollectionUtils;
43

54
import java.util.ArrayList;
65
import java.util.List;
6+
import org.apache.commons.collections4.CollectionUtils;
77

88
/**
99

webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package us.codecraft.webmagic.selector;
22

33
import org.jsoup.Jsoup;
4+
import org.jsoup.nodes.Document;
45
import org.jsoup.nodes.Element;
56

67
import java.util.ArrayList;
@@ -11,34 +12,47 @@
1112
* @since 0.3.0
1213
*/
1314
public abstract class BaseElementSelector implements Selector, ElementSelector {
15+
private Document parse(String text) {
16+
if (text == null) {
17+
return null;
18+
}
19+
20+
// Jsoup could not parse <tr></tr> or <td></td> tag directly
21+
// https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag
22+
if ((text.startsWith("<tr>") && text.endsWith("</tr>"))
23+
|| (text.startsWith("<td>") && text.endsWith("</td>"))) {
24+
text = "<table>" + text + "</table>";
25+
}
26+
return Jsoup.parse(text);
27+
}
1428

1529
@Override
1630
public String select(String text) {
1731
if (text != null) {
18-
return select(Jsoup.parse(text));
32+
return select(parse(text));
1933
}
2034
return null;
2135
}
2236

2337
@Override
2438
public List<String> selectList(String text) {
2539
if (text != null) {
26-
return selectList(Jsoup.parse(text));
40+
return selectList(parse(text));
2741
} else {
2842
return new ArrayList<String>();
2943
}
3044
}
3145

3246
public Element selectElement(String text) {
3347
if (text != null) {
34-
return selectElement(Jsoup.parse(text));
48+
return selectElement(parse(text));
3549
}
3650
return null;
3751
}
3852

3953
public List<Element> selectElements(String text) {
4054
if (text != null) {
41-
return selectElements(Jsoup.parse(text));
55+
return selectElements(parse(text));
4256
} else {
4357
return new ArrayList<Element>();
4458
}

0 commit comments

Comments
 (0)