Skip to content

Commit 4c57985

Browse files
committed
update redisscheduler
1 parent 9d197ec commit 4c57985

File tree

10 files changed

+157
-95
lines changed

10 files changed

+157
-95
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,15 @@ public void setRequest(Request request) {
148148
public ResultItems getResultItems() {
149149
return resultItems;
150150
}
151+
152+
@Override
153+
public String toString() {
154+
return "Page{" +
155+
"request=" + request +
156+
", resultItems=" + resultItems +
157+
", html=" + html +
158+
", url=" + url +
159+
", targetRequests=" + targetRequests +
160+
'}';
161+
}
151162
}

webmagic-core/src/main/java/us/codecraft/webmagic/Request.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,4 +113,13 @@ public void setExtras(Map<String, Object> extras) {
113113
public void setUrl(String url) {
114114
this.url = url;
115115
}
116+
117+
@Override
118+
public String toString() {
119+
return "Request{" +
120+
"url='" + url + '\'' +
121+
", extras=" + extras +
122+
", priority=" + priority +
123+
'}';
124+
}
116125
}

webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -40,33 +40,33 @@
4040
*/
4141
public class Spider implements Runnable, Task {
4242

43-
private Downloader downloader;
43+
protected Downloader downloader;
4444

45-
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
45+
protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
4646

47-
private PageProcessor pageProcessor;
47+
protected PageProcessor pageProcessor;
4848

49-
private List<String> startUrls;
49+
protected List<String> startUrls;
5050

51-
private Site site;
51+
protected Site site;
5252

53-
private String uuid;
53+
protected String uuid;
5454

55-
private Scheduler scheduler = new QueueScheduler();
55+
protected Scheduler scheduler = new QueueScheduler();
5656

57-
private Logger logger = Logger.getLogger(getClass());
57+
protected Logger logger = Logger.getLogger(getClass());
5858

59-
private ExecutorService executorService;
59+
protected ExecutorService executorService;
6060

61-
private int threadNum = 1;
61+
protected int threadNum = 1;
6262

63-
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
63+
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
6464

65-
private final static int STAT_INIT = 0;
65+
protected final static int STAT_INIT = 0;
6666

67-
private final static int STAT_RUNNING = 1;
67+
protected final static int STAT_RUNNING = 1;
6868

69-
private final static int STAT_STOPPED = 2;
69+
protected final static int STAT_STOPPED = 2;
7070

7171
/**
7272
* 使用已定义的抽取规则新建一个Spider。
@@ -206,7 +206,7 @@ public void run() {
206206
destroy();
207207
}
208208

209-
private void destroy() {
209+
protected void destroy() {
210210
destroyEach(downloader);
211211
destroyEach(pageProcessor);
212212
for (Pipeline pipeline : pipelines) {
@@ -233,7 +233,7 @@ public void test(String... urls){
233233
}
234234
}
235235

236-
private void processRequest(Request request) {
236+
protected void processRequest(Request request) {
237237
Page page = downloader.download(request, this);
238238
if (page == null) {
239239
sleep(site.getSleepTime());
@@ -249,23 +249,23 @@ private void processRequest(Request request) {
249249
sleep(site.getSleepTime());
250250
}
251251

252-
private void sleep(int time) {
252+
protected void sleep(int time) {
253253
try {
254254
Thread.sleep(time);
255255
} catch (InterruptedException e) {
256256
e.printStackTrace();
257257
}
258258
}
259259

260-
private void addRequest(Page page) {
260+
protected void addRequest(Page page) {
261261
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
262262
for (Request request : page.getTargetRequests()) {
263263
scheduler.push(request, this);
264264
}
265265
}
266266
}
267267

268-
private void checkIfNotRunning() {
268+
protected void checkIfNotRunning() {
269269
if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) {
270270
throw new IllegalStateException("Spider is already running!");
271271
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,7 @@ public Page download(Request request, Task task) {
6666
}
6767
//
6868
handleGzip(httpResponse);
69-
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
70-
charset);
71-
Page page = new Page();
72-
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
73-
page.setUrl(new PlainText(request.getUrl()));
74-
page.setRequest(request);
75-
return page;
69+
return handleResponse(request, charset, httpResponse,task);
7670
} else {
7771
logger.warn("code error " + statusCode + "\t" + request.getUrl());
7872
}
@@ -82,6 +76,16 @@ public Page download(Request request, Task task) {
8276
return null;
8377
}
8478

79+
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse,Task task) throws IOException {
80+
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
81+
charset);
82+
Page page = new Page();
83+
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
84+
page.setUrl(new PlainText(request.getUrl()));
85+
page.setRequest(request);
86+
return page;
87+
}
88+
8589
@Override
8690
public void setThread(int thread) {
8791
poolSize=thread;

webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import org.apache.log4j.Logger;
55
import us.codecraft.webmagic.ResultItems;
66
import us.codecraft.webmagic.Task;
7+
import us.codecraft.webmagic.utils.FilePersistentBase;
78

8-
import java.io.File;
99
import java.io.FileWriter;
1010
import java.io.IOException;
1111
import java.io.PrintWriter;
@@ -18,17 +18,15 @@
1818
* Date: 13-4-21
1919
* Time: 下午6:28
2020
*/
21-
public class FilePipeline implements Pipeline {
22-
23-
private String path = "/data/webmagic/";
21+
public class FilePipeline extends FilePersistentBase implements Pipeline {
2422

2523
private Logger logger = Logger.getLogger(getClass());
2624

2725
/**
2826
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
2927
*/
3028
public FilePipeline() {
31-
29+
setPath("/data/webmagic/");
3230
}
3331

3432
/**
@@ -37,21 +35,14 @@ public FilePipeline() {
3735
* @param path 文件保存路径
3836
*/
3937
public FilePipeline(String path) {
40-
if (!path.endsWith("/")&&!path.endsWith("\\")){
41-
path+="/";
42-
}
43-
this.path = path;
38+
setPath(path);
4439
}
4540

4641
@Override
4742
public void process(ResultItems resultItems, Task task) {
48-
String path = this.path + "/" + task.getUUID() + "/";
49-
File file = new File(path);
50-
if (!file.exists()) {
51-
file.mkdirs();
52-
}
43+
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
5344
try {
54-
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
45+
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
5546
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
5647
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
5748
if (entry.getValue() instanceof Iterable) {
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package us.codecraft.webmagic.utils;
2+
3+
import java.io.File;
4+
5+
/**
6+
* 文件持久化的基础类。<br>
7+
*
8+
* @author [email protected] <br>
9+
* Date: 13-8-11 <br>
10+
* Time: 下午4:21 <br>
11+
*/
12+
/**
 * Base class for components that persist their output as files below a
 * configurable directory.
 */
public class FilePersistentBase {

    /** Base directory; after {@link #setPath(String)} it always ends with a path separator. */
    protected String path;

    /**
     * Path separator of the current platform, read from the
     * {@code file.separator} system property ("/" when the property is absent).
     * Kept non-final so existing code that assigns it continues to compile.
     */
    public static String PATH_SEPERATOR = "/";

    static {
        String property = System.getProperties().getProperty("file.separator");
        if (property != null) {
            PATH_SEPERATOR = property;
        }
    }

    /**
     * Sets the base directory, appending a trailing separator when it is missing.
     *
     * @param path base directory
     * @throws NullPointerException if {@code path} is null
     */
    public void setPath(String path) {
        // Normalize first, then store: the stored value must carry the trailing separator.
        if (!path.endsWith("/") && !path.endsWith(PATH_SEPERATOR)) {
            path += PATH_SEPERATOR;
        }
        this.path = path;
    }

    /**
     * Returns a {@link File} for the given name after making sure its parent
     * directory exists.
     *
     * @param fullName full file name including the directory part
     * @return a file object for {@code fullName}; the file itself is not created
     */
    public File getFile(String fullName) {
        checkAndMakeParentDirecotry(fullName);
        return new File(fullName);
    }

    /**
     * Creates the directory part of {@code fullName} (everything before the
     * last separator) if it does not exist yet.
     *
     * @param fullName full file name including the directory part
     */
    public void checkAndMakeParentDirecotry(String fullName) {
        // Callers build names with both "/" and the platform separator, so
        // honour whichever occurs last.
        int index = Math.max(fullName.lastIndexOf('/'), fullName.lastIndexOf(PATH_SEPERATOR));
        if (index > 0) {
            File directory = new File(fullName.substring(0, index));
            if (!directory.exists()) {
                directory.mkdirs();
            }
        }
    }

    /**
     * @return the base directory as stored by {@link #setPath(String)}
     */
    public String getPath() {
        return path;
    }
}

webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import us.codecraft.webmagic.Site;
44
import us.codecraft.webmagic.Spider;
5+
import us.codecraft.webmagic.processor.PageProcessor;
56

67
/**
78
* 基于Model的Spider,封装后的入口类。<br>
@@ -20,6 +21,10 @@ protected OOSpider(ModelPageProcessor modelPageProcessor) {
2021
this.modelPageProcessor = modelPageProcessor;
2122
}
2223

24+
/**
 * Creates a spider driven by an arbitrary {@link PageProcessor}.
 * <p>
 * This constructor only delegates to the superclass constructor; it does not
 * assign {@code modelPageProcessor}.
 *
 * @param pageProcessor the processor handed to the superclass constructor
 */
public OOSpider(PageProcessor pageProcessor) {
    super(pageProcessor);
}
27+
2328
/**
2429
* 创建一个爬虫。<br>
2530
* @param site

webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
import us.codecraft.webmagic.Task;
88
import us.codecraft.webmagic.model.HasKey;
99
import us.codecraft.webmagic.model.PageModelPipeline;
10+
import us.codecraft.webmagic.utils.FilePersistentBase;
1011

11-
import java.io.File;
1212
import java.io.FileWriter;
1313
import java.io.IOException;
1414
import java.io.PrintWriter;
@@ -21,46 +21,37 @@
2121
* Date: 13-4-21
2222
* Time: 下午6:28
2323
*/
24-
public class JsonFilePageModelPipeline implements PageModelPipeline {
25-
26-
private String path = "/data/webmagic/";
24+
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
2725

2826
private Logger logger = Logger.getLogger(getClass());
2927

3028
/**
31-
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
29+
* 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/"
3230
*/
3331
public JsonFilePageModelPipeline() {
34-
32+
setPath("/data/webmagic/");
3533
}
3634

3735
/**
38-
* 新建一个FilePipeline
36+
* 新建一个JsonFilePageModelPipeline
3937
*
4038
* @param path 文件保存路径
4139
*/
4240
public JsonFilePageModelPipeline(String path) {
43-
if (!path.endsWith("/") && !path.endsWith("\\")) {
44-
path += "/";
45-
}
46-
this.path = path;
41+
setPath(path);
4742
}
4843

4944
@Override
5045
public void process(Object o, Task task) {
5146
String path = this.path + "/" + task.getUUID() + "/";
52-
File file = new File(path);
53-
if (!file.exists()) {
54-
file.mkdirs();
55-
}
5647
try {
5748
String filename;
5849
if (o instanceof HasKey) {
5950
filename = path + ((HasKey)o).key() + ".json";
6051
} else {
6152
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
6253
}
63-
PrintWriter printWriter = new PrintWriter(new FileWriter(filename));
54+
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
6455
printWriter.write(JSON.toJSONString(o));
6556
printWriter.close();
6657
} catch (IOException e) {

webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import org.apache.log4j.Logger;
66
import us.codecraft.webmagic.ResultItems;
77
import us.codecraft.webmagic.Task;
8+
import us.codecraft.webmagic.utils.FilePersistentBase;
89

910
import java.io.File;
1011
import java.io.FileWriter;
@@ -18,40 +19,31 @@
1819
* Date: 13-4-21
1920
* Time: 下午6:28
2021
*/
21-
public class JsonFilePipeline implements Pipeline {
22-
23-
private String path = "/data/webmagic/";
22+
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
2423

2524
private Logger logger = Logger.getLogger(getClass());
2625

2726
/**
28-
* 新建一个FilePipeline,使用默认保存路径"/data/webmagic/"
27+
* 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/"
2928
*/
3029
public JsonFilePipeline() {
31-
30+
setPath("/data/webmagic");
3231
}
3332

3433
/**
35-
* 新建一个FilePipeline
34+
* 新建一个JsonFilePipeline
3635
*
3736
* @param path 文件保存路径
3837
*/
3938
public JsonFilePipeline(String path) {
40-
if (!path.endsWith("/")&&!path.endsWith("\\")){
41-
path+="/";
42-
}
43-
this.path = path;
39+
setPath(path);
4440
}
4541

4642
@Override
4743
public void process(ResultItems resultItems, Task task) {
4844
String path = this.path + "/" + task.getUUID() + "/";
49-
File file = new File(path);
50-
if (!file.exists()) {
51-
file.mkdirs();
52-
}
5345
try {
54-
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
46+
// getFile(...) is inherited from FilePersistentBase and creates the parent
// directory when it is missing; a plain new File(...) does not, so writing
// would fail for a task whose directory does not exist yet.
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
5547
printWriter.write(JSON.toJSONString(resultItems.getAll()));
5648
printWriter.close();
5749
} catch (IOException e) {

0 commit comments

Comments
 (0)