Skip to content

Commit 2aee2e0

Browse files
committed
update pipeline api
1 parent 4e3ee63 commit 2aee2e0

File tree

26 files changed

+119
-95
lines changed

26 files changed

+119
-95
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

Lines changed: 6 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
import java.util.ArrayList;
88
import java.util.List;
9-
import java.util.Map;
10-
import java.util.concurrent.ConcurrentHashMap;
119

1210
/**
1311
* <pre>
@@ -27,52 +25,24 @@ public class Page {
2725

2826
private Request request;
2927

30-
private Map<String, Selectable> fields = new ConcurrentHashMap<String, Selectable>();
28+
private ResultItems resultItems = new ResultItems();
3129

3230
private Selectable html;
3331

3432
private Selectable url;
3533

3634
private List<Request> targetRequests = new ArrayList<Request>();
3735

38-
private boolean skip;
39-
40-
private Object extra;
41-
42-
/**
43-
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
44-
* @return 是否忽略 true 忽略
45-
*/
46-
public boolean isSkip() {
47-
return skip;
48-
}
49-
50-
/**
51-
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
52-
* @param skip 是否忽略 true 忽略
53-
*/
54-
public void setSkip(boolean skip) {
55-
this.skip = skip;
56-
}
57-
5836
public Page() {
5937
}
6038

61-
/**
62-
* 获取抽取的结果,在{@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
63-
* @return fields 抽取的结果
64-
*/
65-
public Map<String, Selectable> getFields() {
66-
return fields;
67-
}
68-
6939
/**
7040
* 保存抽取的结果
7141
* @param key 结果的key
7242
* @param field 结果的value
7343
*/
74-
public void putField(String key, Selectable field) {
75-
fields.put(key, field);
44+
public void putField(String key, Object field) {
45+
resultItems.put(key, field);
7646
}
7747

7848
/**
@@ -157,23 +127,10 @@ public Request getRequest() {
157127

158128
public void setRequest(Request request) {
159129
this.request = request;
130+
this.resultItems.setRequest(request);
160131
}
161132

162-
/**
163-
* 获取附加对象
164-
* @param <T> 对象类型
165-
* @return 对象内容
166-
*/
167-
public <T> T getExtra() {
168-
return (T)extra;
169-
}
170-
171-
/**
172-
* 设置附加对象
173-
* @param extra 对象内容
174-
* @param <T> 对象类型
175-
*/
176-
public <T> void setExtra(T extra) {
177-
this.extra = extra;
133+
public ResultItems getResultItems() {
134+
return resultItems;
178135
}
179136
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
package us.codecraft.webmagic;
2+
3+
import java.util.HashMap;
4+
import java.util.Map;
5+
6+
/**
7+
* 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。<br>
8+
* @author [email protected] <br>
9+
* @date: 13-7-25 <br>
10+
* Time: 下午12:20 <br>
11+
*/
12+
public class ResultItems {
13+
14+
private Map<String, Object> fields = new HashMap<String, Object>();
15+
16+
private Request request;
17+
18+
private boolean skip;
19+
20+
public <T> T get(String key) {
21+
Object o = fields.get(key);
22+
if (o == null) {
23+
return null;
24+
}
25+
return (T) fields.get(key);
26+
}
27+
28+
public Map<String, Object> getAll() {
29+
return fields;
30+
}
31+
32+
public <T> ResultItems put(String key, T value) {
33+
fields.put(key, value);
34+
return this;
35+
}
36+
37+
public Request getRequest() {
38+
return request;
39+
}
40+
41+
public ResultItems setRequest(Request request) {
42+
this.request = request;
43+
return this;
44+
}
45+
46+
/**
47+
* 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
48+
* @return 是否忽略 true 忽略
49+
*/
50+
public boolean isSkip() {
51+
return skip;
52+
}
53+
54+
55+
/**
56+
* 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
57+
* @param skip
58+
* @return this
59+
*/
60+
public ResultItems setSkip(boolean skip) {
61+
this.skip = skip;
62+
return this;
63+
}
64+
}

webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ private void processRequest(Request request) {
196196
pageProcessor.process(page);
197197
addRequest(page);
198198
for (Pipeline pipeline : pipelines) {
199-
pipeline.process(page, this);
199+
pipeline.process(page.getResultItems(), this);
200200
}
201201
sleep(site.getSleepTime());
202202
}
Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
package us.codecraft.webmagic.pipeline;
22

3-
import us.codecraft.webmagic.Page;
3+
import us.codecraft.webmagic.ResultItems;
44
import us.codecraft.webmagic.Task;
5-
import us.codecraft.webmagic.selector.Selectable;
65

76
import java.util.Map;
87

@@ -15,13 +14,10 @@
1514
public class ConsolePipeline implements Pipeline{
1615

1716
@Override
18-
public void process(Page page,Task task) {
19-
System.out.println("get page: "+page.getUrl());
20-
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
21-
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());
22-
}
23-
if (page.getExtra()!=null){
24-
System.out.println(page.getExtra());
17+
public void process(ResultItems resultItems,Task task) {
18+
System.out.println("get page: "+resultItems.getRequest().getUrl());
19+
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
20+
System.out.println(entry.getKey()+":\t"+entry.getValue());
2521
}
2622
}
2723
}

webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22

33
import org.apache.commons.codec.digest.DigestUtils;
44
import org.apache.log4j.Logger;
5-
import us.codecraft.webmagic.Page;
5+
import us.codecraft.webmagic.ResultItems;
66
import us.codecraft.webmagic.Task;
77

88
import java.io.File;
99
import java.io.FileWriter;
1010
import java.io.IOException;
1111
import java.io.PrintWriter;
12+
import java.util.Map;
1213

1314
/**
1415
* 持久化到文件的接口。
@@ -38,16 +39,18 @@ public FilePipeline(String path) {
3839
}
3940

4041
@Override
41-
public void process(Page page, Task task) {
42+
public void process(ResultItems resultItems, Task task) {
4243
String path = this.path + "/" + task.getUUID() + "/";
4344
File file = new File(path);
4445
if (!file.exists()) {
4546
file.mkdirs();
4647
}
4748
try {
48-
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(page.getUrl().toString())));
49-
printWriter.println("url:\t" + page.getUrl());
50-
printWriter.println("html:\t" + page.getHtml());
49+
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())));
50+
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
51+
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
52+
printWriter.println(entry.getKey()+":\t"+entry.getValue());
53+
}
5154
printWriter.close();
5255
} catch (IOException e) {
5356
logger.warn("write file error",e);

webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package us.codecraft.webmagic.pipeline;
22

3-
import us.codecraft.webmagic.Page;
3+
import us.codecraft.webmagic.ResultItems;
44
import us.codecraft.webmagic.Task;
55

66
/**
@@ -11,5 +11,5 @@
1111
*/
1212
public interface Pipeline {
1313

14-
public void process(Page page,Task task);
14+
public void process(ResultItems resultItems,Task task);
1515
}

webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ public SimplePageProcessor(String startUrl, String urlPattern) {
3030

3131
@Override
3232
public void process(Page page) {
33-
List<String> requests = page.getHtml().links().regex(urlPattern).toStrings();
33+
List<String> requests = page.getHtml().links().regex(urlPattern).all();
3434
//调用page.addTargetRequests()方法添加待抓取链接
3535
page.addTargetRequests(requests);
3636
//xpath方式抽取
3737
page.putField("title", page.getHtml().xpath("//title"));
3838
//sc表示使用Readability技术抽取正文
39+
page.putField("html", page.getHtml().toString());
3940
page.putField("content", page.getHtml().smartContent());
4041
}
4142

webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,14 @@ public Selectable replace(String regex, String replacement) {
8282
}
8383

8484
@Override
85-
public List<String> toStrings() {
85+
public List<String> all() {
8686
return strings;
8787
}
8888

8989
@Override
9090
public String toString() {
91-
if (CollectionUtils.isNotEmpty(toStrings())) {
92-
return toStrings().get(0);
91+
if (CollectionUtils.isNotEmpty(all())) {
92+
return all().get(0);
9393
} else {
9494
return null;
9595
}

webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,5 +69,5 @@ public interface Selectable {
6969
*
7070
* @return multi string result
7171
*/
72-
public List<String> toStrings();
72+
public List<String> all();
7373
}

webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,7 @@ public void test() {
13511351
public void testOschina() {
13521352
Html html1 = new Html(html);
13531353
Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString());
1354-
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").toStrings());
1354+
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
13551355
}
13561356

13571357
}

0 commit comments

Comments
 (0)