Skip to content

Commit b131878

Browse files
committed
add example
1 parent 95ab4ed commit b131878

File tree

8 files changed

+161
-20
lines changed

8 files changed

+161
-20
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package us.codecraft.webmagic.processor.example;
2+
3+
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Site;
5+
import us.codecraft.webmagic.Spider;
6+
import us.codecraft.webmagic.processor.PageProcessor;
7+
8+
/**
9+
* @author [email protected] <br>
10+
* @since 0.3.2
11+
*/
12+
public class GithubRepoPageProcesser implements PageProcessor {
13+
14+
private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100);
15+
16+
@Override
17+
public void process(Page page) {
18+
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
19+
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
20+
page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
21+
if (page.getResultItems().get("name")==null){
22+
//skip this page
23+
page.setSkip(true);
24+
}
25+
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
26+
}
27+
28+
@Override
29+
public Site getSite() {
30+
return site;
31+
}
32+
33+
public static void main(String[] args) {
34+
Spider.create(new GithubRepoPageProcesser()).thread(5).run();
35+
}
36+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package us.codecraft.webmagic.processor.example;
2+
3+
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Site;
5+
import us.codecraft.webmagic.Spider;
6+
import us.codecraft.webmagic.processor.PageProcessor;
7+
8+
import java.util.List;
9+
10+
/**
11+
* @author [email protected] <br>
12+
*/
13+
public class OschinaBlogPageProcesser implements PageProcessor {
14+
15+
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
16+
17+
@Override
18+
public void process(Page page) {
19+
List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
20+
page.addTargetRequests(links);
21+
page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
22+
if (page.getResultItems().get("title") == null) {
23+
//skip this page
24+
page.setSkip(true);
25+
}
26+
page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
27+
page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
28+
}
29+
30+
@Override
31+
public Site getSite() {
32+
return site;
33+
34+
}
35+
36+
public static void main(String[] args) {
37+
Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
38+
}
39+
}

webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java

+11-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package us.codecraft.webmagic.example;
22

3+
import us.codecraft.webmagic.Site;
4+
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
35
import us.codecraft.webmagic.model.HasKey;
6+
import us.codecraft.webmagic.model.OOSpider;
47
import us.codecraft.webmagic.model.annotation.ExtractBy;
58
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
69
import us.codecraft.webmagic.model.annotation.HelpUrl;
@@ -10,6 +13,7 @@
1013

1114
/**
1215
* @author [email protected] <br>
16+
* @since 0.3.2
1317
*/
1418
@TargetUrl("https://github.com/\\w+/\\w+")
1519
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
@@ -27,15 +31,20 @@ public class GithubRepo implements HasKey {
2731
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true)
2832
private List<String> language;
2933

30-
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
34+
@ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()")
3135
private int star;
3236

33-
@ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
37+
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
3438
private int fork;
3539

3640
@ExtractByUrl
3741
private String url;
3842

43+
public static void main(String[] args) {
44+
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100)
45+
, new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run();
46+
}
47+
3948
@Override
4049
public String key() {
4150
return author + ":" + name;
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,38 @@
11
package us.codecraft.webmagic.example;
22

3-
import us.codecraft.webmagic.Page;
43
import us.codecraft.webmagic.Site;
5-
import us.codecraft.webmagic.model.AfterExtractor;
64
import us.codecraft.webmagic.model.OOSpider;
75
import us.codecraft.webmagic.model.annotation.ExtractBy;
86
import us.codecraft.webmagic.model.annotation.Formatter;
97
import us.codecraft.webmagic.model.annotation.TargetUrl;
108
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
119

10+
import java.util.Date;
1211
import java.util.List;
1312

1413
/**
1514
* @author [email protected] <br>
15+
* @since 0.3.2
1616
*/
1717
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
18-
public class OschinaBlog implements AfterExtractor{
18+
public class OschinaBlog {
1919

2020
@ExtractBy("//title/text()")
2121
private String title;
2222

23-
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
23+
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
2424
private String content;
2525

2626
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
2727
private List<String> tags;
2828

29-
@Formatter("YYYY-MM-dd HH:mm")
29+
@Formatter("yyyy-MM-dd HH:mm")
3030
@ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')")
31-
private String date;
31+
private Date date;
3232

3333
public static void main(String[] args) {
3434
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
35-
,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run();
35+
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run();
3636
}
3737

3838
public String getTitle() {
@@ -47,13 +47,8 @@ public List<String> getTags() {
4747
return tags;
4848
}
4949

50-
// public Date getDate() {
51-
// return date;
52-
// }
53-
54-
@Override
55-
public void afterProcess(Page page) {
56-
System.out.println(date);
57-
System.out.println(title);
50+
public Date getDate() {
51+
return date;
5852
}
53+
5954
}

webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java

+13-2
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ private ObjectFormatter getObjectFormatter(Field field, Class<?> fieldClazz) {
105105
Formatter formatter = field.getAnnotation(Formatter.class);
106106
if (formatter != null) {
107107
if (!formatter.formatter().equals(ObjectFormatter.class)) {
108-
return initFormatter(formatter.formatter());
108+
ObjectFormatter objectFormatter = initFormatter(formatter.formatter());
109+
objectFormatter.initParam(formatter.value());
109110
}
110111
}
111112
return initFormatter(ObjectFormatters.get(fieldClazz));
@@ -311,6 +312,9 @@ private Object processSingle(Page page, String html, boolean isRaw) {
311312
}
312313
if (fieldExtractor.getObjectFormatter() != null) {
313314
Object converted = convert(value, fieldExtractor.getObjectFormatter());
315+
if (converted == null && fieldExtractor.isNotNull()) {
316+
return null;
317+
}
314318
setField(o, fieldExtractor, converted);
315319
} else {
316320
setField(o, fieldExtractor, value);
@@ -332,7 +336,11 @@ private Object processSingle(Page page, String html, boolean isRaw) {
332336

333337
private Object convert(String value, ObjectFormatter objectFormatter) {
334338
try {
335-
return objectFormatter.format(value);
339+
Object format = objectFormatter.format(value);
340+
if (logger.isDebugEnabled()) {
341+
logger.debug("String " + value + " is converted to " + format);
342+
}
343+
return format;
336344
} catch (Exception e) {
337345
logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
338346
}
@@ -351,6 +359,9 @@ private List<Object> convert(List<String> values, ObjectFormatter objectFormatte
351359
}
352360

353361
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
362+
if (value==null){
363+
return;
364+
}
354365
if (fieldExtractor.getSetterMethod() != null) {
355366
fieldExtractor.getSetterMethod().invoke(o, value);
356367
}

webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
*/
1111
public class DateFormatter implements ObjectFormatter<Date> {
1212

13-
private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"};
13+
private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"};
1414

1515
@Override
1616
public Date format(String raw) throws Exception {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package us.codecraft.webmagic.formatter;
2+
3+
import org.junit.Test;
4+
import us.codecraft.webmagic.model.formatter.DateFormatter;
5+
6+
import java.util.Date;
7+
8+
/**
9+
10+
*/
11+
public class DateFormatterTest {
12+
13+
@Test
14+
public void testDateFormatter() throws Exception {
15+
DateFormatter dateFormatter = new DateFormatter();
16+
dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"});
17+
Date format = dateFormatter.format("2013-09-10 22:11");
18+
System.out.println(format);
19+
}
20+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
3+
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
4+
5+
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
6+
<layout class="org.apache.log4j.PatternLayout">
7+
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
8+
</layout>
9+
</appender>
10+
11+
<logger name="org.springframework" additivity="false">
12+
<level value="warn" />
13+
<appender-ref ref="stdout" />
14+
</logger>
15+
16+
<logger name="org.apache" additivity="false">
17+
<level value="warn" />
18+
<appender-ref ref="stdout" />
19+
</logger>
20+
21+
<logger name="net.sf.ehcache" additivity="false">
22+
<level value="warn" />
23+
<appender-ref ref="stdout" />
24+
</logger>
25+
26+
<root>
27+
<level value="debug" />
28+
<appender-ref ref="stdout" />
29+
</root>
30+
31+
</log4j:configuration>

0 commit comments

Comments
 (0)