Skip to content

Commit 95ab4ed

Browse files
committed
some bugfix
1 parent 250cc5e commit 95ab4ed

File tree

3 files changed

+64
-9
lines changed
  • webmagic-core/src/main/java/us/codecraft/webmagic
  • webmagic-extension/src/main/java/us/codecraft/webmagic/example
  • webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples

3 files changed

+64
-9
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Site.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -96,11 +96,6 @@ public String getUserAgent() {
9696
* @return get domain
9797
*/
9898
public String getDomain() {
99-
if (domain == null) {
100-
if (startUrls.size() > 0) {
101-
domain = UrlUtils.getDomain(startUrls.get(0));
102-
}
103-
}
10499
return domain;
105100
}
106101

@@ -176,6 +171,11 @@ public List<String> getStartUrls() {
176171
*/
177172
public Site addStartUrl(String startUrl) {
178173
this.startUrls.add(startUrl);
174+
if (domain == null) {
175+
if (startUrls.size() > 0) {
176+
domain = UrlUtils.getDomain(startUrls.get(0));
177+
}
178+
}
179179
return this;
180180
}
181181

webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
package us.codecraft.webmagic.example;
2+
3+
import us.codecraft.webmagic.Page;
4+
import us.codecraft.webmagic.Site;
5+
import us.codecraft.webmagic.model.AfterExtractor;
6+
import us.codecraft.webmagic.model.OOSpider;
7+
import us.codecraft.webmagic.model.annotation.ExtractBy;
8+
import us.codecraft.webmagic.model.annotation.Formatter;
9+
import us.codecraft.webmagic.model.annotation.TargetUrl;
10+
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
11+
12+
import java.util.List;
13+
14+
/**
15+
* @author [email protected] <br>
16+
*/
17+
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
18+
public class OschinaBlog implements AfterExtractor{
19+
20+
@ExtractBy("//title/text()")
21+
private String title;
22+
23+
@ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
24+
private String content;
25+
26+
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
27+
private List<String> tags;
28+
29+
@Formatter("YYYY-MM-dd HH:mm")
30+
@ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')")
31+
private String date;
32+
33+
public static void main(String[] args) {
34+
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
35+
,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run();
36+
}
37+
38+
public String getTitle() {
39+
return title;
40+
}
41+
42+
public String getContent() {
43+
return content;
44+
}
45+
46+
public List<String> getTags() {
47+
return tags;
48+
}
49+
50+
// public Date getDate() {
51+
// return date;
52+
// }
53+
54+
@Override
55+
public void afterProcess(Page page) {
56+
System.out.println(date);
57+
System.out.println(title);
58+
}
59+
}

webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
import us.codecraft.webmagic.model.annotation.TargetUrl;
77
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
88

9-
import java.util.Date;
109
import java.util.List;
1110

1211
/**
@@ -24,9 +23,6 @@ public class OschinaBlog{
2423
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
2524
private List<String> tags;
2625

27-
@ExtractBy("//div[class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')")
28-
private Date date;
29-
3026
public static void main(String[] args) {
3127
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
3228
,new JsonFilePageModelPipeline(), OschinaBlog.class).run();

0 commit comments

Comments
 (0)