File tree Expand file tree Collapse file tree 3 files changed +31
-5
lines changed
webmagic-core/src/main/java/us/codecraft/webmagic
webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples Expand file tree Collapse file tree 3 files changed +31
-5
lines changed Original file line number Diff line number Diff line change @@ -28,13 +28,15 @@ Release Notes
28
28
29
29
}
30
30
31
+ 增加一个Spider.test(url)方法,用于开发爬虫时进行调试。
32
+
31
33
增加基于redis的分布式支持。
32
34
33
35
增加XPath2.0语法支持(webmagic-saxon模块)。
34
36
35
37
增加基于Selenium的浏览器渲染支持,用于抓取动态加载内容(webmagic-selenium模块)。
36
38
37
- 修复一些已有bug 。
39
+ 修复了不支持https的bug。
38
40
39
41
补充了文档:[webmagic-0.2.0用户手册](http://code4craft.github.io/webmagic/)。
40
42
Original file line number Diff line number Diff line change @@ -220,9 +220,17 @@ private void destroyEach(Object object) {
220
220
}
221
221
}
222
222
223
- public void test (String url ){
223
+ /**
224
+ * 用某些特定URL进行爬虫测试
225
+ * @param urls 要抓取的url
226
+ */
227
+ public void test (String ... urls ){
224
228
checkComponent ();
225
- processRequest (new Request (url ));
229
+ if (urls .length >0 ){
230
+ for (String url : urls ) {
231
+ processRequest (new Request (url ));
232
+ }
233
+ }
226
234
}
227
235
228
236
private void processRequest (Request request ) {
Original file line number Diff line number Diff line change 8
8
import us .codecraft .webmagic .model .annotation .HelpUrl ;
9
9
import us .codecraft .webmagic .model .annotation .TargetUrl ;
10
10
import us .codecraft .webmagic .pipeline .JsonFilePageModelPipeline ;
11
+ import us .codecraft .webmagic .scheduler .FileCacheQueueScheduler ;
11
12
12
13
import java .util .List ;
13
14
@@ -32,12 +33,19 @@ public class GithubRepo implements HasKey {
32
33
@ ExtractBy (value = "//div[@class='repository-lang-stats']//li//span[@class='lang']" ,multi = true )
33
34
private List <String > language ;
34
35
36
+ @ ExtractBy ("//a[@class='social-count js-social-count']/text()" )
37
+ private String star ;
38
+
39
+ @ ExtractBy ("//a[@class='social-count js-social-count']/text()" )
40
+ private String fork ;
41
+
35
42
@ ExtractByUrl
36
43
private String url ;
37
44
38
45
public static void main (String [] args ) {
39
- OOSpider .create (Site .me ().addStartUrl ("https://github.com/explore" ).setSleepTime (0 ),
40
- new JsonFilePageModelPipeline (), GithubRepo .class ).thread (15 ).run ();
46
+ OOSpider .create (Site .me ().addStartUrl ("https://github.com/explore" ).setSleepTime (0 ).setRetryTimes (3 ),
47
+ new JsonFilePageModelPipeline (), GithubRepo .class )
48
+ .scheduler (new FileCacheQueueScheduler ("/data/webmagic/cache/" )).thread (15 ).run ();
41
49
}
42
50
43
51
@ Override
@@ -64,4 +72,12 @@ public List<String> getLanguage() {
64
72
public String getUrl () {
65
73
return url ;
66
74
}
75
+
76
+ public String getStar () {
77
+ return star ;
78
+ }
79
+
80
+ public String getFork () {
81
+ return fork ;
82
+ }
67
83
}
You can’t perform that action at this time.
0 commit comments