Skip to content

Commit 4e3ee63

Browse files
committed
invite redis for distribution
1 parent 02a7f4e commit 4e3ee63

File tree

4 files changed

+98
-6
lines changed

4 files changed

+98
-6
lines changed

webmagic-plugin/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
<artifactId>freemarker</artifactId>
2626
<version>2.3.15</version>
2727
</dependency>
28+
<dependency>
29+
<groupId>redis.clients</groupId>
30+
<artifactId>jedis</artifactId>
31+
<version>2.0.0</version>
32+
</dependency>
2833
</dependencies>
2934

3035
<build>
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package us.codecraft.webmagic.scheduler;
2+
3+
import redis.clients.jedis.Jedis;
4+
import redis.clients.jedis.JedisPool;
5+
import redis.clients.jedis.JedisPoolConfig;
6+
import us.codecraft.webmagic.Request;
7+
import us.codecraft.webmagic.Task;
8+
import us.codecraft.webmagic.schedular.Scheduler;
9+
10+
/**
11+
* 使用redis管理url,构建一个分布式的爬虫。<br>
12+
* @author [email protected] <br>
13+
* @date: 13-7-25 <br>
14+
* Time: 上午7:07 <br>
15+
*/
16+
public class RedisScheduler implements Scheduler{
17+
18+
private JedisPool pool;
19+
20+
private static final String QUEUE_PREFIX = "queue_";
21+
22+
private static final String SET_PREFIX = "set_";
23+
24+
public RedisScheduler(String host){
25+
pool = new JedisPool(new JedisPoolConfig(), host);
26+
}
27+
28+
@Override
29+
public synchronized void push(Request request, Task task) {
30+
Jedis jedis = pool.getResource();
31+
if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){
32+
jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl());
33+
jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl());
34+
}
35+
pool.returnResource(jedis);
36+
}
37+
38+
@Override
39+
public synchronized Request poll(Task task) {
40+
Jedis jedis = pool.getResource();
41+
String url = jedis.lpop(QUEUE_PREFIX+task.getUUID());
42+
pool.returnResource(jedis);
43+
return new Request(url);
44+
}
45+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package us.codecraft.webmagic.scheduler;
2+
3+
import org.junit.Before;
4+
import org.junit.Test;
5+
import us.codecraft.webmagic.Request;
6+
import us.codecraft.webmagic.Site;
7+
import us.codecraft.webmagic.Task;
8+
9+
/**
10+
* @author [email protected] <br>
11+
* @date: 13-7-25 <br>
12+
* Time: 上午7:51 <br>
13+
*/
14+
public class RedisSchedulerTest {
15+
16+
private RedisScheduler redisScheduler;
17+
18+
@Before
19+
public void setUp() {
20+
redisScheduler = new RedisScheduler("localhost");
21+
}
22+
23+
@Test
24+
public void test() {
25+
Task task = new Task() {
26+
@Override
27+
public String getUUID() {
28+
return "1";
29+
}
30+
31+
@Override
32+
public Site getSite() {
33+
return null;
34+
}
35+
};
36+
redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task);
37+
Request poll = redisScheduler.poll(task);
38+
System.out.println(poll.getUrl());
39+
40+
}
41+
}

webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,9 @@
33
import us.codecraft.webmagic.Page;
44
import us.codecraft.webmagic.Site;
55
import us.codecraft.webmagic.Spider;
6-
import us.codecraft.webmagic.downloader.FileDownloader;
7-
import us.codecraft.webmagic.downloader.HttpClientDownloader;
86
import us.codecraft.webmagic.pipeline.FilePipeline;
97
import us.codecraft.webmagic.processor.PageProcessor;
10-
import us.codecraft.webmagic.schedular.FileCacheQueueScheduler;
8+
import us.codecraft.webmagic.scheduler.RedisScheduler;
119

1210
import java.util.List;
1311

@@ -40,9 +38,12 @@ public Site getSite() {
4038

4139
public static void main(String[] args) {
4240
Spider.create(new GlobalProcessor()).thread(10)
43-
.scheduler(new FileCacheQueueScheduler("/data/webmagic/test"))
44-
.downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader()))
45-
.pipeline(new FilePipeline("/data/webmagic/test"))
41+
.scheduler(new RedisScheduler("localhost"))
42+
.pipeline(new FilePipeline("/data/webmagic/test/"))
43+
.runAsync();
44+
Spider.create(new GlobalProcessor()).thread(10)
45+
.scheduler(new RedisScheduler("localhost"))
46+
.pipeline(new FilePipeline("/data/webmagic/test/"))
4647
.run();
4748
}
4849
}

0 commit comments

Comments
 (0)