Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions helpers/java/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.scrapy.scrapystreaming</groupId>
<artifactId>scrapystreaming</artifactId>
<version>0.1-SNAPSHOT</version>

<dependencies>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.7</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
18 changes: 18 additions & 0 deletions helpers/java/scrapystreaming.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_5" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.7" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
</component>
</module>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package org.scrapy.scrapystreaming;


import org.scrapy.scrapystreaming.core.Callback;
import org.scrapy.scrapystreaming.core.SpiderException;
import org.scrapy.scrapystreaming.messages.FromResponseMessage;
import org.scrapy.scrapystreaming.messages.FromResponseRequestMessage;
import org.scrapy.scrapystreaming.utils.Utils;

public class FromResponseRequest extends FromResponseRequestMessage{

public FromResponseRequest(String url, FromResponseMessage from_response_request) {
this.url = url;
this.from_response_request = from_response_request;
}

public void open(Callback callback) throws SpiderException {
String id = this.id;
if (id == null)
id = callback.toString();
this.id = id;

Utils.responseMapping.put(id, callback);
sendMessage();
}
}
69 changes: 69 additions & 0 deletions helpers/java/src/main/java/org/scrapy/scrapystreaming/Logger.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package org.scrapy.scrapystreaming;


import org.scrapy.scrapystreaming.core.SpiderException;
import org.scrapy.scrapystreaming.messages.LogMessage;

/**
* Helper class to handle log messages
*/
public class Logger {

public enum LEVEL {
CRITICAL, ERROR, WARNING, INFO, DEBUG
}

/**
* Print a log message in the scrapy streaming logger
* @param message message
* @param level log level
*/
public static void log(String message, LEVEL level) {
try {
new LogMessage(message, level.name()).sendMessage();
} catch (SpiderException e) {
// logger doesn't validate data
}
}

/**
* Print a critical message in the scrapy streaming logger
* @param message message
**/
public static void critical(String message) {
log(message, LEVEL.CRITICAL);
}

/**
* Print a error message in the scrapy streaming logger
* @param message message
**/
public static void error(String message) {
log(message, LEVEL.ERROR);
}

/**
* Print a warning in the scrapy streaming logger
* @param message message
**/
public static void warning(String message) {
log(message, LEVEL.WARNING);
}

/**
* Print a info message in the scrapy streaming logger
* @param message message
**/
public static void info(String message) {
log(message, LEVEL.INFO);
}

/**
* Print a debug message in the scrapy streaming logger
* @param message message
**/
public static void debug(String message) {
log(message, LEVEL.DEBUG);
}
}

37 changes: 37 additions & 0 deletions helpers/java/src/main/java/org/scrapy/scrapystreaming/Request.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package org.scrapy.scrapystreaming;


import org.scrapy.scrapystreaming.core.Callback;
import org.scrapy.scrapystreaming.messages.RequestMessage;
import org.scrapy.scrapystreaming.core.SpiderException;
import org.scrapy.scrapystreaming.utils.Utils;

/**
* Open a new request
*/
public class Request extends RequestMessage {

/**
* Creates the request object, passing its url
* @param url request URL
*/
public Request(String url) {
this.url = url;
}

/**
* Open the request given its callback.
* The callback function will be called with the response as soon as it's available.
* @param callback response callback
* @throws SpiderException
*/
public void open(Callback callback) throws SpiderException {
String id = this.id;
if (id == null)
id = callback.toString();
this.id = id;
Utils.responseMapping.put(id, callback);

sendMessage();
}
}
79 changes: 79 additions & 0 deletions helpers/java/src/main/java/org/scrapy/scrapystreaming/Spider.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package org.scrapy.scrapystreaming;

import org.scrapy.scrapystreaming.core.Callback;
import org.scrapy.scrapystreaming.core.CommunicationProtocol;
import org.scrapy.scrapystreaming.messages.*;
import org.scrapy.scrapystreaming.core.SpiderException;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;


/**
* This class lets you create the External Spider and run / stop it.
*/
public abstract class Spider implements Callback {
public String name = "ExternalSpider";
public List<String> start_urls = new ArrayList<String>(0);
public List<String> allowed_domains;
public Map custom_settings;

private SpiderMessage spiderMessage;
private boolean isRunning = false;
private CommunicationProtocol protocol;

/**
* Start the Spider execution
*
* @throws SpiderException
*/
public final void start() throws SpiderException {
if (isRunning)
throw new SpiderException("Spider already running");

spiderMessage = new SpiderMessage();
spiderMessage.name = name;
spiderMessage.start_urls = start_urls;
spiderMessage.allowed_domains = allowed_domains;
spiderMessage.custom_settings = custom_settings;
spiderMessage.sendMessage();

protocol = new CommunicationProtocol(this);
protocol.start();

isRunning = true;
}

/**
* Stop the spider execution, sending the close message.
* The process will be killed as soon as Scrapy Streaming receives this message.
*/
public void close() {
try {
new CloseMessage().sendMessage();
} catch (SpiderException e) {
e.printStackTrace();
}
}

/**
* The callback of initial_urls responses.
*
* @param response response data
*/
public abstract void parse(ResponseMessage response);

/**
* This method is called when Scrapy raises an exception and sends the exception message.
* If you want to analyze the exception, or just ignore the problem, override this function.
*
* @param exception exception message sent by Scrapy Streaming
* @throws SpiderException
*/
public void onException(ExceptionMessage exception) throws SpiderException {
throw new SpiderException("Scrapy raised an exception. Message sent: " + exception.received_message +
"; Exception message: " + exception.exception);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package org.scrapy.scrapystreaming.core;


import org.scrapy.scrapystreaming.messages.ResponseMessage;


/**
* Represents a callback function to handle a response
*/
public interface Callback {

/**
* Method to handle to response content
* @param response
*/
public void parse(ResponseMessage response);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.scrapy.scrapystreaming.core;


import org.scrapy.scrapystreaming.Spider;
import org.scrapy.scrapystreaming.messages.*;
import org.scrapy.scrapystreaming.utils.Utils;

import java.io.*;

public class CommunicationProtocol extends Thread {
BufferedReader in;
Spider spider;

public CommunicationProtocol(Spider spider) {
this.spider = spider;
in = new BufferedReader(new InputStreamReader(System.in));
}

@Override
public void run() {
while (true) {
try {
String line = in.readLine();
line = line.trim();
if (line.length() > 0) {
ReceivedMessage msg = Utils.gson.fromJson(line, ReceivedMessage.class);
messageReceived(msg, line);
}
} catch (IOException e) {
System.err.println("There is a problem in the communication channel: " + e.getMessage());
} catch (SpiderException e) {
System.err.println(e.getMessage());
}
}
}

protected void messageReceived(ReceivedMessage msg, String line) throws SpiderException {
if (msg.type.equals("ready")) {
ReadyMessage status = Utils.gson.fromJson(line, ReadyMessage.class);
onStatus(status);
} else if (msg.type.equals("response")) {
ResponseMessage response = Utils.gson.fromJson(line, ResponseMessage.class);
onResponse(response);
} else if (msg.type.equals("exception")) {
ExceptionMessage exception = Utils.gson.fromJson(line, ExceptionMessage.class);
onException(exception);
} else if (msg.type.equals("error")) {
ErrorMessage error = Utils.gson.fromJson(line, ErrorMessage.class);
onError(error);
} else {
throw new SpiderException("Invalid message type: " + msg.type);
}
}

protected void onStatus(ReadyMessage status) throws SpiderException {
if (!status.status.equals("ready")) {
throw new SpiderException("There is a problem in the communication channel. Received status: " + status.status);
}
}

protected void onResponse(ResponseMessage response) {
if (response.id.equals("parse")) {
spider.parse(response);
} else {
Utils.responseMapping.get(response.id).parse(response);
}
}

protected void onException(ExceptionMessage exception) throws SpiderException {
spider.onException(exception);
}

protected void onError(ErrorMessage error) throws SpiderException {
throw new SpiderException("Spider error. Message sent: " + error.received_message +
"; Error details: " + error.details);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.scrapy.scrapystreaming.core;


public class SpiderException extends Exception {
public SpiderException(){

}

public SpiderException(String message) {
super(message);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.scrapy.scrapystreaming.messages;


import java.util.List;

public class CloseMessage extends Message {
public final String type = "close";

public List<String> validator() {
return null;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.scrapy.scrapystreaming.messages;


import java.util.List;

public class ErrorMessage extends Message {
public final String type = "error";
public String received_message;
public String details;

public List<String> validator() {
return null;
}
}
Loading