Making AgePredict work

smadha · smadha · commit 5f2b4f20f9f5 · 2017-04-19T16:17:31.000-07:00
diff --git a/README.md b/README.md
@@ -69,6 +69,13 @@ Note: Each document must be followed by an empty line to be detected as a separa
 Usage: bin/authorage AgeClassify model < documents
 ```
 
+```shell
+Usage: bin/authorage AgePredict ./model/classify-unigram.bin ./model/regression-global.bin  data/sample_test.txt
+```
+
+# Downloads
+For AgePredict to work, you need to download `en-pos-maxent.bin`, `en-sent.bin`, and `en-token.bin` from [http://opennlp.sourceforge.net/models-1.5/](http://opennlp.sourceforge.net/models-1.5/) into `model/opennlp/`.
+
 # Contributors
 * Joey Hong, Caltech, CA
 
diff --git a/bin/authorage b/bin/authorage
@@ -17,7 +17,7 @@
 #   specific language governing permissions and limitations
 #   under the License.
 
-export SPARK_HOME="spark-2.0.0-bin-hadoop2.7"
+# export SPARK_HOME="spark-2.0.0-bin-hadoop2.7"
 
 # Created JAR Application
 export JAR="target/age-predictor-1.0-SNAPSHOT-jar-with-dependencies.jar"
diff --git a/data/sample_test.txt b/data/sample_test.txt
@@ -0,0 +1,3 @@
+Can AI really predict my age through what I wrote?
+
+That will be so cool of AI
diff --git a/src/main/java/gov/nasa/jpl/ml/cmdline/spark/authorage/AgePredictTool.java b/src/main/java/gov/nasa/jpl/ml/cmdline/spark/authorage/AgePredictTool.java
@@ -19,6 +19,7 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -27,7 +28,8 @@
 import org.apache.spark.api.java.function.VoidFunction;
 import org.apache.spark.ml.feature.CountVectorizerModel;
 import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.ml.linalg.SparseVector;
+import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.mllib.regression.LassoModel;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -54,115 +56,112 @@
 /**
  * TODO: Documentation
  */
-public class AgePredictTool extends BasicCmdLineTool {
-    
-    @Override
-    public String getShortDescription() {
-	return "age predictor";
-    }
-    
-    @Override
-	public String getHelp() {
-	return "Usage: " + CLI.CMD + " " + getName() + " [MaxEntModel] RegressionModel Documents";
-    }
-    
-    @Override
-    public void run(String[] args) {
-	AgePredictModel model = null;
-	AgeClassifyME classify = null;
-	if (args.length == 3) {
-	    try {
-		AgeClassifyModel classifyModel = new AgeClassifyModel(new File(args[0]));
-
-		classify = new AgeClassifyME(classifyModel);
-		model = AgePredictModel.readModel(new File(args[1]));
-	    } catch (Exception e) {
-		e.printStackTrace();
-		return;
-	    }
-	}
-	else if (args.length == 2) {
-	    try {
-		model = AgePredictModel.readModel(new File(args[0]));
-	    } catch (Exception e) {
-		e.printStackTrace();
-		return;
-	    }
+public class AgePredictTool extends BasicCmdLineTool implements Serializable {
+
+	@Override
+	public String getShortDescription() {
+		return "age predictor";
 	}
-	else {
-	    System.out.println(getHelp());
-	    return;
+
+	@Override
+	public String getHelp() {
+		return "Usage: " + CLI.CMD + " " + getName() + " [MaxEntModel] RegressionModel Documents";
 	}
-	
-	ObjectStream<String> documentStream;
-	List<Row> data = new ArrayList<Row>();
-	
-	SparkSession spark = SparkSession
-            .builder()
-            .appName("AgePredict")
-            .getOrCreate();
-	
-	try {
-	    documentStream = new ParagraphStream(
-	        new PlainTextByLineStream(new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
-	    
-	    String document;
-	    FeatureGenerator[] featureGenerators = model.getContext().getFeatureGenerators();
-	    while ((document = documentStream.read()) != null) {
-	        String[] tokens = model.getContext().getTokenizer().tokenize(document);
-
-		double prob[] = classify.getProbabilities(tokens);
-		String category = classify.getBestCategory(prob);
-		
-		Collection<String> context = new ArrayList<String>();
-
-		for (FeatureGenerator featureGenerator : featureGenerators) {
-		    Collection<String> extractedFeatures =
-			featureGenerator.extractFeatures(tokens);
-		    context.addAll(extractedFeatures);
+
+	@Override
+	public void run(String[] args) {
+		AgePredictModel model = null;
+		AgeClassifyME classify = null;
+		if (args.length == 3) {
+			try {
+				AgeClassifyModel classifyModel = new AgeClassifyModel(new File(args[0]));
+
+				classify = new AgeClassifyME(classifyModel);
+				model = AgePredictModel.readModel(new File(args[1]));
+			} catch (Exception e) {
+				e.printStackTrace();
+				return;
+			}
+		} else if (args.length == 2) {
+			try {
+				model = AgePredictModel.readModel(new File(args[0]));
+			} catch (Exception e) {
+				e.printStackTrace();
+				return;
+			}
+		} else {
+			System.out.println(getHelp());
+			return;
 		}
-		
-		if (category != null) {
-		    for (int i = 0; i < tokens.length / 18; i++) {
-			context.add("cat="+ category);
-		    }
+
+		ObjectStream<String> documentStream;
+		List<Row> data = new ArrayList<Row>();
+
+		SparkSession spark = SparkSession.builder().appName("AgePredict").getOrCreate();
+				
+		try {
+			System.out.println("Please enter your text separted by newline. When done press ctrl+d to terminate system input");
+			documentStream = new ParagraphStream(
+					new PlainTextByLineStream(new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+
+			String document;
+			FeatureGenerator[] featureGenerators = model.getContext().getFeatureGenerators();
+			while ((document = documentStream.read()) != null) {
+				String[] tokens = model.getContext().getTokenizer().tokenize(document);
+
+				double prob[] = classify.getProbabilities(tokens);
+				String category = classify.getBestCategory(prob);
+
+				Collection<String> context = new ArrayList<String>();
+
+				for (FeatureGenerator featureGenerator : featureGenerators) {
+					Collection<String> extractedFeatures = featureGenerator.extractFeatures(tokens);
+					context.addAll(extractedFeatures);
+				}
+
+				if (category != null) {
+					for (int i = 0; i < tokens.length / 18; i++) {
+						context.add("cat=" + category);
+					}
+				}
+				if (context.size() > 0) {
+					data.add(RowFactory.create(document, context.toArray()));
+				}
+				
+			}
+		} catch (IOException e) {
+			e.printStackTrace();
+			CmdLineUtil.handleStdinIoError(e);
 		}
-		if (context.size() > 0) {
-		    data.add(RowFactory.create(document, context.toArray()));
-		}	
-	    } 
-	} catch (IOException e) {
-		CmdLineUtil.handleStdinIoError(e);
+				
+		StructType schema = new StructType(
+				new StructField[] { new StructField("document", DataTypes.StringType, false, Metadata.empty()),
+						new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) });
+
+		Dataset<Row> df = spark.createDataFrame(data, schema);
+
+		CountVectorizerModel cvm = new CountVectorizerModel(model.getVocabulary()).setInputCol("text")
+				.setOutputCol("feature");
+
+		Dataset<Row> eventDF = cvm.transform(df);
+
+		Normalizer normalizer = new Normalizer().setInputCol("feature").setOutputCol("normFeature").setP(1.0);
+
+		JavaRDD<Row> normEventDF = normalizer.transform(eventDF).javaRDD();
+
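+		// The ml.feature.Normalizer output is an org.apache.spark.ml.linalg.SparseVector, which cannot be cast to org.apache.spark.mllib.linalg.Vector; rebuild it with mllib Vectors.sparse before calling LassoModel.predict.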
+
+		final LassoModel linModel = model.getModel();
+		normEventDF.foreach(new VoidFunction<Row>() {
+			public void call(Row event) {
+				SparseVector sp = (SparseVector) event.getAs("normFeature");
+				
+				double prediction = linModel.predict(Vectors.sparse(sp.size(), sp.indices(), sp.values()));
+				System.out.println((String) event.getAs("document"));
+				System.out.println("Prediction: " + prediction);
+			}
+		});
+
+		spark.stop();
 	}
-	StructType schema = new StructType(new StructField [] {
-		new StructField("document", DataTypes.StringType, false, Metadata.empty()),
-		new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
-	    });
-	
-	Dataset<Row> df = spark.createDataFrame(data, schema);
-	
-	CountVectorizerModel cvm = new CountVectorizerModel(model.getVocabulary())
-	    .setInputCol("text")
-	    .setOutputCol("feature");
-	
-	Dataset<Row> eventDF = cvm.transform(df);
-	
-	Normalizer normalizer = new Normalizer()
-            .setInputCol("feature")
-            .setOutputCol("normFeature")
-            .setP(1.0);
-
-        JavaRDD<Row> normEventDF= normalizer.transform(eventDF).javaRDD();
-	
-	final LassoModel linModel = model.getModel();
-	normEventDF.foreach( new VoidFunction<Row>() {
-		public void call(Row event) {
-		    double prediction = linModel.predict((Vector) event.getAs("normFeature"));
-		    System.out.println((String) event.getAs("document"));
-		    System.out.println("Prediction: "+ prediction);
-		}
-	    });
-	
-	spark.stop();
-    }
 }

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Can AI really predict my age through what I wrote?`
	`2`	`+`
	`3`	`+That will be so cool of AI`