diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..fa78956 Binary files /dev/null and b/.DS_Store differ diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..d13e9b0 --- /dev/null +++ b/.classpath @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..364472d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target/ +data/dataset.csv +data/movies.txt +data/movies.txt.gz diff --git a/.project b/.project new file mode 100644 index 0000000..1d2c2d6 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + academy-exercises + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..8dd9b1d --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,3 @@ +eclipse.preferences.version=1 +encoding//src/test/java=UTF-8 +encoding/=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..5723a0f --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,8 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 +org.eclipse.jdt.core.compiler.compliance=1.5 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.5 diff --git a/lib/commons-math3-3.2.jar b/lib/commons-math3-3.2.jar new file mode 100644 index 0000000..f8b7db2 Binary files /dev/null and b/lib/commons-math3-3.2.jar differ diff --git a/lib/guava-15.0.jar 
b/lib/guava-15.0.jar new file mode 100644 index 0000000..eb9ef8a Binary files /dev/null and b/lib/guava-15.0.jar differ diff --git a/lib/mahout-core-0.8.jar b/lib/mahout-core-0.8.jar new file mode 100644 index 0000000..001ec4c Binary files /dev/null and b/lib/mahout-core-0.8.jar differ diff --git a/lib/mahout-integration-0.8.jar b/lib/mahout-integration-0.8.jar new file mode 100644 index 0000000..9458dcf Binary files /dev/null and b/lib/mahout-integration-0.8.jar differ diff --git a/lib/mahout-math-0.8.jar b/lib/mahout-math-0.8.jar new file mode 100644 index 0000000..395009a Binary files /dev/null and b/lib/mahout-math-0.8.jar differ diff --git a/lib/slf4j-api-1.7.5.jar b/lib/slf4j-api-1.7.5.jar new file mode 100644 index 0000000..8f004d3 Binary files /dev/null and b/lib/slf4j-api-1.7.5.jar differ diff --git a/lib/slf4j-nop-1.7.5.jar b/lib/slf4j-nop-1.7.5.jar new file mode 100644 index 0000000..42f5c15 Binary files /dev/null and b/lib/slf4j-nop-1.7.5.jar differ diff --git a/pom.xml b/pom.xml index 8169ff7..95af77d 100644 --- a/pom.xml +++ b/pom.xml @@ -27,4 +27,17 @@ test + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.12.4 + + false + + + + diff --git a/readme.md b/readme.md index ce4dc89..2d8d524 100644 --- a/readme.md +++ b/readme.md @@ -4,17 +4,18 @@ This repo contains several common big data exercises. * **MovieRecommender** Uses Amazon movie reviews sample data [stanford.edu/data/web-Movies.html](http://snap.stanford.edu/data/web-Movies.html) for a simple movie recommender - - - ## Setup 1. Install the JDK 7.0 2. [Download & Install Maven](http://maven.apache.org/download.cgi) - + -Import .jar files from lib/ folder to the project +3. The data file movies.txt.gz needs to be in the data/ folder +4. Execute MovieRecommenderTest to compile and generate the dataset.csv from movies.txt.gz +5. 
Wait ## How to run tests #from the repository root mvn test + diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..a58d1bd Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/test/.DS_Store b/src/test/.DS_Store new file mode 100644 index 0000000..8ade123 Binary files /dev/null and b/src/test/.DS_Store differ diff --git a/src/test/java/.DS_Store b/src/test/java/.DS_Store new file mode 100644 index 0000000..cc10eb8 Binary files /dev/null and b/src/test/java/.DS_Store differ diff --git a/src/test/java/nearsoft/.DS_Store b/src/test/java/nearsoft/.DS_Store new file mode 100644 index 0000000..fc93bd5 Binary files /dev/null and b/src/test/java/nearsoft/.DS_Store differ diff --git a/src/test/java/nearsoft/academy/.DS_Store b/src/test/java/nearsoft/academy/.DS_Store new file mode 100644 index 0000000..f565452 Binary files /dev/null and b/src/test/java/nearsoft/academy/.DS_Store differ diff --git a/src/test/java/nearsoft/academy/bigdata/.DS_Store b/src/test/java/nearsoft/academy/bigdata/.DS_Store new file mode 100644 index 0000000..2556cee Binary files /dev/null and b/src/test/java/nearsoft/academy/bigdata/.DS_Store differ diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..ee781e1 --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,131 @@ +package nearsoft.academy.bigdata.recommendation; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +import org.apache.mahout.cf.taste.common.TasteException; +import 
org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +public class MovieRecommender { + // Recommender model + private DataModel model; + + // Totals + private int totalReviews = 0, totalProducts = 0, totalUsers = 0; + + // HashMaps for objects + private Map products = new HashMap(); + private Map users = new HashMap(); + private Map productsInverted = new HashMap(); + + public MovieRecommender(String path) throws IOException { + // Get file + FileInputStream rawFile = new FileInputStream(path); + // Decompress file + GZIPInputStream decompressedFile = new GZIPInputStream(rawFile); + // Pass to stream + InputStreamReader streamFile = new InputStreamReader(decompressedFile); + + // Read streamed file + BufferedReader txtFile = new BufferedReader(streamFile); + // Create CSV file + FileWriter fileWriter = new FileWriter(System.getProperty("user.dir")+"/data/dataset.csv"); + + + String[] requiredFields = {"product/productId:", "review/userId:", "review/score:"}; + String productId = ""; + String userId = ""; + String score = ""; + String row; + while ((row = txtFile.readLine()) != null) { + + if(row.contains(requiredFields[0])) { + productId = row.split(" ")[1]; + + if (this.products.get(productId) == null) { + this.totalProducts++; + this.products.put(productId, this.totalProducts); + this.productsInverted.put(this.totalProducts, productId); + } + } else if(row.contains(requiredFields[1])) { + userId = row.split(" 
")[1]; + + if (this.users.get(userId) == null) { + this.totalUsers++; + this.users.put(userId, this.totalUsers); + } + } else if(row.contains(requiredFields[2])) { + score = row.split(" ")[1]; + this.totalReviews++; + } + + if ((productId != "") && (userId != "") && (score != "")) { + fileWriter.write( + this.users.get(userId) + "," + + this.products.get(productId) + "," + + score + "\n" + ); + productId = ""; + score = ""; + userId = ""; + + } + + } + fileWriter.close(); + txtFile.close(); + System.out.println("ALL OK"); + } + + public List getRecommendationsForUser(String userId) throws IOException, TasteException { + this.model = new FileDataModel(new File("data/dataset.csv")); + + UserSimilarity simalirity = new PearsonCorrelationSimilarity(this.model); + UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, simalirity, this.model); + UserBasedRecommender recommender = new GenericUserBasedRecommender(this.model, neighborhood, simalirity); + + List recommendations = new ArrayList(); + + long user = users.get(userId); + + List recommendationsItems = recommender.recommend(user, 3); + + for (RecommendedItem recommendation : recommendationsItems) { + int productId = (int) recommendation.getItemID(); + recommendations.add(productsInverted.get(productId)); + } + + return recommendations; + } + + public int getTotalReviews() { + return this.totalReviews; + } + + public int getTotalProducts() { + return this.totalProducts; + } + + public int getTotalUsers() { + return this.totalUsers; + } + +} diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..60c6e10 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -15,7 +15,7 @@ public class MovieRecommenderTest { public void testDataInfo() throws 
IOException, TasteException { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + MovieRecommender recommender = new MovieRecommender("data/movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers());