diff --git a/pom.xml b/pom.xml index 8169ff7..ebb2a48 100644 --- a/pom.xml +++ b/pom.xml @@ -12,19 +12,46 @@ UTF-8 + 11 + 11 + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 11 + 11 + UTF-8 + + + + + - - org.apache.mahout - mahout-core - 0.9 - - - junit - junit - 4.7 - test - - + + org.apache.mahout + mahout-core + 0.9 + + + junit + junit + 4.7 + test + + + org.slf4j + slf4j-api + 2.0.0-alpha5 + + + org.slf4j + slf4j-simple + 2.0.0-alpha5 + + diff --git a/readme.md b/readme.md index ce4dc89..4397f27 100644 --- a/readme.md +++ b/readme.md @@ -9,7 +9,7 @@ This repo contains several common big data exercises. ## Setup -1. Install the JDK 7.0 +1. Install the JDK 11.0 2. [Download & Install Maven](http://maven.apache.org/download.cgi) diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..8b51868 --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,175 @@ +package nearsoft.academy.bigdata.recommendation; + +import java.io.*; +import java.util.Hashtable; +import java.util.zip.GZIPInputStream; +import java.util.List; +import java.util.ArrayList; + +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; + +public class MovieRecommender { + String dataPath; + int totalUsers; + int totalProducts; + int totalReviews; + + // Keep track of users and movies in the dataset + Hashtable users; + Hashtable products2Index; + + // For querying movies + Hashtable index2Products; + + DataModel model; + UserSimilarity similarity; + UserNeighborhood neighborhood; + UserBasedRecommender recommender; + + + MovieRecommender(String dataPath) throws IOException { + this.dataPath = dataPath; + this.totalUsers = 0; + this.totalProducts = 0; + this.totalReviews = 0; + + this.users = new Hashtable(); + this.index2Products = new Hashtable(); + this.products2Index = new Hashtable(); + + try { + dataPreprocess(); + loadDataModel(); + } catch (Exception e) { + e.printStackTrace(); + } + + + } + + public int getTotalReviews() { + return this.totalReviews; + } + + public int getTotalUsers() { + return this.totalUsers; + } + + public int getTotalProducts() { + return this.totalProducts; + } + + /** + * Get the list of the recommendations + */ + public List getRecommendationsForUser(String user) throws TasteException { + List recommendations = new ArrayList(); + + for (RecommendedItem recommendation : this.recommender.recommend(users.get(user), 3)) { + //System.out.println(recommendation.getItemID()); + long rec = recommendation.getItemID(); + int rec_index = (int) rec; + String rec_id = this.index2Products.get(rec_index); + recommendations.add(rec_id); + + } + return recommendations; + } + + /** + * Load .csv file + */ + private void loadDataModel() throws IOException, TasteException { + this.model = new FileDataModel(new File("data/movies.csv")); + this.similarity = new PearsonCorrelationSimilarity(this.model); + this.neighborhood = new ThresholdUserNeighborhood(0.1, this.similarity, this.model); + this.recommender = new GenericUserBasedRecommender(this.model, this.neighborhood, this.similarity); + } + + + /** + * Extract data from .gz, iterate through .txt and create a .csv + */ + private void dataPreprocess() throws IOException { + // Extract .gz and open .txt file + InputStream file = new FileInputStream(this.dataPath); + InputStream gzStream = new GZIPInputStream(file); + Reader read = new InputStreamReader(gzStream); + + // Read .txt file + BufferedReader txtFile = new BufferedReader(read); + + // Create .csv + BufferedWriter csvFile = new BufferedWriter(new FileWriter("data/movies.csv")); + + String productId = ""; + String score = ""; + String userId = ""; + + String line = txtFile.readLine(); + + while (line != null) { + //System.out.println(line); + + if (line.contains("product/productId")) { + productId = line.split(" ")[1]; + + if (this.products2Index.get(productId) == null) { + this.totalProducts++; + this.index2Products.put(this.totalProducts, productId); + this.products2Index.put(productId, this.totalProducts); + //System.out.println("Product: " + productId); + } + } else if (line.contains("review/userId:")) { + userId = line.split(" ")[1]; + + if (this.users.get(userId) == null) { + this.totalUsers++; + this.users.put(userId, this.totalUsers); + //System.out.println("User: " + userId); + } + } else if (line.contains("review/score:")) { + score = line.split(" ")[1]; + this.totalReviews++; + //System.out.println("Review: " + score); + + } + + + // If we have all the fields for one review + if ((userId != "") && (productId != "") && (score != "")) { + csvFile.write( + this.users.get(userId) + "," + + this.products2Index.get(productId) + "," + + score + "\n" + ); + // System.out.println( + // this.users.get(userId) + ", " + + // this.products2Index.get(productId) + ", " + + // productId + ", " + ", " + + // this.totalProducts + ", " + + // score + "\n" + // ); + + productId = ""; + score = ""; + userId = ""; + } + + line = txtFile.readLine(); + } + txtFile.close(); + csvFile.close(); + System.out.println("Everything is ok!"); + + } +} \ No newline at end of file diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..60c6e10 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -15,7 +15,7 @@ public class MovieRecommenderTest { public void testDataInfo() throws IOException, TasteException { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + MovieRecommender recommender = new MovieRecommender("data/movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers());