diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0910cdd --- /dev/null +++ b/.gitignore @@ -0,0 +1,147 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/vscode,intellij,java +# Edit at https://www.toptal.com/developers/gitignore?templates=vscode,intellij,java + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +### Java ### +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.nar +*.ear +*.zip +*.tar.gz +*.rar +*.gz + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + +### vscode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +target +.classpath +.project +.settings/ + +# End of https://www.toptal.com/developers/gitignore/api/vscode,intellij,java \ No newline at end of file diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..276cb0f --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,139 @@ +package nearsoft.academy.bigdata.recommendation; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +class MovieRecommender { + String filePath; + int totalUsers; + int totalReviews; + int totalProducts; + Hashtable users; + Hashtable products; + Hashtable productsById; + + MovieRecommender(String fileUrl) throws IOException { + this.filePath = fileUrl; + this.totalUsers = 0; + this.totalReviews = 0; + this.totalProducts = 0; + this.users = new Hashtable(); + this.products = new Hashtable(); + this.productsById = new Hashtable(); + + processFile(); + } + + public void processFile() throws IOException { + FileInputStream file = new FileInputStream(this.filePath); + GZIPInputStream gzipInput = new GZIPInputStream(file); + Reader decoder = new InputStreamReader(gzipInput); + BufferedReader reader = new BufferedReader(decoder); + + Pattern usersRegex = Pattern.compile("review\\/userId: ([\\D\\d]+)"); + Pattern reviewsRegex = Pattern.compile("review\\/score: ([\\D\\d]+)"); + Pattern productsRegex = Pattern.compile("product\\/productId: ([\\D\\d]+)"); + + Matcher match; + boolean matches; + + String currentLine = reader.readLine(); + + FileWriter writer = new FileWriter("movies.csv"); + + String userId = ""; + String reviewId = ""; + String productId = ""; + + while (currentLine != null) { + + match = usersRegex.matcher(currentLine); + matches = match.matches(); + + if (matches) { + userId = currentLine.split(" ")[1]; + + if (users.get(userId) == null) { + this.totalUsers++; + users.put(userId, this.totalUsers); + } + } + + match = reviewsRegex.matcher(currentLine); + matches = match.matches(); + + if (matches) { + reviewId = currentLine.split(" ")[1]; + this.totalReviews++; + } + + match = productsRegex.matcher(currentLine); + matches = match.matches(); + + if (matches) { + productId = currentLine.split(" ")[1]; + + if (products.get(productId) == null) { + this.totalProducts++; + products.put(productId, this.totalProducts); + productsById.put(this.totalProducts, productId); + } + } + + if (userId != "" && reviewId != "" && productId != "") { + writer.write(users.get(userId) + "," + products.get(productId) + "," + reviewId + "\n"); + userId = ""; + reviewId = ""; + productId = ""; + } + + currentLine = reader.readLine(); + } + + reader.close(); + writer.close(); + } + + public int getTotalUsers() { return this.totalUsers; } + + public int getTotalReviews() { return this.totalReviews; } + + public int getTotalProducts() { return this.totalProducts; } + + public List getRecommendationsForUser(String userId) throws IOException, TasteException { + DataModel model = new FileDataModel(new File("movies.csv")); + UserSimilarity similarity = new PearsonCorrelationSimilarity(model); + UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, similarity, model); + UserBasedRecommender recommender = new GenericUserBasedRecommender(model, neighborhood, similarity); + + List recommendations = new ArrayList(); + + for (RecommendedItem recommendation : recommender.recommend(users.get(userId), 3)) { + recommendations.add(productsById.get((int)(recommendation.getItemID()))); + } + + return recommendations; + } +} \ No newline at end of file diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..fe7874d 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -15,7 +15,7 @@ public class MovieRecommenderTest { public void testDataInfo() throws IOException, TasteException { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + MovieRecommender recommender = new MovieRecommender("movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers());