diff --git a/pom.xml b/pom.xml index 8169ff7..d96233e 100644 --- a/pom.xml +++ b/pom.xml @@ -12,6 +12,8 @@ UTF-8 + 11 + 11 @@ -26,5 +28,66 @@ 4.7 test + + org.apache.logging.log4j + log4j-api + 2.13.3 + + + org.apache.logging.log4j + log4j-core + 2.13.3 + + + org.apache.logging.log4j + log4j-slf4j-impl + 2.13.3 + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..0552adb --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,104 @@ +package nearsoft.academy.bigdata.recommendation; + +import java.io.File; +import java.io.FileWriter; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.List; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +public class MovieRecommender { + private long totalReviews; + private long totalUsers; + private long totalMovies; + private UserBasedRecommender mahoutRecommender; + Hashtable users = new Hashtable(); + Hashtable movies = new Hashtable(); + Hashtable hashToMovie = new Hashtable(); + + public MovieRecommender(String pathToRecommendations) { + this.totalReviews = 0; + this.totalUsers = 0; + this.totalMovies = 0; + users = new Hashtable(); + movies = new Hashtable(); + hashToMovie = new Hashtable(); + + try { + ReviewParser parser = new ReviewParser(pathToRecommendations); + String pathToCvs = pathToRecommendations.split("\\.")[0] + ".cvs"; + + FileWriter cvsWriter = new FileWriter(pathToCvs); + Review review = parser.getReview(); + while (review != null) { + totalReviews += 1; + + if (users.putIfAbsent(review.userId, totalUsers) == null) { + totalUsers += 1; + } + if (movies.putIfAbsent(review.movieId, totalMovies) == null) { + hashToMovie.put(totalMovies, review.movieId); + totalMovies += 1; + } + + long hashUser = users.get(review.userId); + long hashMovie = movies.get(review.movieId); + String cvsLine = String.format("%d,%d,%.1f", hashUser, hashMovie, review.rating); + cvsWriter.append(cvsLine + "\n"); + + review = parser.getReview(); + } + + cvsWriter.close(); + // Now we create the recommender. + DataModel model = new FileDataModel(new File(pathToCvs)); + UserSimilarity similarity = new PearsonCorrelationSimilarity(model); + UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, similarity, model); + mahoutRecommender = new GenericUserBasedRecommender(model, neighborhood, similarity); + + } catch (Exception e) { + e.printStackTrace(); + System.out.println(e.getMessage() + totalReviews); + } + } + + public long getTotalReviews() { + return totalReviews; + } + + public long getTotalProducts() { + return totalMovies; + } + + public long getTotalUsers() { + return totalUsers; + } + + public List getRecommendationsForUser(String user) { + long userHash = users.get(user); + List moviesId = new ArrayList(); + + try { + List recommendations = mahoutRecommender.recommend(userHash, 3); + for (RecommendedItem item : recommendations) { + Long movieHash = item.getItemID(); + moviesId.add(hashToMovie.get(movieHash)); + } + + } catch (TasteException e) { + e.printStackTrace(); + } + + return moviesId; + } +} diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/Review.java b/src/test/java/nearsoft/academy/bigdata/recommendation/Review.java new file mode 100644 index 0000000..a895296 --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/Review.java @@ -0,0 +1,13 @@ +package nearsoft.academy.bigdata.recommendation; + +public class Review { + public String userId; + public String movieId; + public float rating; + + public Review(String userId, String movieId, float rating) { + this.userId = userId ; + this.movieId = movieId ; + this.rating = rating ; + } +} diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/ReviewParser.java b/src/test/java/nearsoft/academy/bigdata/recommendation/ReviewParser.java new file mode 100644 index 0000000..233ed25 --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/ReviewParser.java @@ -0,0 +1,137 @@ +package nearsoft.academy.bigdata.recommendation; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.List; + +public class ReviewParser { + private BufferedReader buffer; + + public ReviewParser(String pathToRecommendations) throws Exception { + this.buffer = new BufferedReader(new FileReader(pathToRecommendations)); + } + + public Review getReview() throws Exception { + List lines = new ArrayList(); + String line = ""; + String userId = ""; + String movieId = ""; + String ratingString = "" ; + float rating = -1; + + line = buffer.readLine(); + + if (line == null) + return null; + + while (!line.isEmpty()) { + lines.add(line); + line = buffer.readLine(); + + if( line.isEmpty() && lines.size() < 8 ) + line = buffer.readLine(); + } + + userId = lines.stream().filter(title -> title.contains("review/userId")).findFirst().get().split(" ")[1]; + movieId = lines.stream().filter(title -> title.contains("product/productId")).findFirst().get() + .split(" ")[1]; + + ratingString = lines.stream().filter(title -> title.contains("review/score")).findFirst().get() + .split(" ")[1]; + + rating = Float.parseFloat(ratingString); + + return new Review(userId, movieId, rating); + } +} + +// public class ReviewParser { +// private GZIPInputStream stream; +// private int bufferLen = 4098; +// private String nextReviewStart = ""; + +// public ReviewParser(FileInputStream compressed) throws IOException { +// this.stream = new GZIPInputStream(compressed); +// } + +// public List getReview() throws IOException { +// List reviews = new ArrayList(); +// byte[] buffer = new byte[bufferLen]; +// boolean reviewEnd = false; +// String reviewText = ""; +// int bytesParsed ; + +// while ((bytesParsed = stream.read(buffer)) > 0) { + +// } + +// while (!reviewEnd) { +// int bytesParsed = stream.read(buffer, 0, bufferLen); + +// if (bytesParsed == -1) +// reviewEnd = true; + +// String line = new String(Arrays.asList(buffer).stream().filter(predicate)); +// reviewText = reviewText + buffer.stream().filter; +// String[] splitText = reviewText.split("\n\n"); + +// if (splitText.length > 1 || reviewEnd) { +// int i; +// for (i = 0; i < splitText.length - 1; ++i) { +// if (i == 0) +// reviewText = nextReviewStart + splitText[i]; + +// else +// reviewText = splitText[i]; + +// List reviewElements = Arrays.asList(reviewText.split("\n")); +// String movieId = reviewElements.stream().filter(title -> +// title.contains("product/productId")).findFirst() +// .get().split(" ")[1]; +// String userId = reviewElements.stream().filter(title -> +// title.contains("review/userId")).findFirst().get() +// .split(" ")[1]; + +// float rating; +// String ratingString = reviewElements.stream().filter(title -> +// title.contains("review/score")).findFirst() +// .get().split(" ")[1]; + +// try { +// rating = Float.parseFloat(ratingString); +// } catch (NumberFormatException e) { +// rating = getFloatFromMaybeCorrupted(ratingString); +// } + +// reviews.add(new Review(userId, movieId, rating)); + +// } + +// nextReviewStart = splitText[i]; +// reviewEnd = true; +// } +// } + +// return reviews; +// } + +// private float getFloatFromMaybeCorrupted(String corruptedFloat) { +// String goodString = ""; +// for (char c : corruptedFloat.toCharArray()) { +// if ((c >= '0' && c <= '9') || c == '.') { +// goodString += c; +// } +// } + +// float goodFloat ; +// try { +// goodFloat = Float.parseFloat(goodString); +// } catch (Exception e) { +// // Probably text +// goodFloat = -1 ; +// } + +// return goodFloat ; +// } +// } \ No newline at end of file diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/readme.md b/src/test/java/nearsoft/academy/bigdata/recommendation/readme.md new file mode 100644 index 0000000..9640598 --- /dev/null +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/readme.md @@ -0,0 +1,7 @@ +# Important + +This code takes a `.txt` file; that means that you may need to decompress before the `.gz` of the movies. + +The Maven `pom.xml` file also changed, so it would be a good idea to run `mvn compile` before running the tests. + +Also, change the path to the `.txt` in the test before running. \ No newline at end of file