def spearman(n_x, yr, min_support):
    """Compute the Spearman correlation coefficient between all pairs of
    users (or items).

    Only **common** users (or items) are taken into account. The Spearman
    correlation coefficient can be seen as a non-parametric Pearson
    similarity: it is the Pearson correlation computed on *ranks* instead of
    raw ratings. Writing :math:`k_{ui}` for the rank of :math:`r_{ui}` among
    all of user :math:`u`'s ratings, it is defined as:

    .. math ::
        \\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
        (k_{ui} - \\mu_u) \\cdot (k_{vi} - \\mu_{v})} {\\sqrt{\\sum\\limits_{i
        \\in I_{uv}} (k_{ui} - \\mu_u)^2} \\cdot \\sqrt{\\sum\\limits_{i \\in
        I_{uv}} (k_{vi} - \\mu_{v})^2} }

    or

    .. math ::
        \\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
        (k_{ui} - \\mu_i) \\cdot (k_{uj} - \\mu_{j})} {\\sqrt{\\sum\\limits_{u
        \\in U_{ij}} (k_{ui} - \\mu_i)^2} \\cdot \\sqrt{\\sum\\limits_{u \\in
        U_{ij}} (k_{uj} - \\mu_{j})^2} }

    depending on the ``user_based`` field of ``sim_options`` (see
    :ref:`similarity_measures_configuration`).

    Note: if there are no common users or items, similarity will be 0 (and
    not -1). Likewise, if one of the two rank vectors is constant on the
    common ys (zero variance), the coefficient is undefined and reported
    as 0.

    For details on the Spearman coefficient, see `Wikipedia
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`__.
    """

    # number of common ys for each pair
    freq = np.zeros((n_x, n_x), int)
    # sum (k_xy * k_x'y) for common ys
    prods = np.zeros((n_x, n_x), np.double)
    # sum (k_xy ^ 2) for common ys
    sqi = np.zeros((n_x, n_x), np.double)
    # sum (k_x'y ^ 2) for common ys
    sqj = np.zeros((n_x, n_x), np.double)
    # sum (k_xy) for common ys
    si = np.zeros((n_x, n_x), np.double)
    # sum (k_x'y) for common ys
    sj = np.zeros((n_x, n_x), np.double)
    # the similarity matrix
    sim = np.zeros((n_x, n_x), np.double)

    # First pass: gather every rating of each x. Ranks must be computed
    # within each x's own rating list (not across xs for a given y, and
    # not with stale values carried over from a previous y).
    x_ys = [[] for _ in range(n_x)]
    x_rs = [[] for _ in range(n_x)]
    for y, y_ratings in yr.items():
        for x, r in y_ratings:
            x_ys[x].append(y)
            x_rs[x].append(r)

    # k_of[x][y] is the rank of r_xy among all of x's ratings. rankdata
    # returns float ranks (ties get the average rank), so do not truncate
    # them to ints.
    k_of = [{} for _ in range(n_x)]
    for x in range(n_x):
        if x_rs[x]:
            for y, k in zip(x_ys[x], rankdata(x_rs[x])):
                k_of[x][y] = k

    # Second pass: accumulate the Pearson sums on ranks, restricted to
    # the xs that actually rated y (so freq only counts *common* ys).
    for y, y_ratings in yr.items():
        for xi, _ in y_ratings:
            ki = k_of[xi][y]
            for xj, _ in y_ratings:
                kj = k_of[xj][y]
                freq[xi, xj] += 1
                prods[xi, xj] += ki * kj
                sqi[xi, xj] += ki * ki
                sqj[xi, xj] += kj * kj
                si[xi, xj] += ki
                sj[xi, xj] += kj

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):

            if freq[xi, xj] < min_support:
                # was 'sim[xi, xj] == 0' (a no-op comparison) in the
                # original patch
                sim[xi, xj] = 0
            else:
                n = freq[xi, xj]
                num = n * prods[xi, xj] - si[xi, xj] * sj[xi, xj]
                denum = np.sqrt((n * sqi[xi, xj] - si[xi, xj]**2) *
                                (n * sqj[xi, xj] - sj[xi, xj]**2))
                if denum == 0:
                    # constant ranks on the common ys: undefined, report 0
                    sim[xi, xj] = 0
                else:
                    sim[xi, xj] = num / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
def test_spearman_sim():
    """Tests for the spearman similarity."""

    yr = yr_global.copy()

    # shuffle every rating list, to ensure the order in which ratings are
    # processed does not matter
    for _, ratings in yr.items():
        random.shuffle(ratings)

    sim = sims.spearman(n_x, yr, min_support=1)

    # check symmetry and bounds: -1 <= spearman coeff <= 1
    for xi in range(n_x):
        assert sim[xi, xi] == 1
        for xj in range(n_x):
            assert sim[xi, xj] == sim[xj, xi]
            assert -1 <= sim[xi, xj] <= 1

    # on common items, users 0, 1 and 2 have the same ratings (hence the
    # same rankings)
    assert sim[0, 1] == 1
    assert sim[0, 2] == 1

    # for vectors with constant ratings, the ranks are all tied, so the
    # centered rank vectors are null and the spearman sim is zero
    assert sim[3, 4] == 0
    assert sim[2, 3] == 0
    assert sim[2, 4] == 0

    # pairs of users (0, 3) and (0, 4) have no common items
    assert sim[0, 3] == 0
    assert sim[0, 4] == 0

    # users 5 and 6 rank their common items identically
    assert sim[5, 6] == 1

    # check float support and computation correctness on non-trivial ranks:
    # user 6 ranks its ratings (1.5, 3.5, 2.5) as (1, 3, 2) and user 7
    # ranks (3, 2, 2.5) as (3, 1, 2)
    mean6 = (1 + 2 + 3) / 3
    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
    mean7 = (1 + 2 + 3) / 3
    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
    num = sum([((3 - mean6) * (1 - mean7)),
               ((1 - mean6) * (3 - mean7)),
               ((2 - mean6) * (2 - mean7))
               ])
    assert sim[6, 7] == num / (var6 * var7) ** 0.5

    # ensure min_support is taken into account. Only users 1 and 2 have more
    # than 4 common ratings.
    sim = sims.spearman(n_x, yr, min_support=4)
    for i in range(n_x):
        for j in range(i + 1, n_x):
            if i != 1 and j != 2:
                assert sim[i, j] == 0