
Feature/gloo spark lightfm wrapper 0.4 #49

Open: wants to merge 39 commits into base: main

Changes from all commits (39 commits)
bd1a4ec
Add ANN interface and implementations
netang Mar 16, 2023
8eeeff8
Add dependencies
netang Mar 16, 2023
4ee8e58
Format code via black code formatter. Add doctest examples.
netang Mar 23, 2023
724a149
Add docstrings to ANNMixin methods
netang Mar 24, 2023
2b25dfa
Add test for ANN models (except word2vec)
netang Mar 28, 2023
e2e1f2b
Merge remote-tracking branch 'sb-repo/main' into sb-main-ann
netang May 10, 2023
0b43d7b
Replace `functools.cached_property` with `cached_property.cached_prop…
netang May 10, 2023
23ef21d
Reformat docstring
netang May 10, 2023
7da34f2
Disable/Fix pylint warns. Add docstrings.
netang May 11, 2023
eb4329d
Fix pycodestyle warn.
netang May 11, 2023
f8302f5
Move `HnswlibIndexFileManager` and `NmslibIndexFileManager` to `repla…
netang May 16, 2023
e7caad0
Add `BaseHnswParam`. Fix sphinx warn.
netang May 16, 2023
c59c16a
Remove commented lines.
netang May 16, 2023
48d0b78
Add `DriverHnswlibIndexBuilder` and `ExecutorHnswlibIndexBuilder`
netang May 16, 2023
461cb7d
Add `DriverNmslibIndexBuilder` and `ExecutorNmslibIndexBuilder`
netang May 16, 2023
77787a1
Move `NeighbourRec` to `base_neighbour_rec.py`. Move ann mixins to an…
netang May 16, 2023
14cef4c
Add ANN to `ADMMSLIM` and `AssociationRulesItemRec` models
netang May 16, 2023
4020e56
Disable pylint R0902
netang May 16, 2023
1d3c8d3
Replace `typing.Literal` with `typing_extensions.Literal`
netang May 16, 2023
daaba5d
Fix tests
netang May 16, 2023
516bfd8
Make `HnswlibMixin` and `NmslibHnswMixin` abstract
netang May 17, 2023
3c48caf
Fix saving/loading
netang May 19, 2023
31bfd1a
Remove duplicated code
netang May 19, 2023
3a50457
Update index builders, add index stores and add index inferers.
netang May 21, 2023
8dd8fc0
Fix index saving/loading
netang May 28, 2023
2ded076
Add clean upping index files
netang May 28, 2023
f8c2959
Add tests of save/load ANN models
netang May 28, 2023
83f236a
Fix pylint warns
netang May 28, 2023
33fec4b
Fix pytest error
netang May 28, 2023
de9e622
Add new save/load tests. Fix error in test.
netang May 30, 2023
a44575a
Set poetry-core version interval
netang May 30, 2023
b9f0130
Revert "Set poetry-core version interval"
netang May 30, 2023
f7d42dc
Add `poetry-core` to dependencies
netang May 30, 2023
e763d43
Add tests
netang May 30, 2023
c6d8d80
Fix pycodestyle warn
netang May 30, 2023
281c816
Add `.coveragerc`
netang May 30, 2023
244ef18
Added LightFM wrapper for distributed training
zakharova-anastasiia May 31, 2023
2fdbacf
Merge branch 'sb-main-ann' into feature/gloo-spark-lightfm-wrapper-0.4
zakharova-anastasiia May 31, 2023
b510b64
Minor style changes
zakharova-anastasiia May 31, 2023
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
[run]
omit = replay/ann/index_stores/hdfs_index_store.py
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -46,4 +46,4 @@ jobs:
      - name: pytest
        run: |
          . ./venv/bin/activate
          pytest --cov=replay --cov-report=term-missing --doctest-modules replay --cov-fail-under=93 tests
          pytest --cov-config=.coveragerc --cov=replay --cov-report=term-missing --doctest-modules replay --cov-fail-under=93 tests
1,578 changes: 810 additions & 768 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions pyproject.toml
@@ -30,6 +30,7 @@ packages = [

[tool.poetry.dependencies]
python = ">=3.7, <3.10"
poetry-core = "1.6.0"
lightfm = "*"
lightautoml = ">=0.3.1, <0.3.7"
numpy = ">=1.20.0"
@@ -46,6 +47,11 @@ seaborn = "*"
pyarrow = "*"
implicit = ">=0.5"
pytorch-ranger = "^0.1.1"
nmslib = "*"
hnswlib = "*"
cached-property = "*"
# extra packages
pygloo-rec = { version = "*", optional = true }

[tool.poetry.dev-dependencies]
# dev only
@@ -77,6 +83,9 @@ virtualenv = "*"
data-science-types = "*"
pyspark-stubs = "*"

[tool.poetry.extras]
distributed-lightfm = ["pygloo-rec"]

[tool.black]
line-length = 79

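The new `[tool.poetry.extras]` table makes the Gloo dependency opt-in. A hedged sketch of how a user would pull it in (the PyPI distribution name `replay-rec` is an assumption not confirmed by this diff; the Poetry command applies to a source checkout):

```shell
# From a source checkout managed by Poetry (assumed workflow):
poetry install -E distributed-lightfm

# Or from PyPI, if the project is published as "replay-rec" (assumption):
pip install "replay-rec[distributed-lightfm]"
```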
Empty file added replay/ann/__init__.py
Empty file.
202 changes: 202 additions & 0 deletions replay/ann/ann_mixin.py
@@ -0,0 +1,202 @@
import importlib
import logging
from abc import abstractmethod
from typing import Optional, Dict, Any

from pyspark.sql import DataFrame
from pyspark.sql import functions as sf

from replay.ann.index_builders.base_index_builder import IndexBuilder
from replay.ann.index_stores.spark_files_index_store import (
    SparkFilesIndexStore,
)
from replay.models.base_rec import BaseRecommender

logger = logging.getLogger("replay")


class ANNMixin(BaseRecommender):
    """
    This class overrides the `_fit_wrap` and `_inner_predict_wrap` methods
    of the base class, adding index construction at the `_fit_wrap` step
    and index inference at the `_inner_predict_wrap` step.
    """

    index_builder: Optional[IndexBuilder] = None

    @property
    def _use_ann(self) -> bool:
        """
        Property that determines whether the ANN (index) is used.
        If `True`, the index will be built (at the `fit` stage)
        and inferred (at the `predict` stage).
        """
        return self.index_builder is not None

    @abstractmethod
    def _get_vectors_to_build_ann(self, log: DataFrame) -> DataFrame:
        """Implementations of this method must return a dataframe with item vectors.
        Item vectors from this method are used to build the index.

        Args:
            log: DataFrame with interactions

        Returns: DataFrame[item_idx int, vector array<double>] or DataFrame[vector array<double>].
            Column names in the dataframe can be anything.
        """

    @abstractmethod
    def _get_ann_build_params(self, log: DataFrame) -> Dict[str, Any]:
        """Implementations of this method must return a dictionary
        with arguments for the index builder's `build_index` method.

        Args:
            log: DataFrame with interactions

        Returns: Dictionary with arguments to build the index. For example:
            {
                "id_col": "item_idx",
                "features_col": "item_factors",
                ...
            }
        """

    def _fit_wrap(
        self,
        log: DataFrame,
        user_features: Optional[DataFrame] = None,
        item_features: Optional[DataFrame] = None,
    ) -> None:
        """Wrapper that extends `_fit_wrap`, adding construction of the ANN index by flag.

        Args:
            log: historical log of interactions
                ``[user_idx, item_idx, timestamp, relevance]``
            user_features: user features
                ``[user_idx, timestamp]`` + feature columns
            item_features: item features
                ``[item_idx, timestamp]`` + feature columns
        """
        super()._fit_wrap(log, user_features, item_features)

        if self._use_ann:
            vectors = self._get_vectors_to_build_ann(log)
            ann_params = self._get_ann_build_params(log)
            self.index_builder.build_index(vectors, **ann_params)

    @abstractmethod
    def _get_vectors_to_infer_ann_inner(
        self, log: DataFrame, users: DataFrame
    ) -> DataFrame:
        """Implementations of this method must return a dataframe with user vectors.
        User vectors from this method are used to infer the index.

        Args:
            log: DataFrame with interactions
            users: DataFrame with users

        Returns: DataFrame[user_idx int, vector array<double>] or DataFrame[vector array<double>].
            The vector column name in the dataframe can be anything.
        """

    def _get_vectors_to_infer_ann(
        self, log: DataFrame, users: DataFrame, filter_seen_items: bool
    ) -> DataFrame:
        """This method wraps `_get_vectors_to_infer_ann_inner`
        and, by flag, adds seen items to the dataframe with user vectors.

        Args:
            log: DataFrame with interactions
            users: DataFrame with users
            filter_seen_items: flag to remove seen items from recommendations based on ``log``.

        Returns: DataFrame with user vectors
            (plus seen-items columns when `filter_seen_items` is set).
        """
        users = self._get_vectors_to_infer_ann_inner(log, users)

        # here we add `seen_item_idxs` to filter the viewed items in UDFs (see infer_index_udf)
        if filter_seen_items:
            user_to_max_items = log.groupBy("user_idx").agg(
                sf.count("item_idx").alias("num_items"),
                sf.collect_set("item_idx").alias("seen_item_idxs"),
            )
            users = users.join(user_to_max_items, on="user_idx")

        return users

    @abstractmethod
    def _get_ann_infer_params(self) -> Dict[str, Any]:
        """Implementations of this method must return a dictionary
        with arguments used to infer the index (see `_inner_predict_wrap`).

        Returns: Dictionary with arguments to infer the index. For example:
            {
                "features_col": "user_vector",
                ...
            }
        """

    def _inner_predict_wrap(  # pylint: disable=too-many-arguments
        self,
        log: DataFrame,
        k: int,
        users: DataFrame,
        items: DataFrame,
        user_features: Optional[DataFrame] = None,
        item_features: Optional[DataFrame] = None,
        filter_seen_items: bool = True,
    ) -> DataFrame:
        """Overrides the base `_inner_predict_wrap` and adds ANN inference by condition."""
        if self._use_ann:
            vectors = self._get_vectors_to_infer_ann(
                log, users, filter_seen_items
            )
            ann_params = self._get_ann_infer_params()
            inferer = self.index_builder.produce_inferer(filter_seen_items)
            return inferer.infer(vectors, ann_params["features_col"], k)

        return self._predict(
            log,
            k,
            users,
            items,
            user_features,
            item_features,
            filter_seen_items,
        )

    def _filter_seen(
        self, recs: DataFrame, log: DataFrame, k: int, users: DataFrame
    ):
        """
        Overrides the `_filter_seen` method from the base class.
        Filtering is not needed for ANN methods because the data is already filtered in the UDF.
        """
        if self._use_ann:
            return recs

        return super()._filter_seen(recs, log, k, users)

    def _save_index(self, path):
        self.index_builder.index_store.dump_index(path)

    def _load_index(self, path: str):
        self.index_builder.index_store = SparkFilesIndexStore()
        self.index_builder.index_store.load_from_path(path)

    def init_builder_from_dict(self, init_meta: dict):
        """Inits an index builder instance from a dict with init meta."""

        # index param entity instance initialization
        module = importlib.import_module(init_meta["index_param"]["module"])
        class_ = getattr(module, init_meta["index_param"]["class"])
        index_params = class_(**init_meta["index_param"]["init_args"])

        # index builder instance initialization
        module = importlib.import_module(init_meta["builder"]["module"])
        class_ = getattr(module, init_meta["builder"]["class"])
        index_builder = class_(index_params=index_params, index_store=None)

        self.index_builder = index_builder
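The fit/predict branching that `ANNMixin` introduces can be sketched with plain Python in place of Spark DataFrames and the real index builders; `ToyIndexBuilder` and `ToyANNModel` below are illustrative stand-ins, not repository classes:

```python
# Illustrative sketch of the branching ANNMixin adds. The real classes
# operate on Spark DataFrames and hnswlib/nmslib indices; here a dict
# stands in for the index.

class ToyIndexBuilder:
    """Builds a trivial 'index': a dict from item id to its vector."""

    def __init__(self):
        self.index = None

    def build_index(self, vectors, features_col):
        # The real builders construct an hnswlib/nmslib index here.
        self.index = dict(enumerate(vectors))


class ToyANNModel:
    """Mimics a recommender that mixes in ANN support."""

    def __init__(self, index_builder=None):
        self.index_builder = index_builder

    @property
    def _use_ann(self):
        # Mirrors ANNMixin._use_ann: ANN is on iff a builder is set.
        return self.index_builder is not None

    def fit(self, item_vectors):
        if self._use_ann:
            # Mirrors _fit_wrap: build the index right after fitting.
            self.index_builder.build_index(item_vectors, features_col="vector")

    def predict(self, k):
        if self._use_ann:
            # Approximate path: answer from the prebuilt index.
            return list(self.index_builder.index)[:k]
        # Exact path: fall back to the base recommender logic.
        return []


model = ToyANNModel(index_builder=ToyIndexBuilder())
model.fit([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
print(model.predict(k=2))  # item ids served from the toy index
```

When `index_builder` is `None` the mixin is inert and the exact path runs, which is why `_use_ann` gates every override.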
Empty file added replay/ann/entities/__init__.py
Empty file.
32 changes: 32 additions & 0 deletions replay/ann/entities/base_hnsw_param.py
@@ -0,0 +1,32 @@
from dataclasses import dataclass
from typing import Optional


@dataclass
class BaseHnswParam:
    """
    Base hnsw params.
    """

    space: str
    m: int = 200  # pylint: disable=invalid-name
    ef_c: int = 20000
    post: int = 0
    ef_s: Optional[int] = None

    def init_meta_as_dict(self) -> dict:
        """
        Returns meta-information for class instance initialization. Used to save the entity to disk.

        :return: dictionary with init meta.
        """
        return {
            "module": type(self).__module__,
            "class": type(self).__name__,
            "init_args": {
                "space": self.space,
                "m": self.m,
                "ef_c": self.ef_c,
                "post": self.post,
                "ef_s": self.ef_s,
            },
        }
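The `init_meta_as_dict` contract pairs with `ANNMixin.init_builder_from_dict` above: the saved dict carries enough information to re-import the module and re-instantiate the class. A minimal round trip, using a local replica of the dataclass so the snippet is self-contained:

```python
import importlib
from dataclasses import dataclass
from typing import Optional


@dataclass
class BaseHnswParam:
    """Local replica of replay/ann/entities/base_hnsw_param.py."""

    space: str
    m: int = 200
    ef_c: int = 20000
    post: int = 0
    ef_s: Optional[int] = None

    def init_meta_as_dict(self) -> dict:
        # Enough metadata to re-import and re-instantiate this class.
        return {
            "module": type(self).__module__,
            "class": type(self).__name__,
            "init_args": {
                "space": self.space,
                "m": self.m,
                "ef_c": self.ef_c,
                "post": self.post,
                "ef_s": self.ef_s,
            },
        }


param = BaseHnswParam(space="ip", m=16, ef_c=200, ef_s=200)
meta = param.init_meta_as_dict()

# Re-create the instance from the saved meta, as init_builder_from_dict does.
module = importlib.import_module(meta["module"])
cls = getattr(module, meta["class"])
restored = cls(**meta["init_args"])
assert restored == param
```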
66 changes: 66 additions & 0 deletions replay/ann/entities/hnswlib_param.py
@@ -0,0 +1,66 @@
from dataclasses import dataclass, field
from typing import Optional

from typing_extensions import Literal

from replay.ann.entities.base_hnsw_param import BaseHnswParam


@dataclass
class HnswlibParam(BaseHnswParam):
    """
    Parameters for hnswlib methods.

    For example,

    >>> HnswlibParam(space="ip",\
            m=100,\
            ef_c=200,\
            post=0,\
            ef_s=2000,\
        )
    HnswlibParam(space='ip', m=100, ef_c=200, post=0, ef_s=2000, dim=None, max_elements=None)

    The `space` parameter is described at
    https://github.com/nmslib/hnswlib/blob/master/README.md#supported-distances
    The `m`, `ef_s` and `ef_c` parameters are described at
    https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md

    The reasonable range of values for the `m` parameter is 5-100,
    and for `ef_c` and `ef_s` it is 100-2000.
    Increasing these values improves prediction quality
    but also increases index build time and inference time.

    We recommend using these settings:

    - m=16, ef_c=200 and ef_s=200 for simple datasets like MovieLens.
    - m=50, ef_c=1000 and ef_s=1000 for average quality with an average prediction time.
    - m=75, ef_c=2000 and ef_s=2000 for the highest quality with a long prediction time.

    note: the choice of these parameters depends on the dataset
    and the quality/time tradeoff.

    note: when reducing parameter values, high-range metrics
    such as Metric@1000 suffer first.

    note: even with a long index build time, ANN pays off
    when inference is run multiple times.
    """

    space: Literal["l2", "ip", "cosine"] = "ip"
    # Dimension of the vectors in the index
    dim: Optional[int] = field(default=None, init=False)
    # Max number of elements that will be stored in the index
    max_elements: Optional[int] = field(default=None, init=False)

    # def init_args_as_dict(self):
    #     # union dicts
    #     return dict(
    #         super().init_args_as_dict()["init_args"], **{"space": self.space}
    #     )
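The `dim` and `max_elements` fields use `field(init=False)`, so they are excluded from the generated constructor and filled in later, once the shape of the factor matrix is known. A self-contained sketch of this dataclass pattern (`HnswlibParamSketch` is a local stand-in, not the repository class):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class HnswlibParamSketch:
    """Local stand-in for HnswlibParam, illustrating init=False fields."""

    space: str = "ip"
    m: int = 16
    ef_c: int = 200
    ef_s: Optional[int] = 200
    # Not constructor arguments: set later, when the dimensionality and
    # size of the item-vector matrix are known.
    dim: Optional[int] = field(default=None, init=False)
    max_elements: Optional[int] = field(default=None, init=False)


params = HnswlibParamSketch(space="ip", m=16, ef_c=200, ef_s=200)
params.dim = 64               # e.g. number of latent factors
params.max_elements = 10_000  # e.g. number of items to index
print(params.dim, params.max_elements)
```

Passing `dim=` to the constructor raises `TypeError`, which keeps the user-facing parameters (`space`, `m`, `ef_c`, `ef_s`) separate from the internals the model fills in.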