Merge branch 'feature/list_of_tokens' into 'main'

List-of-tokens support inside SequentialTokenizer (categorical and numerical). See merge request ai-lab-pmo/mltools/recsys/RePlay!242.
sb-ai-lab · Feb 10, 2025 · e2dcc5e · e2dcc5e
2 parents bb1178a + b209db6
commit e2dcc5e
Show file tree

Hide file tree

Showing 20 changed files with 1,678 additions and 1,660 deletions.
diff --git a/examples/09_sasrec_example.ipynb b/examples/09_sasrec_example.ipynb
@@ -1940,7 +1940,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.20"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/replay/data/dataset.py b/replay/data/dataset.py
@@ -458,13 +458,23 @@ def callback(column: str) -> int:
             if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
                 return nunique(self._ids_feature_map[feature.feature_hint], column)
             assert feature.feature_source
+            if feature.feature_type == FeatureType.CATEGORICAL_LIST:
+                if self.is_spark:
+                    data = (
+                        self._feature_source_map[feature.feature_source]
+                        .select(column)
+                        .withColumn(column, sf.explode(column))
+                    )
+                else:
+                    data = self._feature_source_map[feature.feature_source][[column]].explode(column)
+                return nunique(data, column)
             return nunique(self._feature_source_map[feature.feature_source], column)
 
         return callback
 
     def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
         for feature in features_list:
-            if feature.feature_type == FeatureType.CATEGORICAL:
+            if feature.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
                 feature._set_cardinality_callback(self._get_cardinality(feature))
 
     def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
@@ -581,6 +591,7 @@ def _check_column_encoded(
         data: DataFrameLike,
         column: str,
         source: FeatureSource,
+        feature_type: FeatureType,
         cardinality: Optional[int],
     ) -> None:
         """
@@ -593,6 +604,16 @@ def _check_column_encoded(
         Option: Keep this criterion, but suggest the user to disable the check if he understands
         that the criterion will not pass.
         """
+        if feature_type == FeatureType.CATEGORICAL_LIST:  # explode column if list
+            data = data.withColumn(column, sf.explode(column)) if self.is_spark else data[[column]].explode(column)
+
+            if self.is_pandas:
+                try:
+                    data[column] = data[column].astype(int)
+                except Exception:
+                    msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
+                    raise ValueError(msg)
+
         if self.is_pandas:
             is_int = np.issubdtype(dict(data.dtypes)[column], int)
         elif self.is_spark:
@@ -632,27 +653,31 @@ def _check_encoded(self) -> None:
                     self.interactions,
                     feature.column,
                     FeatureSource.INTERACTIONS,
+                    feature.feature_type,
                     feature.cardinality,
                 )
                 if self.item_features is not None:
                     self._check_column_encoded(
                         self.item_features,
                         feature.column,
                         FeatureSource.ITEM_FEATURES,
+                        feature.feature_type,
                         feature.cardinality,
                     )
             elif feature.feature_hint == FeatureHint.QUERY_ID:
                 self._check_column_encoded(
                     self.interactions,
                     feature.column,
                     FeatureSource.INTERACTIONS,
+                    feature.feature_type,
                     feature.cardinality,
                 )
                 if self.query_features is not None:
                     self._check_column_encoded(
                         self.query_features,
                         feature.column,
                         FeatureSource.QUERY_FEATURES,
+                        feature.feature_type,
                         feature.cardinality,
                     )
             else:
@@ -661,6 +686,7 @@ def _check_encoded(self) -> None:
                     data,
                     feature.column,
                     feature.feature_source,
+                    feature.feature_type,
                     feature.cardinality,
                 )
 

diff --git a/replay/data/dataset_utils/dataset_label_encoder.py b/replay/data/dataset_utils/dataset_label_encoder.py
@@ -8,8 +8,8 @@
 import warnings
 from typing import Dict, Iterable, Iterator, Optional, Sequence, Set, Union
 
-from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource
-from replay.preprocessing import LabelEncoder, LabelEncodingRule
+from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
+from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
 from replay.preprocessing.label_encoder import HandleUnknownStrategies
 
 
@@ -62,7 +62,10 @@ def fit(self, dataset: Dataset) -> "DatasetLabelEncoder":
 
         self._fill_features_columns(dataset.feature_schema)
         for column, feature_info in dataset.feature_schema.categorical_features.items():
-            encoding_rule = LabelEncodingRule(
+            encoding_rule_class = (
+                SequenceEncodingRule if feature_info.feature_type == FeatureType.CATEGORICAL_LIST else LabelEncodingRule
+            )
+            encoding_rule = encoding_rule_class(
                 column, handle_unknown=self._handle_unknown_rule, default_value=self._default_value_rule
             )
             if feature_info.feature_hint == FeatureHint.QUERY_ID:

diff --git a/replay/data/nn/schema.py b/replay/data/nn/schema.py
@@ -70,6 +70,8 @@ class TensorFeatureInfo:
     Information about a tensor feature.
     """
 
+    DEFAULT_EMBEDDING_DIM = 64
+
     def __init__(
         self,
         name: str,
@@ -109,18 +111,17 @@ def __init__(
             raise ValueError(msg)
         self._feature_type = feature_type
 
-        if feature_type == FeatureType.NUMERICAL and (cardinality or embedding_dim):
+        if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and (cardinality or embedding_dim):
             msg = "Cardinality and embedding dimensions are needed only with categorical feature type."
             raise ValueError(msg)
         self._cardinality = cardinality
 
-        if feature_type == FeatureType.CATEGORICAL and tensor_dim:
+        if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST] and tensor_dim:
             msg = "Tensor dimensions is needed only with numerical feature type."
             raise ValueError(msg)
 
-        if feature_type == FeatureType.CATEGORICAL:
-            default_embedding_dim = 64
-            self._embedding_dim = embedding_dim or default_embedding_dim
+        if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
+            self._embedding_dim = embedding_dim or self.DEFAULT_EMBEDDING_DIM
         else:
             self._tensor_dim = tensor_dim
 
@@ -176,7 +177,8 @@ def feature_source(self) -> Optional[TensorFeatureSource]:
     @property
     def is_seq(self) -> bool:
         """
-        :returns: Flag that feature is sequential.
+        :returns: Flag that feature is sequential.\n
+        Sequential means that the value of the feature will be determined for each element of the user's sequence.
         """
         return self._is_seq
 
@@ -185,21 +187,28 @@ def is_cat(self) -> bool:
         """
         :returns: Flag that feature is categorical.
         """
-        return self.feature_type == FeatureType.CATEGORICAL
+        return self.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]
 
     @property
     def is_num(self) -> bool:
         """
         :returns: Flag that feature is numerical.
         """
-        return self.feature_type == FeatureType.NUMERICAL
+        return self.feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST]
+
+    @property
+    def is_list(self) -> bool:
+        """
+        :returns: Flag that feature is numerical list or categorical list.
+        """
+        return self.feature_type in [FeatureType.CATEGORICAL_LIST, FeatureType.NUMERICAL_LIST]
 
     @property
     def cardinality(self) -> Optional[int]:
         """
         :returns: Cardinality of the feature.
         """
-        if self.feature_type != FeatureType.CATEGORICAL:
+        if not self.is_cat:
             msg = f"Can not get cardinality because feature type of {self.name} column is not categorical."
             raise RuntimeError(msg)
         return self._cardinality
@@ -212,7 +221,7 @@ def tensor_dim(self) -> Optional[int]:
         """
         :returns: Dimensions of the numerical feature.
         """
-        if self.feature_type != FeatureType.NUMERICAL:
+        if not self.is_num:
             msg = f"Can not get tensor dimensions because feature type of {self.name} feature is not numerical."
             raise RuntimeError(msg)
         return self._tensor_dim
@@ -225,7 +234,7 @@ def embedding_dim(self) -> Optional[int]:
         """
         :returns: Embedding dimensions of the feature.
         """
-        if self.feature_type != FeatureType.CATEGORICAL:
+        if not self.is_cat:
             msg = f"Can not get embedding dimensions because feature type of {self.name} feature is not categorical."
             raise RuntimeError(msg)
         return self._embedding_dim
@@ -317,14 +326,16 @@ def categorical_features(self) -> "TensorSchema":
         """
         :returns: Sequence of categorical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.CATEGORICAL)
+        return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
+            feature_type=FeatureType.CATEGORICAL_LIST
+        )
 
     @property
     def numerical_features(self) -> "TensorSchema":
         """
         :returns: Sequence of numerical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.NUMERICAL)
+        return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
 
     @property
     def query_id_features(self) -> "TensorSchema":
@@ -423,9 +434,9 @@ def _get_object_args(self) -> Dict:
                     if feature.feature_sources
                     else None
                 ),
-                "cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
-                "embedding_dim": feature.embedding_dim if feature.feature_type == FeatureType.CATEGORICAL else None,
-                "tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
+                "cardinality": feature.cardinality if feature.is_cat else None,
+                "embedding_dim": feature.embedding_dim if feature.is_cat else None,
+                "tensor_dim": feature.tensor_dim if feature.is_num else None,
             }
             for feature in self.all_features
         ]