Skip to content

Commit

Permalink
Merge branch 'fix/tokenizer_lists' into 'main'
Browse files Browse the repository at this point in the history
Fix categorical events processing inside SequenceTokenizer

See merge request ai-lab-pmo/mltools/recsys/RePlay!252
  • Loading branch information
OnlyDeniko committed Feb 28, 2025
2 parents 6fea0e1 + cf1f726 commit 4252fdd
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
12 changes: 10 additions & 2 deletions replay/data/nn/sequence_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,9 @@ def _process_num_query_feature(self, tensor_feature: TensorFeatureInfo) -> List[
"""
return self._process_cat_query_feature(tensor_feature)

def _process_cat_interaction_feature(self, tensor_feature: TensorFeatureInfo) -> List[np.ndarray]:
def _process_cat_interaction_feature(
self, tensor_feature: TensorFeatureInfo
) -> Union[List[np.ndarray], List[List]]:
"""
Process categorical interaction feature.
Expand All @@ -699,7 +701,13 @@ def _process_cat_interaction_feature(self, tensor_feature: TensorFeatureInfo) ->
source = tensor_feature.feature_source
assert source is not None

return [np.array(sequence, dtype=np.int64) for sequence in self._grouped_interactions[source.column]]
values = []
for sequence in self._grouped_interactions[source.column].values:
if tensor_feature.feature_type == FeatureType.CATEGORICAL_LIST:
values.append(list(sequence))
else:
values.append(np.array(sequence))
return values

def _process_cat_query_feature(self, tensor_feature: TensorFeatureInfo) -> List[np.ndarray]:
"""
Expand Down
15 changes: 15 additions & 0 deletions tests/data/nn/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,20 @@ def small_numerical_events():
[-1.95, 1.4],
[-1.55, 1.3],
],
"cat_list_feature": [
[-1, 0],
[1, 2],
[3, 4],
[12, 11],
[9, 10],
[8, 7],
[0, 5],
[6, 7],
[7, 13],
[-1, 14],
[-2, 3],
[-3, 9],
],
}
)

Expand All @@ -232,6 +246,7 @@ def small_numerical_feature_schema():
FeatureInfo("timestamp", FeatureType.NUMERICAL, FeatureHint.TIMESTAMP, FeatureSource.INTERACTIONS),
FeatureInfo("num_feature", FeatureType.NUMERICAL, None, FeatureSource.INTERACTIONS),
FeatureInfo("num_list_feature", FeatureType.NUMERICAL_LIST, None, FeatureSource.INTERACTIONS),
FeatureInfo("cat_list_feature", FeatureType.CATEGORICAL_LIST, None, FeatureSource.INTERACTIONS),
FeatureInfo("item_num", FeatureType.NUMERICAL, None, FeatureSource.ITEM_FEATURES),
]
)
Expand Down
40 changes: 40 additions & 0 deletions tests/data/nn/test_sequence_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,46 @@ def test_process_numerical_features(dataset, request):
)


@pytest.mark.torch
@pytest.mark.parametrize("dataset", ["small_numerical_dataset", "small_numerical_dataset_polars"])
def test_process_categorical_features(dataset, request):
    """Check that CATEGORICAL_LIST interaction features are tokenized per user.

    Builds a schema containing the item-id feature plus a sequential
    categorical-list feature, runs fit_transform, and verifies the grouped
    per-user sequences against the raw fixture values via _compare_sequence.
    """
    source_dataset = request.getfixturevalue(dataset)

    # Schema: the mandatory item-id column and one categorical-list column,
    # both sourced from the interactions table.
    item_id_feature = TensorFeatureInfo(
        "item_id",
        cardinality=6,
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, "item_id")],
        feature_hint=FeatureHint.ITEM_ID,
    )
    cat_list_feature_info = TensorFeatureInfo(
        "cat_list_feature",
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL_LIST,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, "cat_list_feature")],
    )
    tokenizer = SequenceTokenizer(TensorSchema([item_id_feature, cat_list_feature_info]))
    sequential_dataset = tokenizer.fit_transform(source_dataset)

    # Expected raw list values grouped by user id, in interaction order
    # (mirrors the "cat_list_feature" column of the fixture).
    expected_per_user = {
        1: [[-1, 0], [1, 2]],
        2: [[3, 4], [12, 11], [9, 10]],
        3: [[8, 7]],
        4: [[0, 5], [6, 7], [7, 13], [-1, 14], [-2, 3], [-3, 9]],
    }
    _compare_sequence(
        sequential_dataset,
        tokenizer,
        "cat_list_feature",
        expected_per_user,
        tokenizer.interactions_encoder.inverse_mapping["cat_list_feature"],
    )


@pytest.mark.torch
@pytest.mark.parametrize("dataset", ["small_dataset", "small_dataset_polars"])
def test_tokenizer_properties(item_id_and_item_features_schema, dataset, request):
Expand Down

0 comments on commit 4252fdd

Please sign in to comment.