Skip to content

Commit

Permalink
Merge branch 'fix/tokenizer_lists' into 'main'
Browse files Browse the repository at this point in the history
Fix categorical events processing inside SequenceTokenizer

See merge request ai-lab-pmo/mltools/recsys/RePlay!252
  • Loading branch information
OnlyDeniko committed Feb 28, 2025
2 parents 6fea0e1 + cf1f726 commit 4252fdd
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
12 changes: 10 additions & 2 deletions replay/data/nn/sequence_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,9 @@ def _process_num_query_feature(self, tensor_feature: TensorFeatureInfo) -> List[
"""
return self._process_cat_query_feature(tensor_feature)

def _process_cat_interaction_feature(self, tensor_feature: TensorFeatureInfo) -> List[np.ndarray]:
def _process_cat_interaction_feature(
self, tensor_feature: TensorFeatureInfo
) -> Union[List[np.ndarray], List[List]]:
"""
Process categorical interaction feature.
Expand All @@ -699,7 +701,13 @@ def _process_cat_interaction_feature(self, tensor_feature: TensorFeatureInfo) ->
source = tensor_feature.feature_source
assert source is not None

return [np.array(sequence, dtype=np.int64) for sequence in self._grouped_interactions[source.column]]
values = []
for sequence in self._grouped_interactions[source.column].values:
if tensor_feature.feature_type == FeatureType.CATEGORICAL_LIST:
values.append(list(sequence))
else:
values.append(np.array(sequence))
return values

def _process_cat_query_feature(self, tensor_feature: TensorFeatureInfo) -> List[np.ndarray]:
"""
Expand Down
15 changes: 15 additions & 0 deletions tests/data/nn/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,20 @@ def small_numerical_events():
[-1.95, 1.4],
[-1.55, 1.3],
],
"cat_list_feature": [
[-1, 0],
[1, 2],
[3, 4],
[12, 11],
[9, 10],
[8, 7],
[0, 5],
[6, 7],
[7, 13],
[-1, 14],
[-2, 3],
[-3, 9],
],
}
)

Expand All @@ -232,6 +246,7 @@ def small_numerical_feature_schema():
FeatureInfo("timestamp", FeatureType.NUMERICAL, FeatureHint.TIMESTAMP, FeatureSource.INTERACTIONS),
FeatureInfo("num_feature", FeatureType.NUMERICAL, None, FeatureSource.INTERACTIONS),
FeatureInfo("num_list_feature", FeatureType.NUMERICAL_LIST, None, FeatureSource.INTERACTIONS),
FeatureInfo("cat_list_feature", FeatureType.CATEGORICAL_LIST, None, FeatureSource.INTERACTIONS),
FeatureInfo("item_num", FeatureType.NUMERICAL, None, FeatureSource.ITEM_FEATURES),
]
)
Expand Down
40 changes: 40 additions & 0 deletions tests/data/nn/test_sequence_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,46 @@ def test_process_numerical_features(dataset, request):
)


@pytest.mark.torch
@pytest.mark.parametrize("dataset", ["small_numerical_dataset", "small_numerical_dataset_polars"])
def test_process_categorical_features(dataset, request):
    """Check that CATEGORICAL_LIST interaction features are tokenized per user.

    Builds a schema containing the item-id feature plus a sequential
    categorical-list feature, runs fit_transform, and verifies the grouped
    per-user sequences against the raw fixture values via _compare_sequence.
    """
    source_dataset = request.getfixturevalue(dataset)

    # Schema: the mandatory item-id column and one categorical-list column,
    # both sourced from the interactions table.
    item_id_feature = TensorFeatureInfo(
        "item_id",
        cardinality=6,
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, "item_id")],
        feature_hint=FeatureHint.ITEM_ID,
    )
    cat_list_feature_info = TensorFeatureInfo(
        "cat_list_feature",
        is_seq=True,
        feature_type=FeatureType.CATEGORICAL_LIST,
        feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, "cat_list_feature")],
    )
    tokenizer = SequenceTokenizer(TensorSchema([item_id_feature, cat_list_feature_info]))
    sequential_dataset = tokenizer.fit_transform(source_dataset)

    # Expected raw list values grouped by user id, in interaction order
    # (mirrors the "cat_list_feature" column of the fixture).
    expected_per_user = {
        1: [[-1, 0], [1, 2]],
        2: [[3, 4], [12, 11], [9, 10]],
        3: [[8, 7]],
        4: [[0, 5], [6, 7], [7, 13], [-1, 14], [-2, 3], [-3, 9]],
    }
    _compare_sequence(
        sequential_dataset,
        tokenizer,
        "cat_list_feature",
        expected_per_user,
        tokenizer.interactions_encoder.inverse_mapping["cat_list_feature"],
    )


@pytest.mark.torch
@pytest.mark.parametrize("dataset", ["small_dataset", "small_dataset_polars"])
def test_tokenizer_properties(item_id_and_item_features_schema, dataset, request):
Expand Down

0 comments on commit 4252fdd

Please sign in to comment.