Skip to content

Commit

Permalink
Merge branch 'feature/list_of_tokens' into 'main'
Browse files: browse the repository at this point in the history
List of tokens support inside SequentialTokenizer (categorical and numerical)

See merge request ai-lab-pmo/mltools/recsys/RePlay!242
  • Loading branch information
OnlyDeniko committed Feb 10, 2025
2 parents bb1178a + b209db6 commit e2dcc5e
Show file tree
Hide file tree
Showing 20 changed files with 1,678 additions and 1,660 deletions.
2 changes: 1 addition & 1 deletion examples/09_sasrec_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1940,7 +1940,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.20"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
28 changes: 27 additions & 1 deletion replay/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,13 +458,23 @@ def callback(column: str) -> int:
if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
return nunique(self._ids_feature_map[feature.feature_hint], column)
assert feature.feature_source
if feature.feature_type == FeatureType.CATEGORICAL_LIST:
if self.is_spark:
data = (
self._feature_source_map[feature.feature_source]
.select(column)
.withColumn(column, sf.explode(column))
)
else:
data = self._feature_source_map[feature.feature_source][[column]].explode(column)
return nunique(data, column)
return nunique(self._feature_source_map[feature.feature_source], column)

return callback

def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
for feature in features_list:
if feature.feature_type == FeatureType.CATEGORICAL:
if feature.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
feature._set_cardinality_callback(self._get_cardinality(feature))

def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
Expand Down Expand Up @@ -581,6 +591,7 @@ def _check_column_encoded(
data: DataFrameLike,
column: str,
source: FeatureSource,
feature_type: FeatureType,
cardinality: Optional[int],
) -> None:
"""
Expand All @@ -593,6 +604,16 @@ def _check_column_encoded(
Option: Keep this criterion, but suggest that users disable the check if they understand
that the criterion will not pass.
"""
if feature_type == FeatureType.CATEGORICAL_LIST: # explode column if list
data = data.withColumn(column, sf.explode(column)) if self.is_spark else data[[column]].explode(column)

if self.is_pandas:
try:
data[column] = data[column].astype(int)
except Exception:
msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
raise ValueError(msg)

if self.is_pandas:
is_int = np.issubdtype(dict(data.dtypes)[column], int)
elif self.is_spark:
Expand Down Expand Up @@ -632,27 +653,31 @@ def _check_encoded(self) -> None:
self.interactions,
feature.column,
FeatureSource.INTERACTIONS,
feature.feature_type,
feature.cardinality,
)
if self.item_features is not None:
self._check_column_encoded(
self.item_features,
feature.column,
FeatureSource.ITEM_FEATURES,
feature.feature_type,
feature.cardinality,
)
elif feature.feature_hint == FeatureHint.QUERY_ID:
self._check_column_encoded(
self.interactions,
feature.column,
FeatureSource.INTERACTIONS,
feature.feature_type,
feature.cardinality,
)
if self.query_features is not None:
self._check_column_encoded(
self.query_features,
feature.column,
FeatureSource.QUERY_FEATURES,
feature.feature_type,
feature.cardinality,
)
else:
Expand All @@ -661,6 +686,7 @@ def _check_encoded(self) -> None:
data,
feature.column,
feature.feature_source,
feature.feature_type,
feature.cardinality,
)

Expand Down
9 changes: 6 additions & 3 deletions replay/data/dataset_utils/dataset_label_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
import warnings
from typing import Dict, Iterable, Iterator, Optional, Sequence, Set, Union

from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource
from replay.preprocessing import LabelEncoder, LabelEncodingRule
from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
from replay.preprocessing.label_encoder import HandleUnknownStrategies


Expand Down Expand Up @@ -62,7 +62,10 @@ def fit(self, dataset: Dataset) -> "DatasetLabelEncoder":

self._fill_features_columns(dataset.feature_schema)
for column, feature_info in dataset.feature_schema.categorical_features.items():
encoding_rule = LabelEncodingRule(
encoding_rule_class = (
SequenceEncodingRule if feature_info.feature_type == FeatureType.CATEGORICAL_LIST else LabelEncodingRule
)
encoding_rule = encoding_rule_class(
column, handle_unknown=self._handle_unknown_rule, default_value=self._default_value_rule
)
if feature_info.feature_hint == FeatureHint.QUERY_ID:
Expand Down
43 changes: 27 additions & 16 deletions replay/data/nn/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ class TensorFeatureInfo:
Information about a tensor feature.
"""

DEFAULT_EMBEDDING_DIM = 64

def __init__(
self,
name: str,
Expand Down Expand Up @@ -109,18 +111,17 @@ def __init__(
raise ValueError(msg)
self._feature_type = feature_type

if feature_type == FeatureType.NUMERICAL and (cardinality or embedding_dim):
if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and (cardinality or embedding_dim):
msg = "Cardinality and embedding dimensions are needed only with categorical feature type."
raise ValueError(msg)
self._cardinality = cardinality

if feature_type == FeatureType.CATEGORICAL and tensor_dim:
if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST] and tensor_dim:
msg = "Tensor dimensions is needed only with numerical feature type."
raise ValueError(msg)

if feature_type == FeatureType.CATEGORICAL:
default_embedding_dim = 64
self._embedding_dim = embedding_dim or default_embedding_dim
if feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
self._embedding_dim = embedding_dim or self.DEFAULT_EMBEDDING_DIM
else:
self._tensor_dim = tensor_dim

Expand Down Expand Up @@ -176,7 +177,8 @@ def feature_source(self) -> Optional[TensorFeatureSource]:
@property
def is_seq(self) -> bool:
"""
:returns: Flag that feature is sequential.
:returns: Flag that feature is sequential.\n
Sequential means that the value of the feature will be determined for each element of the user's sequence.
"""
return self._is_seq

Expand All @@ -185,21 +187,28 @@ def is_cat(self) -> bool:
"""
:returns: Flag that feature is categorical.
"""
return self.feature_type == FeatureType.CATEGORICAL
return self.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]

@property
def is_num(self) -> bool:
"""
:returns: Flag that feature is numerical.
"""
return self.feature_type == FeatureType.NUMERICAL
return self.feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST]

@property
def is_list(self) -> bool:
"""
:returns: Flag that feature is numerical list or categorical list.
"""
return self.feature_type in [FeatureType.CATEGORICAL_LIST, FeatureType.NUMERICAL_LIST]

@property
def cardinality(self) -> Optional[int]:
"""
:returns: Cardinality of the feature.
"""
if self.feature_type != FeatureType.CATEGORICAL:
if not self.is_cat:
msg = f"Can not get cardinality because feature type of {self.name} column is not categorical."
raise RuntimeError(msg)
return self._cardinality
Expand All @@ -212,7 +221,7 @@ def tensor_dim(self) -> Optional[int]:
"""
:returns: Dimensions of the numerical feature.
"""
if self.feature_type != FeatureType.NUMERICAL:
if not self.is_num:
msg = f"Can not get tensor dimensions because feature type of {self.name} feature is not numerical."
raise RuntimeError(msg)
return self._tensor_dim
Expand All @@ -225,7 +234,7 @@ def embedding_dim(self) -> Optional[int]:
"""
:returns: Embedding dimensions of the feature.
"""
if self.feature_type != FeatureType.CATEGORICAL:
if not self.is_cat:
msg = f"Can not get embedding dimensions because feature type of {self.name} feature is not categorical."
raise RuntimeError(msg)
return self._embedding_dim
Expand Down Expand Up @@ -317,14 +326,16 @@ def categorical_features(self) -> "TensorSchema":
"""
:returns: Sequence of categorical features in a schema.
"""
return self.filter(feature_type=FeatureType.CATEGORICAL)
return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
feature_type=FeatureType.CATEGORICAL_LIST
)

@property
def numerical_features(self) -> "TensorSchema":
"""
:returns: Sequence of numerical features in a schema.
"""
return self.filter(feature_type=FeatureType.NUMERICAL)
return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)

@property
def query_id_features(self) -> "TensorSchema":
Expand Down Expand Up @@ -423,9 +434,9 @@ def _get_object_args(self) -> Dict:
if feature.feature_sources
else None
),
"cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
"embedding_dim": feature.embedding_dim if feature.feature_type == FeatureType.CATEGORICAL else None,
"tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
"cardinality": feature.cardinality if feature.is_cat else None,
"embedding_dim": feature.embedding_dim if feature.is_cat else None,
"tensor_dim": feature.tensor_dim if feature.is_num else None,
}
for feature in self.all_features
]
Expand Down
Loading

0 comments on commit e2dcc5e

Please sign in to comment.