diff --git a/deeptables/models/deepmodel.py b/deeptables/models/deepmodel.py
index 8645cc8..076c989 100644
--- a/deeptables/models/deepmodel.py
+++ b/deeptables/models/deepmodel.py
@@ -384,18 +384,6 @@ def __build_inputs(self, categorical_columns: List[CategoricalColumn], continuou
 
         return categorical_inputs, continuous_inputs, var_len_categorical_inputs
 
-    def __construct_var_len_embedding(self, column: VarLenCategoricalColumn, var_len_inputs, embedding_dropout):
-        input_layer = var_len_inputs[column.name]
-        var_len_embeddings = VarLenColumnEmbedding(pooling_strategy=column.pooling_strategy,
-                                                   input_dim=column.vocabulary_size,
-                                                   output_dim=column.embeddings_output_dim,
-                                                   dropout_rate=embedding_dropout,
-                                                   name=consts.LAYER_PREFIX_EMBEDDING + column.name,
-                                                   embeddings_initializer=self.config.embeddings_initializer,
-                                                   embeddings_regularizer=self.config.embeddings_regularizer,
-                                                   activity_regularizer=self.config.embeddings_activity_regularizer
-                                                   )(input_layer)
-        return var_len_embeddings
-
     def __build_embeddings(self, categorical_columns, categorical_inputs,
                            var_len_categorical_columns: List[VarLenCategoricalColumn], var_len_inputs,
@@ -416,10 +404,19 @@ def __build_embeddings(self, categorical_columns, categorical_inputs,
 
         # do embedding for var len feature
         if var_len_categorical_columns is not None and len(var_len_categorical_columns) > 0:
-            for c in var_len_categorical_columns:
+            for column in var_len_categorical_columns:
                 # todo add var len embedding description
-                var_len_embedding = self.__construct_var_len_embedding(c, var_len_inputs, embedding_dropout)
-                embeddings.append(var_len_embedding)
+                input_layer = var_len_inputs[column.name]
+                var_len_embeddings = VarLenColumnEmbedding(pooling_strategy=column.pooling_strategy,
+                                                           input_dim=column.vocabulary_size,
+                                                           output_dim=column.embeddings_output_dim,
+                                                           dropout_rate=embedding_dropout,
+                                                           name=consts.LAYER_PREFIX_EMBEDDING + column.name,
+                                                           embeddings_initializer=self.config.embeddings_initializer,
+                                                           embeddings_regularizer=self.config.embeddings_regularizer,
+                                                           activity_regularizer=self.config.embeddings_activity_regularizer
+                                                           )(input_layer)
+                embeddings.append(var_len_embeddings)
 
         return embeddings
diff --git a/deeptables/models/layers.py b/deeptables/models/layers.py
index d235c93..fdd88a2 100644
--- a/deeptables/models/layers.py
+++ b/deeptables/models/layers.py
@@ -929,29 +929,27 @@ def __init__(self, pooling_strategy='max', dropout_rate=0., **kwargs):
         self.pooling_strategy = pooling_strategy
         self.dropout_rate = dropout_rate  # dropout is supported
         super(VarLenColumnEmbedding, self).__init__(**kwargs)
+        self._dropout = None
 
-    def build(self, input_shape):
+    def build(self, input_shape=None):
         super(VarLenColumnEmbedding, self).build(input_shape)
         if self.dropout_rate > 0:
-            self._dropout = SpatialDropout1D(self.dropout_rate)
+            self._dropout = SpatialDropout1D(self.dropout_rate, name='var_len_emb_dropout')
         else:
             self._dropout = None
         self.built = True
 
     def call(self, inputs):
-        # 1. do embedding
         embedding_output = super(VarLenColumnEmbedding, self).call(inputs)
-        # 2. add dropout
         if self._dropout is not None:
             dropout_output = self._dropout(embedding_output)
         else:
             dropout_output = embedding_output
-        # 3. format output
         return dropout_output
 
-    def compute_mask(self, inputs, mask):
+    def compute_mask(self, inputs, mask=None):
         return None
 
     def get_config(self, ):
diff --git a/deeptables/utils/dataset_generator.py b/deeptables/utils/dataset_generator.py
index 07dbb31..dc66151 100644
--- a/deeptables/utils/dataset_generator.py
+++ b/deeptables/utils/dataset_generator.py
@@ -35,29 +35,29 @@ def __call__(self, X, y=None, *, batch_size, shuffle, drop_remainder):
 
 class _TFDGForPandas(TFDatasetGenerator):
     def __call__(self, X, y=None, *, batch_size, shuffle, drop_remainder):
-        train_data = {}
+        train_data = []  # Note: tensorflow v2.17 may map dict inputs to model inputs in the wrong order, so use a list instead of a dict
 
         # add categorical data
         if self.categorical_columns is not None and len(self.categorical_columns) > 0:
-            train_data['input_categorical_vars_all'] = \
-                X[[c.name for c in self.categorical_columns]].values.astype(consts.DATATYPE_TENSOR_FLOAT)
+            train_data.append(tf.constant(
+                X[[c.name for c in self.categorical_columns]].values.astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         # add continuous data
         if self.continuous_columns is not None and len(self.continuous_columns) > 0:
             for c in self.continuous_columns:
-                train_data[c.name] = X[c.column_names].values.astype(consts.DATATYPE_TENSOR_FLOAT)
+                train_data.append(tf.constant(X[c.column_names].values.astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         # add var len categorical data
         if self.var_len_categorical_columns is not None and len(self.var_len_categorical_columns) > 0:
             for col in self.var_len_categorical_columns:
-                train_data[col.name] = np.array(X[col.name].tolist())
+                train_data.append(tf.constant(np.array(X[col.name].tolist()).astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         if y is None:
-            ds = tf.data.Dataset.from_tensor_slices(train_data)
+            ds = tf.data.Dataset.from_tensor_slices(tuple(train_data), name='train_x')
         else:
-            y = np.array(y)
+            y = tf.constant(np.array(y).tolist())
             if self.task == consts.TASK_MULTICLASS:
                 y = tf_to_categorical(y, num_classes=self.num_classes)
-            ds = tf.data.Dataset.from_tensor_slices((tuple(train_data), y), name='train_x_y')
+            ds = tf.data.Dataset.from_tensor_slices((tuple(train_data), y), name='train_x_y')
 
         if shuffle:
             ds = ds.shuffle(buffer_size=X.shape[0])
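
Background on the dataset_generator.py change, as a minimal standalone sketch outside the patch: tf.data.Dataset.from_tensor_slices treats a dict as named components and a tuple as positional components, while a plain Python list is converted into a single tensor, which fails when the feature blocks have different widths. That is why the patch builds an ordered list and passes tuple(train_data). The arrays cat_block and cont_block and the dict key 'cont' below are made-up stand-ins, not names from the repo.

import numpy as np
import tensorflow as tf

# Made-up stand-ins for the categorical and continuous feature blocks.
cat_block = np.random.rand(8, 3).astype('float32')
cont_block = np.random.rand(8, 5).astype('float32')

# Dict form: components are matched to model inputs by name. This is the
# mapping that reportedly goes wrong under tensorflow v2.17.
ds_dict = tf.data.Dataset.from_tensor_slices(
    {'input_categorical_vars_all': cat_block, 'cont': cont_block})

# Tuple form: components are purely positional, so the list built in
# _TFDGForPandas must follow the order of the model's input layers.
ds_tuple = tf.data.Dataset.from_tensor_slices((cat_block, cont_block))

# A plain list here would be converted to one tensor and fail, because the
# blocks have different widths (3 vs 5); hence tuple(train_data) in the patch.
for x in ds_tuple.batch(4).take(1):
    print([t.shape for t in x])  # [(4, 3), (4, 5)]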