fix tensorflow v2.17 ranking inputs in the wrong order
oaksharks committed Nov 21, 2024
1 parent dcf28c4 commit 4643439
Showing 3 changed files with 24 additions and 29 deletions.
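Editor's note: the failure mode this commit works around is name-based matching of dict-keyed features to a multi-input Keras model, which the author reports orders inputs wrongly under TensorFlow v2.17; feeding features positionally sidesteps the matching entirely. A minimal sketch of the two feeding styles (not from this repository; input names and shapes are illustrative):

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Two inputs with different widths, so any swap is detectable.
inp_a = keras.Input(shape=(2,), name='input_a')
inp_b = keras.Input(shape=(3,), name='input_b')
merged = keras.layers.Concatenate()([inp_a, inp_b])
model = keras.Model(inputs=[inp_a, inp_b], outputs=keras.layers.Dense(1)(merged))
model.compile(optimizer='adam', loss='mse')

x_a = np.ones((8, 2), dtype='float32')
x_b = np.zeros((8, 3), dtype='float32')
y = np.zeros((8, 1), dtype='float32')

# Dict-keyed features rely on matching by input name -- the path the commit
# reports as ordering inputs wrongly under TF v2.17.
ds_by_name = tf.data.Dataset.from_tensor_slices(
    ({'input_a': x_a, 'input_b': x_b}, y)).batch(4)

# A tuple is matched positionally against model.inputs -- the workaround
# this commit adopts in dataset_generator.py.
ds_by_position = tf.data.Dataset.from_tensor_slices(((x_a, x_b), y)).batch(4)

model.fit(ds_by_position, epochs=1, verbose=0)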
27 changes: 12 additions & 15 deletions deeptables/models/deepmodel.py
@@ -384,18 +384,6 @@ def __build_inputs(self, categorical_columns: List[CategoricalColumn], continuous_columns,
 
         return categorical_inputs, continuous_inputs, var_len_categorical_inputs
 
-    def __construct_var_len_embedding(self, column: VarLenCategoricalColumn, var_len_inputs, embedding_dropout):
-        input_layer = var_len_inputs[column.name]
-        var_len_embeddings = VarLenColumnEmbedding(pooling_strategy=column.pooling_strategy,
-                                                   input_dim=column.vocabulary_size,
-                                                   output_dim=column.embeddings_output_dim,
-                                                   dropout_rate=embedding_dropout,
-                                                   name=consts.LAYER_PREFIX_EMBEDDING + column.name,
-                                                   embeddings_initializer=self.config.embeddings_initializer,
-                                                   embeddings_regularizer=self.config.embeddings_regularizer,
-                                                   activity_regularizer=self.config.embeddings_activity_regularizer
-                                                   )(input_layer)
-        return var_len_embeddings
 
     def __build_embeddings(self, categorical_columns, categorical_inputs,
                            var_len_categorical_columns: List[VarLenCategoricalColumn], var_len_inputs,
@@ -416,10 +404,19 @@ def __build_embeddings(self, categorical_columns, categorical_inputs,

         # do embedding for var len feature
         if var_len_categorical_columns is not None and len(var_len_categorical_columns) > 0:
-            for c in var_len_categorical_columns:
+            for column in var_len_categorical_columns:
                 # todo add var len embedding description
-                var_len_embedding = self.__construct_var_len_embedding(c, var_len_inputs, embedding_dropout)
-                embeddings.append(var_len_embedding)
+                input_layer = var_len_inputs[column.name]
+                var_len_embeddings = VarLenColumnEmbedding(pooling_strategy=column.pooling_strategy,
+                                                           input_dim=column.vocabulary_size,
+                                                           output_dim=column.embeddings_output_dim,
+                                                           dropout_rate=embedding_dropout,
+                                                           name=consts.LAYER_PREFIX_EMBEDDING + column.name,
+                                                           embeddings_initializer=self.config.embeddings_initializer,
+                                                           embeddings_regularizer=self.config.embeddings_regularizer,
+                                                           activity_regularizer=self.config.embeddings_activity_regularizer
+                                                           )(input_layer)
+                embeddings.append(var_len_embeddings)
 
         return embeddings

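For context, each var-len column in the inlined block above contributes one named input and one pooled embedding vector to the model. A rough stand-in for what that branch computes, written in plain Keras (hypothetical names; the real VarLenColumnEmbedding is the repository's own layer, patched in layers.py below):

import tensorflow as tf
from tensorflow import keras

# Hypothetical per-column branch: embed the padded id sequence, optionally
# apply spatial dropout, then pool to a fixed-width vector.
def var_len_branch(name, vocab_size, emb_dim, max_len, dropout_rate=0.0):
    ids = keras.Input(shape=(max_len,), dtype='int32', name=name)
    x = keras.layers.Embedding(vocab_size, emb_dim, name='emb_' + name)(ids)
    if dropout_rate > 0:
        x = keras.layers.SpatialDropout1D(dropout_rate)(x)
    pooled = keras.layers.GlobalMaxPooling1D()(x)  # the 'max' pooling strategy
    return ids, pooled

ids, pooled = var_len_branch('genres', vocab_size=50, emb_dim=8, max_len=5,
                             dropout_rate=0.1)
print(pooled.shape)  # (None, 8)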
10 changes: 4 additions & 6 deletions deeptables/models/layers.py
@@ -929,29 +929,27 @@ def __init__(self, pooling_strategy='max', dropout_rate=0., **kwargs):
         self.pooling_strategy = pooling_strategy
         self.dropout_rate = dropout_rate  # dropout support
         super(VarLenColumnEmbedding, self).__init__(**kwargs)
+        self._dropout = None
 
-    def build(self, input_shape):
+    def build(self, input_shape=None):
         super(VarLenColumnEmbedding, self).build(input_shape)
         if self.dropout_rate > 0:
-            self._dropout = SpatialDropout1D(self.dropout_rate)
+            self._dropout = SpatialDropout1D(self.dropout_rate, name='var_len_emb_dropout')
-        else:
-            self._dropout = None
-        self.built = True
 
     def call(self, inputs):
         # 1. do embedding
         embedding_output = super(VarLenColumnEmbedding, self).call(inputs)
 
         # 2. add dropout
         if self._dropout is not None:
             dropout_output = self._dropout(embedding_output)
         else:
             dropout_output = embedding_output
 
         # 3. format output
         return dropout_output
 
-    def compute_mask(self, inputs, mask):
+    def compute_mask(self, inputs, mask=None):
         return None
 
     def get_config(self, ):
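The signature changes above (input_shape=None, mask=None) make the overrides tolerant of however the installed Keras version invokes them, and compute_mask returning None drops the embedding's mask so downstream layers need not handle it. A minimal sketch of the same defensive pattern, assuming only standard Keras APIs:

import tensorflow as tf
from tensorflow import keras

# Sketch: an Embedding subclass with optional build()/compute_mask() args,
# mirroring the defensive signatures adopted above.
class MaskDroppingEmbedding(keras.layers.Embedding):
    def build(self, input_shape=None):
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return None  # swallow the mask produced by mask_zero=True

layer = MaskDroppingEmbedding(input_dim=10, output_dim=4, mask_zero=True)
out = layer(tf.constant([[1, 2, 0]]))
print(out.shape)                                     # (1, 3, 4)
print(layer.compute_mask(tf.constant([[1, 2, 0]])))  # None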
16 changes: 8 additions & 8 deletions deeptables/utils/dataset_generator.py
@@ -35,29 +35,29 @@ def __call__(self, X, y=None, *, batch_size, shuffle, drop_remainder):
 
 class _TFDGForPandas(TFDatasetGenerator):
     def __call__(self, X, y=None, *, batch_size, shuffle, drop_remainder):
-        train_data = {}
+        train_data = []  # Note: tensorflow v2.17 will rank inputs into a wrong order, so use a list instead of dict
         # add categorical data
         if self.categorical_columns is not None and len(self.categorical_columns) > 0:
-            train_data['input_categorical_vars_all'] = \
-                X[[c.name for c in self.categorical_columns]].values.astype(consts.DATATYPE_TENSOR_FLOAT)
+            train_data.append(tf.constant(
+                X[[c.name for c in self.categorical_columns]].values.astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         # add continuous data
         if self.continuous_columns is not None and len(self.continuous_columns) > 0:
             for c in self.continuous_columns:
-                train_data[c.name] = X[c.column_names].values.astype(consts.DATATYPE_TENSOR_FLOAT)
+                train_data.append(tf.constant(X[c.column_names].values.astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         # add var len categorical data
         if self.var_len_categorical_columns is not None and len(self.var_len_categorical_columns) > 0:
             for col in self.var_len_categorical_columns:
-                train_data[col.name] = np.array(X[col.name].tolist())
+                train_data.append(tf.constant(np.array(X[col.name].tolist()).astype(consts.DATATYPE_TENSOR_FLOAT).tolist()))
 
         if y is None:
-            ds = tf.data.Dataset.from_tensor_slices(train_data)
+            ds = tf.data.Dataset.from_tensor_slices(train_data, name='train_x')
         else:
-            y = np.array(y)
+            y = tf.constant(np.array(y).tolist())
             if self.task == consts.TASK_MULTICLASS:
                 y = tf_to_categorical(y, num_classes=self.num_classes)
-            ds = tf.data.Dataset.from_tensor_slices((train_data, y))
+            ds = tf.data.Dataset.from_tensor_slices((tuple(train_data), y), name='train_x_y')
 
         if shuffle:
             ds = ds.shuffle(buffer_size=X.shape[0])
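One caveat worth noting: the list-based feed only fixes ordering if features are appended in exactly the order the model declares its inputs (categorical, then continuous, then var-len above). Positional slicing itself is order-preserving, as this small check illustrates (shapes are illustrative):

import numpy as np
import tensorflow as tf

cat = np.zeros((4, 3), dtype='float32')    # stacked categorical columns
cont = np.ones((4, 2), dtype='float32')    # one continuous block
var_len = np.full((4, 5), 2.0, 'float32')  # padded var-len ids

# A tuple of tensors is sliced element-wise; each dataset element is a tuple
# in the same positional order, so it lines up with model.inputs.
ds = tf.data.Dataset.from_tensor_slices((cat, cont, var_len))
for element in ds.take(1):
    print([tuple(t.shape) for t in element])  # [(3,), (2,), (5,)]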
