Commit 6410df5

batch norm, cleanup unused args (#539)
At least I assume they are unused. A user would likely not provide a tf.Tensor in a config, or the specified type was wrong and this was supposed to be a float. Either way, I still don't see how this could have been used. Maybe it was a relic of the Theano code conversion.
1 parent a41d02e commit 6410df5

File tree: 3 files changed (+127, -142 lines)

returnn/tf/layers/base.py

Lines changed: 39 additions & 49 deletions
@@ -1194,11 +1194,9 @@ def get_constraints_value(self):
   def batch_norm(self, data,
                  use_shift=True, use_std=True, use_sample=0.0, force_sample=False,
                  momentum=0.99, epsilon=1e-3,
-                 sample_mean=None, sample_variance=None,
                  update_sample_only_in_training=False,
                  delay_sample_update=False,
                  param_version=0,
-                 gamma=None, beta=None,
                  gamma_init=1.0, beta_init=0.0,
                  masked_time=True):
     """
@@ -1212,10 +1210,6 @@ def batch_norm(self, data,
     :param bool delay_sample_update:
     :param int param_version: 0 or 1
     :param float epsilon:
-    :param tf.Tensor sample_mean:
-    :param tf.Tensor sample_variance:
-    :param tf.Tensor gamma:
-    :param tf.Tensor beta:
     :param str|float gamma_init: see :func:`TFUtil.get_initializer`, for the scale
     :param str|float beta_init: see :func:`TFUtil.get_initializer`, for the mean
     :param bool masked_time: flatten and mask input tensor
@@ -1253,31 +1247,29 @@ def batch_norm(self, data,
       param_name_prefix = ""
     else:
       raise NotImplementedError("%s: batch_norm param_version %r" % (self, param_version))
-    if sample_mean is None:
-      with self.var_creation_scope():
-        sample_mean = self.add_param(tf_compat.v1.get_variable(
-          shape=data.get_bc_spatial_batch_shape(), initializer=tf_compat.v1.zeros_initializer(),
-          name="%smean" % param_name_prefix,
-          trainable=False))
-      # Use exponential moving average of batch mean.
-      # Note: We could also use cumulative moving average. Our Theano implementation does that for inference.
-      updated_sample_mean = tf_compat.v1.assign_add(sample_mean, (mean - sample_mean) * momentum)
-      if delay_sample_update:
-        delayed_ops.append(updated_sample_mean.op)
-      else:
-        sample_mean = updated_sample_mean
-    if sample_variance is None:
-      # Note: Our Theano implementation does not use a moving average for this.
-      with self.var_creation_scope():
-        sample_variance = self.add_param(tf_compat.v1.get_variable(
-          shape=data.get_bc_spatial_batch_shape(), initializer=tf_compat.v1.ones_initializer(),
-          name="%svariance" % param_name_prefix,
-          trainable=False))
-      updated_sample_variance = tf_compat.v1.assign_add(sample_variance, (variance - sample_variance) * momentum)
-      if delay_sample_update:
-        delayed_ops.append(updated_sample_variance.op)
-      else:
-        sample_variance = updated_sample_variance
+    with self.var_creation_scope():
+      sample_mean = self.add_param(tf_compat.v1.get_variable(
+        shape=data.get_bc_spatial_batch_shape(), initializer=tf_compat.v1.zeros_initializer(),
+        name="%smean" % param_name_prefix,
+        trainable=False))
+    # Use exponential moving average of batch mean.
+    # Note: We could also use cumulative moving average. Our Theano implementation does that for inference.
+    updated_sample_mean = tf_compat.v1.assign_add(sample_mean, (mean - sample_mean) * momentum)
+    if delay_sample_update:
+      delayed_ops.append(updated_sample_mean.op)
+    else:
+      sample_mean = updated_sample_mean
+    # Note: Our Theano implementation does not use a moving average for this.
+    with self.var_creation_scope():
+      sample_variance = self.add_param(tf_compat.v1.get_variable(
+        shape=data.get_bc_spatial_batch_shape(), initializer=tf_compat.v1.ones_initializer(),
+        name="%svariance" % param_name_prefix,
+        trainable=False))
+    updated_sample_variance = tf_compat.v1.assign_add(sample_variance, (variance - sample_variance) * momentum)
+    if delay_sample_update:
+      delayed_ops.append(updated_sample_variance.op)
+    else:
+      sample_variance = updated_sample_variance
     # If train or if force_sample, use default use_sample=0.0, otherwise use_sample=1.0.
     if self.network.train_flag is not False or force_sample:
       if force_sample:
@@ -1295,26 +1287,24 @@ def batch_norm(self, data,
         tf_util.add_control_input(op, control_input=bn.op)
       self.network.register_post_control_dependencies(delayed_ops)
     if use_std:
-      if gamma is None:
-        with self.var_creation_scope():
-          from returnn.tf.util.basic import get_initializer
-          gamma_initializer = get_initializer(
-            gamma_init, seed=self.network.random.randint(2 ** 31) if gamma_init else 0, eval_local_ns={"layer": self})
-          gamma = self.add_param(tf_compat.v1.get_variable(
-            shape=data.get_bc_spatial_batch_shape(), initializer=gamma_initializer,
-            name="%sgamma" % param_name_prefix,
-            trainable=True))
+      with self.var_creation_scope():
+        from returnn.tf.util.basic import get_initializer
+        gamma_initializer = get_initializer(
+          gamma_init, seed=self.network.random.randint(2 ** 31) if gamma_init else 0, eval_local_ns={"layer": self})
+        gamma = self.add_param(tf_compat.v1.get_variable(
+          shape=data.get_bc_spatial_batch_shape(), initializer=gamma_initializer,
+          name="%sgamma" % param_name_prefix,
+          trainable=True))
       bn *= gamma
     if use_shift:
-      if beta is None:
-        with self.var_creation_scope():
-          from returnn.tf.util.basic import get_initializer
-          beta_initializer = get_initializer(
-            beta_init, seed=self.network.random.randint(2 ** 31) if beta_init else 0, eval_local_ns={"layer": self})
-          beta = self.add_param(tf_compat.v1.get_variable(
-            shape=data.get_bc_spatial_batch_shape(), initializer=beta_initializer,
-            name="%sbeta" % param_name_prefix,
-            trainable=True))
+      with self.var_creation_scope():
+        from returnn.tf.util.basic import get_initializer
+        beta_initializer = get_initializer(
+          beta_init, seed=self.network.random.randint(2 ** 31) if beta_init else 0, eval_local_ns={"layer": self})
+        beta = self.add_param(tf_compat.v1.get_variable(
+          shape=data.get_bc_spatial_batch_shape(), initializer=beta_initializer,
+          name="%sbeta" % param_name_prefix,
+          trainable=True))
       bn += beta
     return bn
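
The update that survives the cleanup, tf_compat.v1.assign_add(sample_mean, (mean - sample_mean) * momentum), is a plain exponential moving average of the batch statistics. A minimal NumPy sketch of that update rule, outside TensorFlow and with made-up batch shapes (illustrative only, not RETURNN code):

import numpy

# Sketch of the moving-average update kept in batch_norm(), done in NumPy.
# Shapes and the number of steps are arbitrary; momentum matches the default.
momentum = 0.99
sample_mean = numpy.zeros(16)       # like the "mean" variable, zeros-initialized
sample_variance = numpy.ones(16)    # like the "variance" variable, ones-initialized

rnd = numpy.random.RandomState(0)
for step in range(5):
  batch = rnd.rand(32, 16)          # hypothetical batch of feature vectors
  mean = batch.mean(axis=0)
  variance = batch.var(axis=0)
  # Same form as tf_compat.v1.assign_add(sample_mean, (mean - sample_mean) * momentum):
  sample_mean += (mean - sample_mean) * momentum
  sample_variance += (variance - sample_variance) * momentum

With each update, the stored sample statistics move a fraction (momentum) of the way toward the current batch statistics, which is what the non-trainable "mean" and "variance" variables track during training.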

returnn/tf/layers/basic.py

Lines changed: 0 additions & 6 deletions
@@ -575,11 +575,9 @@ class BatchNormLayer(CopyLayer):
 
   def __init__(self, use_shift=NotSpecified, use_std=NotSpecified, use_sample=NotSpecified, force_sample=NotSpecified,
                momentum=NotSpecified, epsilon=NotSpecified,
-               sample_mean=NotSpecified, sample_variance=NotSpecified,
                update_sample_only_in_training=NotSpecified,
                delay_sample_update=NotSpecified,
                param_version=NotSpecified,
-               gamma=NotSpecified, beta=NotSpecified,
                gamma_init=NotSpecified, beta_init=NotSpecified,
                masked_time=NotSpecified, **kwargs):
     """
@@ -592,10 +590,6 @@ def __init__(self, use_shift=NotSpecified, use_std=NotSpecified, use_sample=NotS
     :param bool delay_sample_update:
     :param int param_version: 0 or 1
     :param float epsilon:
-    :param tf.Tensor sample_mean:
-    :param tf.Tensor sample_variance:
-    :param tf.Tensor gamma:
-    :param tf.Tensor beta:
     :param str|float gamma_init: see :func:`TFUtil.get_initializer`, for the scale
     :param str|float beta_init: see :func:`TFUtil.get_initializer`, for the mean
     :param bool masked_time: flatten and mask input tensor
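
As the commit message notes, these options come from a config as plain Python values, which is why tf.Tensor-typed arguments were unlikely to ever be used. A hedged sketch of how such a layer might be configured after this change; the values are illustrative, and "batch_norm" is assumed here to be the registered layer name of BatchNormLayer:

# Hypothetical RETURNN network config snippet, only for illustration.
# All batch-norm options are plain floats/bools, matching the remaining arguments above.
network = {
  "bn": {
    "class": "batch_norm", "from": "data",
    "momentum": 0.99, "epsilon": 1e-3,
    "use_shift": True, "use_std": True,
    "gamma_init": 1.0, "beta_init": 0.0,
    "masked_time": True},
}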

tests/test_TFNetworkLayer.py

Lines changed: 88 additions & 87 deletions
@@ -243,127 +243,128 @@ def test_batch_norm_vars():
 
 def test_batch_norm():
   with make_scope() as session:
-    import numpy as np
-    net = TFNetwork(extern_data=ExternData())
-    net.train_flag = True
+    net = TFNetwork(extern_data=ExternData(), train_flag=True)
     with tf_compat.v1.variable_scope("src_nchw"):
-      src_nhwc = InternalLayer(name="src_nchw", network=net, out_type={"dim": 16,
-                                                                       "shape": (None, 16, 16),
-                                                                       "batch_dim_axis": 0,
-                                                                       "time_dim_axis": 1,
-                                                                       "feature_dim_axis": 3,
-                                                                       "sparse": False
-                                                                       })
+      src_nhwc = InternalLayer(
+        name="src_nchw", network=net,
+        out_type={
+          "dim": 16,
+          "shape": (None, 16, 16),
+          "batch_dim_axis": 0,
+          "time_dim_axis": 1,
+          "feature_dim_axis": 3,
+          "sparse": False})
       src_nhwc.output.placeholder = tf_compat.v1.placeholder(shape=(None, None, 16, 16), dtype=tf.float32)
       src_nhwc.output.size_placeholder = {0: tf_compat.v1.placeholder(shape=(None,), dtype=tf.int32)}
 
-    rnd = np.random.RandomState(42)
-    mean = tf.constant(rnd.rand(1, 1, 1, 16), name="rand_mean", dtype=tf.float32)
-    variance = tf.constant(rnd.rand(1, 1, 1, 16), name="rand_var", dtype=tf.float32)
+    rnd = numpy.random.RandomState(42)
     input_data = rnd.rand(10, 11, 16, 16)
-    seq_lens = np.array([11, 11, 11, 11, 11, 11, 11, 11, 11, 11])
+    seq_lens = numpy.array([11] * 10)
 
     with tf_compat.v1.variable_scope("batch_norm_masked_nchw"):
-      batch_norm_1 = BatchNormLayer(name="batch_norm_masked_nchw", network=net, masked_time=True,
-                                    sample_mean=mean, sample_variance=variance,
-                                    sources=[src_nhwc],
-                                    output=BatchNormLayer.get_out_data_from_opts(name="batch_norm_masked_nchw",
-                                                                                 sources=[src_nhwc],
-                                                                                 network=net))
+      batch_norm_1 = BatchNormLayer(
+        name="batch_norm_masked_nchw", network=net, masked_time=True,
+        sources=[src_nhwc],
+        output=BatchNormLayer.get_out_data_from_opts(
+          name="batch_norm_masked_nchw",
+          sources=[src_nhwc],
+          network=net))
       batch_norm_1.post_init(layer_desc=None)
     with tf_compat.v1.variable_scope("batch_norm_nonmasked_nchw"):
-      batch_norm_2 = BatchNormLayer(name="batch_norm_nonmasked_nchw", network=net, masked_time=False,
-                                    sample_mean=mean, sample_variance=variance,
-                                    sources=[src_nhwc],
-                                    output=BatchNormLayer.get_out_data_from_opts(name="batch_norm_nonmasked_nchw",
-                                                                                 sources=[src_nhwc],
-                                                                                 network=net))
+      batch_norm_2 = BatchNormLayer(
+        name="batch_norm_nonmasked_nchw", network=net, masked_time=False,
+        sources=[src_nhwc],
+        output=BatchNormLayer.get_out_data_from_opts(
+          name="batch_norm_nonmasked_nchw",
+          sources=[src_nhwc],
+          network=net))
       batch_norm_2.post_init(layer_desc=None)
-    tf_compat.v1.global_variables_initializer().run()
-    out_1, seq_lens_1 = session.run([batch_norm_1.output.placeholder,
-                                     batch_norm_1.output.size_placeholder[0]],
-                                    feed_dict={src_nhwc.output.placeholder: input_data,
-                                               src_nhwc.output.size_placeholder[0]: seq_lens}
-                                    )
-    out_2, seq_lens_2 = session.run([batch_norm_2.output.placeholder,
-                                     batch_norm_2.output.size_placeholder[0]],
-                                    feed_dict={src_nhwc.output.placeholder: input_data,
-                                               src_nhwc.output.size_placeholder[0]: seq_lens}
-                                    )
-    assert np.array_equal(out_1, out_2)
-    print(np.sum(out_1 - out_2))
+    tf_compat.v1.global_variables_initializer().run(session=session)
+    out_1, seq_lens_1 = session.run(
+      [batch_norm_1.output.placeholder, batch_norm_1.output.size_placeholder[0]],
+      feed_dict={
+        src_nhwc.output.placeholder: input_data,
+        src_nhwc.output.size_placeholder[0]: seq_lens})
+    out_2, seq_lens_2 = session.run(
+      [batch_norm_2.output.placeholder, batch_norm_2.output.size_placeholder[0]],
+      feed_dict={
+        src_nhwc.output.placeholder: input_data,
+        src_nhwc.output.size_placeholder[0]: seq_lens})
+    assert numpy.array_equal(out_1, out_2)
+    print(numpy.sum(out_1 - out_2))
 
 
 def test_batch_norm_unequal_seq_len():
   with make_scope() as session:
-    import numpy as np
-    import numpy.testing as npt
-    net = TFNetwork(extern_data=ExternData())
-    net.train_flag = True
+    net = TFNetwork(extern_data=ExternData(), train_flag=True)
     with tf_compat.v1.variable_scope("src_nhwc"):
-      src_nhwc = InternalLayer(name="src_nhwc", network=net, out_type={"dim": 16,
-                                                                       "shape": (None, 16, 16),
-                                                                       "batch_dim_axis": 0,
-                                                                       "time_dim_axis": 1,
-                                                                       "feature_dim_axis": 3,
-                                                                       "sparse": False
-                                                                       })
+      src_nhwc = InternalLayer(
+        name="src_nhwc", network=net,
+        out_type={
+          "dim": 16,
+          "shape": (None, 16, 16),
+          "batch_dim_axis": 0,
+          "time_dim_axis": 1,
+          "feature_dim_axis": 3,
+          "sparse": False})
      src_nhwc.output.placeholder = tf_compat.v1.placeholder(shape=(None, None, 16, 16), dtype=tf.float32)
      src_nhwc.output.size_placeholder = {0: tf_compat.v1.placeholder(shape=(None,), dtype=tf.int32)}
 
-    rnd = np.random.RandomState(42)
-    mean = tf.constant(rnd.rand(1, 1, 1, 16), name="rand_mean", dtype=tf.float32)
-    variance = tf.constant(rnd.rand(1, 1, 1, 16), name="rand_var", dtype=tf.float32)
+    rnd = numpy.random.RandomState(42)
     input_data = rnd.rand(10, 11, 16, 16).astype('f')
     input_data[2, 5:, :, :] = 0
-    data_mean = np.mean(input_data, axis=(0, 1, 2), keepdims=True, dtype=np.float32)
-    data_var = np.var(input_data, axis=(0, 1, 2), keepdims=True, dtype=np.float32)
-    input_data_masked = np.copy(input_data)
-    seq_lens = np.array([11, 11, 5, 11, 11, 11, 11, 11, 11, 11], dtype=np.float32)
+    input_data_masked = numpy.copy(input_data)
+    seq_lens = numpy.array([11, 11, 5, 11, 11, 11, 11, 11, 11, 11], dtype=numpy.float32)
     n1 = 9 * 11 * 16 + 5 * 16
     n2 = 10 * 11 * 16
 
     with tf_compat.v1.variable_scope("batch_norm_masked_nchw"):
-      batch_norm_1 = BatchNormLayer(name="batch_norm_masked_nchw", network=net, masked_time=True,
-                                    sample_mean=mean, sample_variance=variance,
-                                    use_shift=False, use_std=False, epsilon=0.0,
-                                    sources=[src_nhwc],
-                                    output=BatchNormLayer.get_out_data_from_opts(name="batch_norm_masked_nchw",
-                                                                                 sources=[src_nhwc],
-                                                                                 network=net))
+      batch_norm_1 = BatchNormLayer(
+        name="batch_norm_masked_nchw", network=net, masked_time=True,
+        use_shift=False, use_std=False, epsilon=0.0,
+        sources=[src_nhwc],
+        output=BatchNormLayer.get_out_data_from_opts(
+          name="batch_norm_masked_nchw",
+          sources=[src_nhwc],
+          network=net))
       batch_norm_1.post_init(layer_desc=None)
     with tf_compat.v1.variable_scope("batch_norm_nonmasked_nchw"):
-      batch_norm_2 = BatchNormLayer(name="batch_norm_nonmasked_nchw", network=net, masked_time=False,
-                                    sample_mean=mean, sample_variance=variance,
-                                    use_shift=False, use_std=False, epsilon=0,
-                                    sources=[src_nhwc],
-                                    output=BatchNormLayer.get_out_data_from_opts(name="batch_norm_nonmasked_nchw",
-                                                                                 sources=[src_nhwc],
-                                                                                 network=net))
+      batch_norm_2 = BatchNormLayer(
+        name="batch_norm_nonmasked_nchw", network=net, masked_time=False,
+        use_shift=False, use_std=False, epsilon=0,
+        sources=[src_nhwc],
+        output=BatchNormLayer.get_out_data_from_opts(
+          name="batch_norm_nonmasked_nchw",
+          sources=[src_nhwc],
+          network=net))
       batch_norm_2.post_init(layer_desc=None)
-    tf_compat.v1.global_variables_initializer().run()
-    out_1, seq_lens_1 = session.run([batch_norm_1.output.placeholder,
-                                     batch_norm_1.output.size_placeholder[0]],
-                                    feed_dict={src_nhwc.output.placeholder: input_data,
-                                               src_nhwc.output.size_placeholder[0]: seq_lens}
-                                    )
-    out_2, seq_lens_2 = session.run([batch_norm_2.output.placeholder,
-                                     batch_norm_2.output.size_placeholder[0]],
-                                    feed_dict={src_nhwc.output.placeholder: input_data_masked,
-                                               src_nhwc.output.size_placeholder[0]: seq_lens}
-                                    )
+    tf_compat.v1.global_variables_initializer().run(session=session)
+    out_1, seq_lens_1 = session.run(
+      [batch_norm_1.output.placeholder, batch_norm_1.output.size_placeholder[0]],
+      feed_dict={
+        src_nhwc.output.placeholder: input_data,
+        src_nhwc.output.size_placeholder[0]: seq_lens})
+    out_2, seq_lens_2 = session.run(
+      [batch_norm_2.output.placeholder, batch_norm_2.output.size_placeholder[0]],
+      feed_dict={
+        src_nhwc.output.placeholder: input_data_masked,
+        src_nhwc.output.size_placeholder[0]: seq_lens})
+
     # Manually calculating batch_norm and compare to the tf output
-    np_bn2 = (input_data - data_mean) * (1.0 / np.sqrt(data_var))
-    npt.assert_array_almost_equal(np_bn2, out_2, decimal=5)
+    data_mean = numpy.mean(input_data, axis=(0, 1, 2), keepdims=True, dtype=numpy.float32)
+    data_var = numpy.var(input_data, axis=(0, 1, 2), keepdims=True, dtype=numpy.float32)
+    np_bn2 = (input_data - data_mean) * (1.0 / numpy.sqrt(data_var))
+    numpy.testing.assert_array_almost_equal(np_bn2, out_2, decimal=5)
     # Manually calculating batch_norm with different seq_lens, having:
     # Mean_1 = n2 / n1 * Mean_2
     # Var_1 = n2 / n1 * (Var_2 + Mean_2 ^ 2 (1 - n2 / n1))
     # bn_1 = (x - Mean_1) * 1 / sqrt(Var_1)
     # Substituting Mean_1 and Var_1:
-    np_bn1 = (input_data - n2 / n1 * data_mean) * \
-             (1.0 / np.sqrt(n2 / n1 * (data_var + data_mean ** 2 * (1 - n2 / n1))))
+    np_bn1 = (
+      (input_data - n2 / n1 * data_mean) *
+      (1.0 / numpy.sqrt(n2 / n1 * (data_var + data_mean ** 2 * (1 - n2 / n1)))))
     # Check with tf output.
-    npt.assert_array_almost_equal(np_bn1, out_1, decimal=5)
+    numpy.testing.assert_array_almost_equal(np_bn1, out_1, decimal=5)
 
 
 def test_activation_layer_net_construct():
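
The identities in the test comments above (Mean_1 = n2 / n1 * Mean_2 and Var_1 = n2 / n1 * (Var_2 + Mean_2 ** 2 * (1 - n2 / n1))) hold because the padded positions are zero: they contribute to the sums over all n2 positions but not to the sums over the n1 valid ones. A small standalone NumPy check of the identity on synthetic data (the shapes and mask here are arbitrary and unrelated to the test):

import numpy

# Verify the masked-vs-unmasked statistics identity from the test comments
# on synthetic 2D data with a few zeroed ("padded") positions.
rnd = numpy.random.RandomState(0)
x = rnd.rand(10, 11)
mask = numpy.ones_like(x, dtype=bool)
mask[2, 5:] = False          # pretend the tail of one sequence is padding
x[~mask] = 0.0               # padded positions are zero, as in the masked test input
n2 = x.size                  # all positions, padding included
n1 = int(mask.sum())         # valid positions only

mean_2 = x.mean()            # statistics over all n2 positions (zeros included)
var_2 = x.var()
mean_1 = x[mask].mean()      # statistics over the n1 valid positions
var_1 = x[mask].var()

numpy.testing.assert_allclose(mean_1, n2 / n1 * mean_2)
numpy.testing.assert_allclose(var_1, n2 / n1 * (var_2 + mean_2 ** 2 * (1 - n2 / n1)))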
