
Commit f38f5ae

Recover relaxed behavior, strict with new behavior version (#1144)
Adds a check that the time dim of the optimized-out ``output`` sub-layer of a RecLayer matches the RecLayer's own time dim. Fixes #1140. This introduces a new behavior version 13 (#508).
1 parent 64cb7fa commit f38f5ae
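Configs opt into the stricter check via the global behavior version setting documented below. A minimal sketch of a RETURNN config (a Python file) that requests the new behavior:

    # RETURNN config: request the new, stricter behavior.
    behavior_version = 13  # >= 13 makes the RecLayer time-dim check a hard error

    # Configs staying below 13 keep the relaxed behavior this commit recovers:
    # the time-dim mismatch is tolerated as before.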

5 files changed: +159 -29


docs/configuration_reference/behavior_version.rst

Lines changed: 18 additions & 0 deletions

@@ -22,6 +22,24 @@ and not listing legacy/deprecated parameters.
 Version History
 ---------------
 
+Behavior version 13 (2022-10-13)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This enables some extra checks in the :class:`RecLayer` which break some old configs.
+Those old configs were actually broken,
+but the broken parts did not play a role for the training,
+and thus it did not matter.
+However, we do not want to allow such broken configs anymore.
+More specifically, an optimized-out ``output`` sub-layer of a :class:`RecLayer`
+must have the same time dim as the :class:`RecLayer` itself.
+For some specific transducer configs, we have this problem
+(`example <https://github.com/rwth-i6/returnn-experiments/blob/264d13aef3321d48f685cc9750fd277fb70cc74e/2020-rnn-transducer/configs/rna-tf2.blank0.enc6l-grow2l.scratch-lm.rdrop02.lm1-1024.attwb5-drop02.l2_1e_4.mlr50.config#L778>`__).
+
+This behavior version might also require
+that the dim tags of ``extern_data`` are properly defined.
+
+See issue `#1140 <https://github.com/rwth-i6/returnn/issues/1140>`__.
+
 Behavior version 12 (2022-01-06)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
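The note about properly defined ``extern_data`` dim tags refers to the explicit dim-tag style that the updated tests below also switch to. A minimal sketch, mirroring those tests:

    from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim

    time_dim = SpatialDim("time")       # dynamic time dim, shared by input and targets
    in_dim = FeatureDim("feat", 3)      # static input feature dim
    out_dim = FeatureDim("classes", 5)  # static sparse target dim

    extern_data = {
      # dense input, shape [batch, time, feature]
      "data": {"dim_tags": [batch_dim, time_dim, in_dim]},
      # sparse targets on the same time dim, shape [batch, time]
      "classes": {"dim_tags": [batch_dim, time_dim], "sparse_dim": out_dim},
    }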

returnn/tf/layers/rec.py

Lines changed: 24 additions & 13 deletions

@@ -2325,6 +2325,7 @@ def get_output(self):
     :rtype: tf.Tensor
     """
     from returnn.tf.util.basic import check_input_dim, tensor_array_stack, Dim, get_valid_scope_name_from_str
+    from returnn.util.basic import BehaviorVersion
     assert self.parent_rec_layer
     rec_layer = self.parent_rec_layer

@@ -3067,6 +3068,14 @@ def cond(i, net_vars, acc_tas, seq_len_info=None, allow_inf_max_len=False):
       layer = self.net.layers[name]
       assert layer.search_choices
 
+    for key in (
+        self.net.used_data_keys |
+        (self.input_layers_net.used_data_keys if self.input_layers_net else set()) |
+        (self.output_layers_net.used_data_keys if self.output_layers_net else set())):
+      if key == "source":
+        continue
+      self.parent_net.used_data_keys.add(key)
+
     with tf.name_scope("output"):
       output_layer = None
       if self.input_layers_net and "output" in self.input_layers_net.layers:

@@ -3076,7 +3085,19 @@ def cond(i, net_vars, acc_tas, seq_len_info=None, allow_inf_max_len=False):
       if output_layer:
         assert isinstance(output_layer, LayerBase)
         output_data = output_layer.output.copy_as_time_major()
-        self.time_dim_tag.declare_same_as(output_data.get_time_dim_tag())
+        if not self.time_dim_tag.is_dim_known():
+          self.time_dim_tag.declare_same_as(output_data.get_time_dim_tag())
+        elif self.time_dim_tag not in output_data.dim_tags:
+          # We allow this for older behavior version to not break some older setups.
+          # https://github.com/rwth-i6/returnn/issues/1140
+          BehaviorVersion.require(
+            False,
+            "%s: time-dim-tag mismatch: self %r vs sub-output-layer %r time-dim-tag %r" % (
+              rec_layer, self.time_dim_tag, output_data, output_data.get_time_dim_tag()), version=13)
+          # No further checks, it would fail anyway.
+          # Replace the actual rec layer output and return.
+          rec_layer.output = output_data
+          return output_data.placeholder
         assert len(rec_layer.output.dim_tags) == len(output_data.dim_tags)
         for tag1, tag2 in zip(rec_layer.output.dim_tags, output_data.dim_tags):
           try:

@@ -3090,24 +3111,14 @@ def cond(i, net_vars, acc_tas, seq_len_info=None, allow_inf_max_len=False):
           # and then created once for the template layer, and again for the real layer.
           # Make sure they are really the same such that we get all information like dyn sizes.
           tag1.declare_same_as(tag2)
-        output = output_data.placeholder
+        return output_data.placeholder
       else:
         assert seq_len is not None
         rec_layer.output.size_placeholder[0] = seq_len
         assert not self.net.layers["output"].get_search_choices()
-        output = tensor_array_stack(
+        return tensor_array_stack(
           self.final_acc_tas_dict["output_output"], stop=max_seq_len, name="output_stack")  # e.g. (time, batch, dim)
 
-    for key in (
-        self.net.used_data_keys |
-        (self.input_layers_net.used_data_keys if self.input_layers_net else set()) |
-        (self.output_layers_net.used_data_keys if self.output_layers_net else set())):
-      if key == "source":
-        continue
-      self.parent_net.used_data_keys.add(key)
-
-    return output
-
   def _get_search_choice_seq(self, search_choices):
     """
     :param SearchChoices search_choices:
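The key mechanism here is ``BehaviorVersion.require(False, msg, version=13)``: the failed check only raises for configs on behavior version 13 or newer, while older configs fall through to the relaxed path that replaces ``rec_layer.output``. A simplified sketch of those semantics (not the actual implementation in returnn/util/basic.py, which e.g. also handles logging):

    class BehaviorVersionSketch:
      """Simplified stand-in for returnn.util.basic.BehaviorVersion."""

      _behavior_version = 0  # in the real class, set once from the config

      class RequirementNotSatisfied(Exception):
        """A check failed under a sufficiently new behavior version."""

      @classmethod
      def require(cls, condition, message, version):
        """Enforce `condition` for configs with behavior_version >= `version`."""
        if condition:
          return
        if cls._behavior_version >= version:
          raise cls.RequirementNotSatisfied(message)
        # Older behavior version: tolerated for backward compatibility.

    # Tolerated under behavior version 0; raises once the config sets >= 13:
    BehaviorVersionSketch.require(False, "time-dim-tag mismatch: ...", version=13)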

returnn/util/basic.py

Lines changed: 1 addition & 1 deletion

@@ -238,7 +238,7 @@ class BehaviorVersion:
   The version will be set after the config is defined at __main__.init_config() or Engine.__init__()
   """
 
-  _latest_behavior_version = 12
+  _latest_behavior_version = 13
   _behavior_version = None  # type: typing.Optional[int]
 
   @classmethod
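This bump makes 13 the highest version a config may request; the ``BehaviorVersion.require(..., version=13)`` call added in rec.py above only turns into a hard error for configs that set ``behavior_version`` to 13 or higher.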

tests/test_TFNetworkLayer.py

Lines changed: 14 additions & 10 deletions

@@ -5871,14 +5871,16 @@ def test_SliceNdLayer_dyn_size():
 
 def test_SliceNdLayer_multidimensional_start():
   with make_scope() as session:
-    n_out = 5
+    from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
+    out_dim = FeatureDim("feat", 3)
+    time_dim = SpatialDim("time")
     n_batch = 3
     max_seq_len = 10
     config = Config({
       "debug_print_layer_output_template": True,
       "extern_data": {
-        "data": {"dim": n_out},
-        "classes": {"dim": n_out, "sparse": True}
+        "data": {"dim_tags": [batch_dim, time_dim, out_dim]},
+        "classes": {"dim_tags": [batch_dim, time_dim], "sparse_dim": out_dim}
       }})
     net = TFNetwork(config=config, train_flag=True)
     net.construct_from_dict({

@@ -5902,7 +5904,7 @@ def test_SliceNdLayer_multidimensional_start():
     input_data = feed[net.extern_data.data["data"].placeholder]
     max_size = numpy.amax(seq_lens[:, None] - starts)
     max_size = max(max_size, 0)
-    assert segments.shape == (n_batch, max_seq_len, max_size, n_out)
+    assert segments.shape == (n_batch, max_seq_len, max_size, out_dim.dimension)
     for b in range(n_batch):
       for t in range(max_seq_len):
         s = starts[b, t]

@@ -5911,22 +5913,24 @@ def test_SliceNdLayer_multidimensional_start():
         orig_seq = numpy.pad(orig_seq, [(0, max_size - len(orig_seq)), (0, 0)], "constant")
       elif len(orig_seq) > max_size:
         orig_seq = orig_seq[:max_size]
-      assert orig_seq.shape == (max_size, n_out)
+      assert orig_seq.shape == (max_size, out_dim.dimension)
       orig_seq = numpy.where((numpy.arange(s, s + max_size) >= seq_lens[b])[:, None], 0.0, orig_seq)
       for t2 in range(max_size):
         numpy.testing.assert_equal(orig_seq[t2], segments[b, t, t2])
 
 
 def test_SliceNdLayer_multidimensional_size():
   with make_scope() as session:
-    n_out = 5
+    from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
+    out_dim = FeatureDim("feat", 3)
+    time_dim = SpatialDim("time")
     n_batch = 3
     max_seq_len = 10
     config = Config({
       "debug_print_layer_output_template": True,
       "extern_data": {
-        "data": {"dim": n_out},
-        "classes": {"dim": n_out, "sparse": True}
+        "data": {"dim_tags": [batch_dim, time_dim, out_dim]},
+        "classes": {"dim_tags": [batch_dim, time_dim], "sparse_dim": out_dim}
       }})
     net = TFNetwork(config=config, train_flag=True)
     net.construct_from_dict({

@@ -5954,7 +5958,7 @@ def test_SliceNdLayer_multidimensional_size():
     input_data = feed[net.extern_data.data["data"].placeholder]
     max_size = numpy.amax(sizes)
     max_size = max(max_size, 0)
-    assert segments.shape == (n_batch, max_seq_len, max_size, n_out)
+    assert segments.shape == (n_batch, max_seq_len, max_size, out_dim.dimension)
     for b in range(n_batch):
       for t in range(max_seq_len):
         s = starts[b, t]

@@ -5965,7 +5969,7 @@ def test_SliceNdLayer_multidimensional_size():
         orig_seq = numpy.pad(orig_seq, [(0, max_size - len(orig_seq)), (0, 0)], "constant")
       elif len(orig_seq) > max_size:
         orig_seq = orig_seq[:max_size]
-      assert orig_seq.shape == (max_size, n_out)
+      assert orig_seq.shape == (max_size, out_dim.dimension)
       orig_seq = numpy.where((numpy.arange(s, s + max_size) >= seq_lens[b])[:, None], 0.0, orig_seq)
       for t2 in range(max_size):
         numpy.testing.assert_equal(orig_seq[t2], segments[b, t, t2])
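The reworked assertions read the expected feature size back from the dim tag instead of a separate ``n_out`` variable. For illustration, the ``Dim`` accessor involved (a tiny sketch with the same sizes as the test):

    from returnn.tf.util.data import FeatureDim, SpatialDim

    out_dim = FeatureDim("feat", 3)
    time_dim = SpatialDim("time")

    assert out_dim.dimension == 3      # static size of a feature dim tag
    assert time_dim.dimension is None  # dynamic dims carry no static size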

tests/test_TFNetworkRecLayer.py

Lines changed: 102 additions & 5 deletions

@@ -600,11 +600,17 @@ def _enc_func(source, **_):
 
 
 def test_rec_subnet_with_choice():
+  from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
+  in_dim = FeatureDim("feat", 3)
+  out_dim = FeatureDim("classes", 4)
+  time_dim = SpatialDim("time")
   with tf_compat.v1.Session():
     config = Config()
     config.update({
-      "num_outputs": 3,
-      "num_inputs": 4,
+      "extern_data": {
+        "data": {"dim_tags": [batch_dim, time_dim, in_dim]},
+        "classes": {"dim_tags": [batch_dim, time_dim], "sparse_dim": out_dim}
+      },
       "network": {
         "output": {"class": "rec", "from": "data:data", "target": "classes", "unit": {
           "prob": {"class": "softmax", "from": ["prev:output"], "loss": "ce", "target": "classes"},

@@ -1220,12 +1226,16 @@ def test_rec_RecStepInfoLayer_broadcast_moved_out():
       },
     }
   }
+  from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
+  in_dim = FeatureDim("feat", 3)
+  out_dim = FeatureDim("classes", 5)
+  time_dim = SpatialDim("time")
   config = Config({
     "debug_print_layer_output_template": True,
     "extern_data": {
-      "data": {"dim": 3},
-      "classes": {"sparse": True, "dim": 5},
-    }
+      "data": {"dim_tags": [batch_dim, time_dim, in_dim]},
+      "classes": {"dim_tags": [batch_dim, time_dim], "sparse_dim": out_dim}
+    },
   })
   from test_TFNetworkLayer import make_feed_dict
   with make_scope() as session:

@@ -6158,6 +6168,93 @@ def test_reclayer_shape_from_initial():
     session.run(out.placeholder, feed_dict=make_feed_dict(net.extern_data))
 
 
+def test_reclayer_time_sync_target_diff():
+  # https://github.com/rwth-i6/returnn/issues/1140
+  from returnn.util.basic import BehaviorVersion
+  from returnn.tf.util.data import batch_dim, SpatialDim, FeatureDim
+  from returnn.tf.layers.rec import _SubnetworkRecCell
+  src_dim = FeatureDim("src-feat", 5)
+  tgt_dim = FeatureDim("tgt-classes", 7)
+  tgt_with_blank_dim = tgt_dim + 1
+  src_time_dim = SpatialDim("src-time")
+  tgt_time_dim = SpatialDim("out-spatial")
+
+  config = Config({
+    "extern_data": {
+      "data": {"dim_tags": [batch_dim, src_time_dim, src_dim]},
+      "classes": {"dim_tags": [batch_dim, tgt_time_dim], "sparse_dim": tgt_dim, "available_for_inference": False},
+      "align_classes": {
+        "dim_tags": [batch_dim, src_time_dim], "sparse_dim": tgt_with_blank_dim, "available_for_inference": False},
+    },
+    "network": {
+      "encoder": {"class": "linear", "activation": "tanh", "n_out": 5, "from": "data:data"},
+
+      "output": {"class": "rec", "from": "encoder", "unit": {
+        "output_prob": {"class": "softmax", "from": "data:source", "out_dim": tgt_with_blank_dim},
+
+        # Note: This is actually not correct to have 'classes' here.
+        # In practice, in search, it would use output_prob and then have actually one more label.
+        # classes also has the wrong spatial dim, which actually causes the error.
+        # However, then this output is actually never used.
+        # We had such training configs for transducer, and we want to make sure that they still work.
+        # In that case, in search, the config switched to a different target, so that is why it worked.
+        'output': {'class': 'choice', 'target': 'classes', 'beam_size': 12, 'from': "output_prob",
+                   "initial_output": 0},
+
+        # Would also look different for recognition.
+        "classes_embed": {"class": "linear", "activation": "tanh", "n_out": 5, "from": "base:data:classes"},
+        "joint": {
+          "class": "combine", "from": ["output_prob", "classes_embed"], "kind": "mul",
+          "allow_broadcast_all_sources": True},
+
+        # Dummy loss. In transducer, this would be the full-sum after joint network.
+        # Here we just need sth to trigger the dependencies.
+        "loss": {
+          "class": "eval", "from": "joint",
+          "eval": "tf.reduce_mean(source(0,auto_convert=False))",
+          "out_type": {"shape": (), "dtype": "float32", "batch_dim_axis": None, "time_dim_axis": None},
+          "loss": "as_is"
+        },
+
+      }, "target": "classes"},
+    }})
+
+  print("Constructing train network (old behavior).")
+  with make_scope() as session:
+    net = TFNetwork(train_flag=True, config=config)
+    orig_behavior_version = BehaviorVersion._behavior_version
+    try:
+      BehaviorVersion._behavior_version = 0
+      # The net dict requires an older behavior version. This is important for the test.
+      # We want to make sure such old config still works.
+      net.construct_from_dict(config.typed_value("network"))
+    finally:
+      BehaviorVersion._behavior_version = orig_behavior_version
+    # Check whether we triggered the dim tag bug.
+    assert src_time_dim != tgt_time_dim
+    net.initialize_params(session)
+    rec_layer = net.get_layer("output")
+    assert isinstance(rec_layer, RecLayer)
+    cell = rec_layer.cell
+    assert isinstance(cell, _SubnetworkRecCell)
+    assert_equal(cell.layers_in_loop, [])
+    loss = net.get_total_loss()
+    from test_TFNetworkLayer import make_feed_dict
+    loss_v = session.run(loss, feed_dict=make_feed_dict(net.extern_data))
+    print("Loss:", loss_v)
+
+  print("Constructing train network (new behavior).")
+  with make_scope():
+    net = TFNetwork(train_flag=True, config=config)
+    try:
+      net.construct_from_dict(config.typed_value("network"))
+    except BehaviorVersion.RequirementNotSatisfied as exc:
+      assert "time-dim-tag mismatch" in str(exc)
+      print("Got expected exception:", exc)
+    else:
+      raise Exception("did not get expected exception")
+
+
 def test_convert_lstm_params_save_load():
   """
   Test conversions from different units to different units.
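In this new test, the RecLayer runs over the encoder's ``src-time`` dim, while its ``output`` choice sub-layer targets ``classes`` on the separate ``out-spatial`` dim. Once all sub-layers are optimized out of the loop (hence ``layers_in_loop == []``), the ``output`` sub-layer's time dim no longer matches the RecLayer's own, which is exactly the mismatch that behavior version 13 turns into the ``RequirementNotSatisfied`` error asserted at the end.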
