
Commit 18c05e9

MergeDimsLayer, only allow keep_order=True (#784)
Fix #654. Also introduce the Data.get_axes_from_description "dim:%i" variant. This is mostly a simple way to fix many of the test cases, but it could also be useful for users in general.
1 parent 15fed20 commit 18c05e9

7 files changed: +113 additions, -78 deletions

docs/configuration_reference/behavior_version.rst

+10
@@ -22,6 +22,16 @@ and not listing legacy/deprecated parameters.
 Version History
 ---------------
 
+Behavior version 6 (2021-11-27)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:class:`MergeDimsLayer` uses ``keep_order=True`` and does not allow ``keep_order=False``.
+There should never be a reason to use ``keep_order=False`` anyway.
+If you have it set explicitly, just remove it.
+If that causes any problems, there is probably some other issue in your config.
+
+See issue `#654 <https://github.com/rwth-i6/returnn/issues/654>`__.
+
 Behavior version 5 (2021-11-26)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
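
For configs hit by this change, the migration is typically just dropping the explicit keep_order=False and listing the merged axes explicitly, so their order is well defined. A minimal sketch (the layer names "att"/"att0" and the dim sizes 2 and 5 are only an assumed example, not from this commit):

# Before (behavior version <= 5), relying on implicit axis sorting:
#   "att": {"class": "merge_dims", "axes": "except_batch", "keep_order": False, "from": "att0"},
# Since behavior version 6, keep_order=True is implied; give the axes as an explicit list:
"att": {"class": "merge_dims", "axes": ["dim:2", "dim:5"], "from": "att0"},  # merges the size-2 and size-5 axes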

returnn/tf/layers/basic.py

+18-8
@@ -2848,26 +2848,34 @@ class MergeDimsLayer(_ConcatInputLayer):
   """
   layer_class = "merge_dims"
 
-  def __init__(self, axes, keep_order=False, n_out=None, **kwargs):
+  def __init__(self, axes, keep_order=NotSpecified, n_out=None, **kwargs):
     """
-    :param str|list[str]|list[int] axes: see Data.get_axes_from_description(), e.g. "except_time"
-    :param bool keep_order: By default (for historical reasons), the axes are sorted, and then merged.
+    :param str|list[DimensionTag|str] axes: see :func:`Data.get_axis_from_description`
+    :param bool|NotSpecified keep_order: The old default was: the axes are sorted, and then merged.
       Thus, the order of incoming axes will influence the result.
       E.g. inputs [B,S,F] and [B,F,S], with ``axes=["S","F"]``, will get different results,
       although the output shape is [B,S*F] in both cases.
       This is bad: In general, other layers in RETURNN might reorder the axes for various reasons,
       and all layers should behave in the same way, no matter the order.
       It is recommended to set ``keep_order=True``, such that the order defined in ``axes`` defines the behavior,
       and not the incoming axis order.
+      Since behavior version 6, this is already the case.
     :param int|None n_out:
     """
+    from returnn.util import BehaviorVersion
     super(MergeDimsLayer, self).__init__(**kwargs)
+    if keep_order is NotSpecified:
+      keep_order = True if BehaviorVersion.get() >= 6 else False
+    BehaviorVersion.require(
+      condition=keep_order, message="MergeDimsLayer, only keep_order=True is allowed", version=6)
     if keep_order:
-      assert isinstance(axes, (tuple, list)), "%s: unique axes %r required" % (self, axes)
+      assert isinstance(axes, (tuple, list)), (
+        "%s: axes %r must be a list or tuple, to have a well defined order in input %s" % (self, axes, self.input_data))
       axes_ = []
       for axis in axes:
         axis_ = self.input_data.get_axes_from_description(axis, allow_int=False)
-        assert len(axis_) <= 1, "%s: unique axes %r required, but got %r -> %r" % (self, axes, axis, axis_)
+        assert len(axis_) <= 1, (
+          "%s: unique axes %r required in input %s, but got %r -> %r" % (self, axes, self.input_data, axis, axis_))
         axes_ += axis_
       axes = axes_
     else:
@@ -2981,18 +2989,20 @@ def _set_output_sizes(self, merge_axes):
         target_tag.dyn_size_ext = out_size
 
   @classmethod
-  def get_out_data_from_opts(cls, name, axes, keep_order=False,
+  def get_out_data_from_opts(cls, name, axes, keep_order=NotSpecified,
                              sources=(), n_out=NotSpecified, out_type=None, **kwargs):
     """
     :param str name:
     :param str|list[str] axes:
-    :param bool keep_order:
+    :param bool|NotSpecified keep_order:
     :param list[LayerBase] sources:
     :param int|None|NotSpecified n_out:
     :param None|dict[str] out_type:
    :rtype: Data
     """
-    from ..util.data import DimensionTag
+    from returnn.util import BehaviorVersion
+    if keep_order is NotSpecified:
+      keep_order = True if BehaviorVersion.get() >= 6 else False
     assert not out_type, "currently ignored"
     input_data = get_concat_sources_data_template(sources)
     data = input_data.copy(name="%s_output" % name)
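
As a rough sketch of what the new default handling amounts to (a simplified stand-alone helper; resolve_keep_order is hypothetical and not part of this commit, it just mirrors the logic added in __init__ and get_out_data_from_opts):

from returnn.util import BehaviorVersion
from returnn.util.basic import NotSpecified


def resolve_keep_order(keep_order=NotSpecified):
  """Mirror of how MergeDimsLayer resolves its keep_order default in this commit."""
  if keep_order is NotSpecified:
    # Default follows the behavior version: True from version 6 on, False before.
    keep_order = BehaviorVersion.get() >= 6
  # With behavior version >= 6, keep_order=False raises an error.
  BehaviorVersion.require(
    condition=keep_order, message="MergeDimsLayer, only keep_order=True is allowed", version=6)
  return keep_order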

returnn/tf/util/data.py

+5
@@ -3530,6 +3530,11 @@ def get_axes_from_description(self, axes, allow_int=NotSpecified):
           s += len(static_axes)
         assert 0 <= s < len(static_axes), "%s get_axes_from_description: %r invalid" % (self, axes)
         return [static_axes[s]]
+      elif re.match("(dim):\\d+$", axes):
+        s = int(axes.split(":")[1])
+        dims = [a for a in range(self.batch_ndim) if self.batch_shape[a] == s]
+        assert dims, "%s get_axes_from_description: no dim %i found" % (self, s)
+        return dims
       elif axes in ["f", "feature", "non_spatial"]:
         return self.get_feature_batch_axes()
       elif all([a in "btf" for a in axes]):
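
A small usage sketch of the new "dim:%i" variant (the Data construction here is just an assumed example): it returns every axis whose static dimension matches the given integer, so it is only unambiguous if that size occurs exactly once.

from returnn.tf.util.data import Data

data = Data(name="example", shape=(None, 4, 3))  # batch shape [B, T, 4, 3]
assert data.get_axes_from_description("dim:4") == [2]  # the static axis of size 4
assert data.get_axes_from_description("dim:3") == [3]  # the static axis of size 3 (the feature dim)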

returnn/util/basic.py

+1-1
@@ -209,7 +209,7 @@ class BehaviorVersion:
   The version will be set after the config is defined at __main__.init_config() or Engine.__init__()
   """
 
-  _latest_behavior_version = 5
+  _latest_behavior_version = 6
   _behavior_version = None  # type: typing.Optional[int]
 
   @classmethod
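
Since this bumps the latest supported behavior version, configs opt in to the new checks explicitly; a minimal sketch of the relevant RETURNN config line (everything else omitted):

# in the RETURNN config
behavior_version = 6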

tests/test_TFEngine.py

+17-17
@@ -1233,7 +1233,7 @@ def test_attention_no_encoder_dependency():
           'n_out': 4, 'padding': 'same'},
         "location_feedback": {'class': 'linear', 'from': ['convolved_att'], 'n_out': 6, 'activation': None},
         "att_energy_in": {'class': 'combine', 'kind': 'add', 'from': ['location_feedback', 's_transformed']},
-        "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights"},
+        "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights", "auto_squeeze": True},
       },
     },
     "decision": {"class": "decide", "from": ["output"], "loss": "edit_distance"}
@@ -1345,7 +1345,7 @@ def test_attention_convolutional_feedback_variant1():
     "location_feedback": {'class': 'linear', 'from': ['convolved_att'], 'n_out': 6, 'activation': None},
     "att_energy_in": {'class': 'combine', 'kind': 'add', 'from': [
       'base:enc_transformed', 'location_feedback', 's_transformed']},
-    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights"},
+    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights", "auto_squeeze": True},
   }
 
   check_attention_variant(recurrent_unit_dict)
@@ -1373,7 +1373,7 @@ def test_attention_convolutional_feedback_variant2():
     "location_feedback": {'class': 'linear', 'from': ['convolved_att'], 'n_out': 6, 'activation': None},
     "att_energy_in": {'class': 'combine', 'kind': 'add', 'from': [
       'base:enc_transformed', 'location_feedback', 's_transformed']},
-    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights"},
+    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights", "auto_squeeze": True},
   }
 
   check_attention_variant(recurrent_unit_dict)
@@ -1412,7 +1412,7 @@ def test_attention_convolutional_feedback_variant3():
     "location_feedback": {'class': 'linear', 'from': ['convolved_att'], 'n_out': 6, 'activation': None},
     "att_energy_in": {'class': 'combine', 'kind': 'add', 'from': [
       'base:enc_transformed', 'location_feedback', 's_transformed']},
-    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights"},
+    "c": {"class": "generic_attention", "base": "base:encoder", "weights": "att_weights", "auto_squeeze": True},
   }
 
   check_attention_variant(recurrent_unit_dict)
@@ -2135,7 +2135,7 @@ def test_rec_subnet_construct_1():
       "accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"],
                             "eval": "source(0) + source(1) * source(2) * 0.5",
                             "out_type": {"dim": 1, "shape": (None, 1)}},
-      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder"},
+      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder", "auto_squeeze": True},
       "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["target_embed", "att"], "n_out": 10},
       "s2": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["s"], "n_out": 10},
       "readout_in": {"class": "linear", "from": ["prev:s2", "prev:target_embed", "att"], "activation": None, "n_out": 10},
@@ -2192,7 +2192,7 @@ def test_rec_subnet_construct_2():
      "accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"],
                             "eval": "source(0) + source(1) * source(2) * 0.5",
                             "out_type": {"dim": 1, "shape": (None, 1)}},
-      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder"},
+      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder", "auto_squeeze": True},
       "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["target_embed", "att"], "n_out": 10},
       "s2": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["s"], "n_out": 10},
       "readout_in": {"class": "linear", "from": ["prev:s2", "prev:target_embed", "att"], "activation": None, "n_out": 10},
@@ -2255,7 +2255,7 @@ def test_rec_subnet_construct_3():
      "accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"],
                             "eval": "source(0) + source(1) * source(2) * 0.5",
                             "out_type": {"dim": 1, "shape": (None, 1)}},
-      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder"},
+      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder", "auto_squeeze": True},
       "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["target_embed", "att"], "n_out": 10},
       "s2": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["prev:s", "prev:target_embed", "att"], "n_out": 10},
       "readout_in": {"class": "linear", "from": ["s2"], "activation": None, "n_out": 10},
@@ -2288,9 +2288,9 @@ def test_rec_subnet_eval_init_out_apply0():
   # (also defined by num_inputs & num_outputs)
   beam_size = 3
   AttNumHeads = 2
-  EncKeyTotalDim = AttNumHeads * 2
+  EncKeyTotalDim = AttNumHeads * 5
   EncKeyPerHeadDim = EncKeyTotalDim // AttNumHeads
-  EncValueTotalDim = AttNumHeads * 2
+  EncValueTotalDim = AttNumHeads * 5
   EncValuePerHeadDim = EncValueTotalDim // AttNumHeads
   network = {
     "lstm0_fw": {"class": "rec", "unit": "nativelstm2", "n_out": 2, "direction": 1, "from": "data:data"},
@@ -2334,7 +2334,7 @@ def test_rec_subnet_eval_init_out_apply0():
                             "eval": "source(0) + source(1) * source(2) * 0.5",
                             "out_type": {"dim": 1, "shape": (None, 1)}, "initial_output": "apply(0)"},  # (B, enc-T, 1)
      "att0": {"class": "generic_attention", "weights": "att_weights", "base": "base:enc_value"},  # (B, H, V)
-      "att": {"class": "merge_dims", "axes": "except_batch", "from": ["att0"]},  # (B, H*V)
+      "att": {"class": "merge_dims", "axes": ["dim:%i" % AttNumHeads, "dim:%i" % EncValuePerHeadDim], "from": "att0"},  # (B, H*V)
 
       "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["target_embed", "att"], "n_out": 2},  # transform
       "readout_in": {"class": "linear", "from": ["prev:s", "prev:target_embed", "att"], "activation": None,
@@ -2768,13 +2768,13 @@ def custom_construction_algo(idx, net_dict):
 def test_net_safe_log_to_log_softmax():
   n_out = 5
   net_dict = {
-    "ff_in_window": {"class": "window", "window_size": 3, "from": "data:data"},  # (B,T,3,3)
-    "ff_in": {"class": "merge_dims", "axes": "except_time", "from": ["ff_in_window"]},  # (B,T,9)
-    "ff0": {"class": "hidden", "activation": "relu", "n_out": 8, "L2": 0.01, "from": ["ff_in"]},  # (B,T,8)
-    "ff_out": {"class": "softmax", "n_out": n_out, "from": ["ff0"]},  # (B,T,5)
+    "ff_in_window": {"class": "window", "window_size": 4, "from": "data:data"},  # (B,T,4,3)
+    "ff_in": {"class": "merge_dims", "axes": ["dim:3", "dim:4"], "from": "ff_in_window"},  # (B,T,12)
+    "ff0": {"class": "hidden", "activation": "relu", "n_out": 8, "L2": 0.01, "from": "ff_in"},  # (B,T,8)
+    "ff_out": {"class": "softmax", "n_out": n_out, "from": "ff0"},  # (B,T,5)
     "ff_out_prior": {
       "class": "accumulate_mean", "exp_average": 0.001,
-      "is_prob_distribution": True, "from": ["ff_out"]},  # (5,)
+      "is_prob_distribution": True, "from": "ff_out"},  # (5,)
     "output": {
       "class": "combine", "kind": "eval", "from": ["ff_out", "ff_out_prior"],
       "eval": "safe_log(source(0)) - safe_log(source(1))",
@@ -2826,7 +2826,7 @@ def test_preload_from_files():
         "class": "linear", "activation": None, "n_out": n_hidden, "from": "data:data",
         'bias_init': 1.0, 'forward_weights_init': 'orthogonal'},
       "output": {
-        "class": "linear", "activation": None, "n_out": n_out, "from": ["l1"],
+        "class": "linear", "activation": None, "n_out": n_out, "from": "l1",
         'bias_init': 2.0, 'forward_weights_init': 'orthogonal'}
     }
   })
@@ -3366,7 +3366,7 @@ def test_attention_forward_hdf_then_unflatten_2d():
       # (B, enc-T, 1)
       "energy": {"class": "linear", "activation": None, "with_bias": False, "from": ["energy_tanh"], "n_out": 1},
       "att_weights": {"class": "softmax_over_spatial", "from": ["energy"], "is_output_layer": True},  # (B, enc-T, 1)
-      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder"},
+      "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder", "auto_squeeze": True},
       "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["prev:target_embed", "prev:att"], "n_out": 10},
       "readout_in": {"class": "linear", "from": ["s", "prev:target_embed", "att"], "activation": None, "n_out": 10},
       "readout": {"class": "reduce_out", "mode": "max", "num_pieces": 2, "from": ["readout_in"]},
