decoder interface more

albertz · albertz · commit ca493ba9b7d5 · 2022-08-22T15:42:54.000+02:00
#49
diff --git a/nn/decoder/base.py b/nn/decoder/base.py
@@ -16,7 +16,7 @@
 """
 
 from __future__ import annotations
-from typing import Union, Tuple
+from typing import Union, Optional, Tuple
 from enum import Enum
 import dataclasses
 from ... import nn
@@ -36,14 +36,14 @@ class Decoder(nn.Module):
   """
   Generic decoder, for attention-based encoder-decoder or transducer.
   Can use label-sync label topology, or time-sync (RNA/CTC), or with vertical transitions (RNN-T).
-  The label emitted in the current step is referred to as alignment-label (or step-label),
+  The label emitted in the current (align) step is referred to as alignment-label (or step-label),
   and can include blank in case this is not label-sync.
 
   None of this is really enforced here, and what mainly defines the interfaces
   is the dependency graph.
   The returned shapes and time axes could be anything,
   as long as it fits together.
-  The step_sync_rnn could also return a 4D tensor with both time-axis and label-axis.
+  The predictor could also return a 4D tensor with both time-axis and label-axis.
 
   Dependency graph:
 
@@ -59,19 +59,129 @@ class Decoder(nn.Module):
 
   def __init__(self, *,
                label_topology: LabelTopology,
-               label_sync_rnn: TDecoderLabelSync,
-               joint_net_log_prob: TDecoderJointNetLogProb,
+               label_predict_enc: Optional[TDecoderLabelSync],
+               predictor: TDecoderJointNetLogProb,
+               target_dim: nn.Dim,
+               target_bos_symbol: int = 0,
+               target_eos_symbol: int = 0,
                ):
     super().__init__()
     self.label_topology = label_topology
-    self.label_sync_rnn = label_sync_rnn  # earlier: slow_rnn
-    self.joint_net_log_prob = joint_net_log_prob  # earlier: fast_rnn + readout
+    self.label_predict_enc = label_predict_enc  # earlier: slow_rnn. label-sync. incl (nb) label embedding
+    self.predictor = predictor  # earlier: fast_rnn + readout. align-sync or matrix time * label. predicts align label
+    self.target_dim = target_dim  # includes blank if not label-sync
+    self.target_bos_symbol = target_bos_symbol
+    self.target_eos_symbol = target_eos_symbol
 
-  def __call__(self, encoder: nn.Tensor) -> nn.Tensor:
+  def __call__(self, *,
+               encoder: nn.Tensor,
+               encoder_spatial_axis: nn.Dim,
+               target: Optional[Union[nn.Tensor, nn.SearchFuncInterface]] = None,
+               axis: Optional[nn.Dim] = None,
+               state: Optional[nn.LayerState] = None,
+               ) -> Tuple[nn.Tensor, nn.LayerState]:
     """
     Make one decoder step (train and/or recognition).
     """
     # TODO ...
+    search = None
+    if isinstance(target, nn.SearchFuncInterface):
+      search = target
+      target = None
+    if target is not None:
+      assert axis, f"{self}: Target spatial axis must be specified when target is given"
+    loop = nn.Loop(axis=axis)
+    loop.state = state if state else self.default_initial_state()
+    with loop:
+
+      if self.label_predict_enc is None:
+        label_predict_enc = None
+      elif isinstance(self.label_predict_enc, IDecoderLabelSyncRnn):
+        label_predict_enc, loop.state.label_predict_enc = self.label_predict_enc(
+          prev_label=loop.state.label_nb,
+          encoder_seq=encoder,
+          state=loop.state.label_predict_enc)
+      elif isinstance(self.label_predict_enc, IDecoderLabelSyncLabelsOnlyRnn):
+        label_predict_enc, loop.state.label_predict_enc = self.label_predict_enc(
+          prev_label=loop.state.label_nb,
+          state=loop.state.label_predict_enc)
+      elif isinstance(self.label_predict_enc, IDecoderLabelSyncAlignDepRnn):
+        encoder_frame = ...  # TODO align dep. or unstack if time-sync
+        label_predict_enc, loop.state.label_predict_enc = self.label_predict_enc(
+          prev_label=loop.state.label_nb,
+          encoder_seq=encoder,
+          encoder_frame=encoder_frame,
+          state=loop.state.label_predict_enc)
+      else:
+        raise TypeError(f"{self}: Unsupported label_predict_enc type {type(self.label_predict_enc)}")
+
+      if isinstance(self.predictor, IDecoderLabelSyncLogits):
+        assert self.label_topology == LabelTopology.LABEL_SYNC, f"{self}: Label topology must be label-sync"
+        assert label_predict_enc is not None, f"{self}: Label predict encoder must be specified"
+        probs = self.predictor(label_sync_in=label_predict_enc)
+        probs_type = "logits"
+      elif isinstance(self.predictor, IDecoderJointNoStateLogProb):
+        assert self.label_topology != LabelTopology.LABEL_SYNC, f"{self}: Label topology must not be label-sync"
+        assert label_predict_enc is not None, f"{self}: Label predict encoder must be specified"
+        encoder_frame = ...  # TODO share with above
+        predictor_out = self.predictor(time_sync_in=encoder_frame, label_sync_in=label_predict_enc)
+        probs = predictor_out.prob_like_wb
+        probs_type = predictor_out.prob_like_type
+      elif isinstance(self.predictor, IDecoderJointAlignStateLogProb):
+        assert self.label_topology != LabelTopology.LABEL_SYNC, f"{self}: Label topology must not be label-sync"
+        assert label_predict_enc is not None, f"{self}: Label predict encoder must be specified"
+        encoder_frame = ...  # TODO share with above
+        predictor_out, loop.state.predictor = self.predictor(
+          time_sync_in=encoder_frame,
+          label_sync_in=label_predict_enc,
+          prev_align_label=loop.state.label_wb,
+          state=loop.state.predictor)
+        probs = predictor_out.prob_like_wb
+        probs_type = predictor_out.prob_like_type
+      elif isinstance(self.predictor, IDecoderJointNoCtxLogProb):
+        assert self.label_topology != LabelTopology.LABEL_SYNC, f"{self}: Label topology must not be label-sync"
+        assert label_predict_enc is None, f"{self}: Label predict encoder not used"
+        encoder_frame = ...  # TODO share with above
+        predictor_out = self.predictor(time_sync_in=encoder_frame)
+        probs = predictor_out.prob_like_wb
+        probs_type = predictor_out.prob_like_type
+      elif isinstance(self.predictor, IDecoderAlignStateLogProb):
+        assert self.label_topology != LabelTopology.LABEL_SYNC, f"{self}: Label topology must not be label-sync"
+        assert label_predict_enc is None, f"{self}: Label predict encoder not used"
+        encoder_frame = ...  # TODO share with above
+        predictor_out, loop.state.predictor = self.predictor(
+          time_sync_in=encoder_frame,
+          prev_align_label=loop.state.label_wb,
+          state=loop.state.predictor)
+        probs = predictor_out.prob_like_wb
+        probs_type = predictor_out.prob_like_type
+      else:
+        raise TypeError(f"{self}: Unsupported predictor type {type(self.predictor)}")
+
+      # TODO loss handling here? in that case, cleverly do the most efficient?
+      # TODO logits instead of log probs?
+      # TODO see below; related is whether and what we output
+
+      target = loop.unstack(target) if target is not None else None
+      if search:
+        search.apply_loop(loop)
+        align_label = search.choice(probs=probs, probs_type=probs_type)
+      else:
+        assert target is not None
+        align_label = target
+      if self.label_topology == LabelTopology.LABEL_SYNC:
+        loop.state.label_nb = align_label
+        loop.end(loop.state.label_nb == self.target_eos_symbol, include_eos=False)
+      else:
+        loop.state.label_wb = align_label
+
+      out_labels = loop.stack(align_label) if target is None else None
+      # TODO? out_logits = loop.stack(logits)  # TODO not necessarily logits...
+
+    return out_labels, loop.state
+
+  def default_initial_state(self) -> Optional[nn.LayerState]:
+    """Default initial state for the loop in __call__. Not implemented yet, so this returns None."""
 
 
 # TODO enc ctx module
@@ -90,6 +200,20 @@ def blank_idx(self) -> int:
     """
     raise NotImplementedError
 
+  @property
+  def prob_like_wb(self) -> nn.Tensor:
+    """
+    :return: logits if possible, else log probs. see prob_like_type
+    """
+    return self.log_prob_wb
+
+  @property
+  def prob_like_type(self) -> str:
+    """
+    :return: type of prob_like_wb. "logits" or "log_prob"
+    """
+    return "log_prob"
+
   @property
   def log_prob_wb(self) -> nn.Tensor:
     """
@@ -152,12 +276,12 @@ class DecoderJointLogProbSeparatedOutput(IDecoderJointLogProbOutput):
   log_prob_not_blank: nn.Tensor  # log(-expm1(log_prob_blank)) but you maybe could calc it more directly
 
 
-class IDecoderLabelSyncLogProb(nn.Module):
+class IDecoderLabelSyncLogits(nn.Module):
   """
   For simple (maybe attention-based) encoder-decoder models,
   getting input from some label-sync encoding (TDecoderLabelSync).
 
-  This will produce log probs for non-blank labels.
+  This will produce logits (non-normalized log probs) for non-blank labels.
   There is no blank in this concept.
   """
 
@@ -167,7 +291,7 @@ def __call__(self, *, label_sync_in: nn.Tensor) -> nn.Tensor:
 
 class IDecoderJointNoStateLogProb(nn.Module):
   """
-  Joint network for transducer-like models:
+  Joint network for transducer-like models (e.g. the original RNN-T):
 
   Getting in time-sync inputs, label-sync inputs,
   producing probabilities for labels + blank.
@@ -186,7 +310,7 @@ def __call__(self, *, time_sync_in: nn.Tensor, label_sync_in: nn.Tensor) -> IDec
 
 class IDecoderJointAlignStateLogProb(nn.Module):
   """
-  Joint network for transducer-like models:
+  Joint network for transducer-like models (specifically the extended transducer model):
 
   Getting in time-sync inputs, label-sync inputs,
   producing probabilities for labels + blank.
@@ -196,12 +320,42 @@ def __call__(self, *,
                time_sync_in: nn.Tensor,
                label_sync_in: nn.Tensor,
                prev_align_label: nn.Tensor,
-               state: nn.LayerState,
+               state: nn.LayerState,  # align-sync
+               ) -> Tuple[IDecoderJointLogProbOutput, nn.LayerState]:
+    raise NotImplementedError
+
+
+class IDecoderJointNoCtxLogProb(nn.Module):
+  """
+  Joint network for CTC-like models, having no dependence on the label context:
+
+  Getting in time-sync inputs,
+  producing probabilities for labels + blank.
+  """
+
+  def __call__(self, *, time_sync_in: nn.Tensor) -> IDecoderJointLogProbOutput:
+    raise NotImplementedError
+
+
+class IDecoderAlignStateLogProb(nn.Module):
+  """
+  Joint network for transducer-like models with no explicit non-blank label dependency, only align-label (like RNA):
+
+  Getting in time-sync inputs,
+  producing probabilities for labels + blank.
+  """
+
+  def __call__(self, *,
+               time_sync_in: nn.Tensor,
+               prev_align_label: nn.Tensor,
+               state: nn.LayerState,  # align-sync
                ) -> Tuple[IDecoderJointLogProbOutput, nn.LayerState]:
     raise NotImplementedError
 
 
-TDecoderJointNetLogProb = Union[IDecoderLabelSyncLogProb, IDecoderJointNoStateLogProb, IDecoderJointAlignStateLogProb]
+TDecoderJointNetLogProb = Union[
+  IDecoderLabelSyncLogits, IDecoderJointNoStateLogProb, IDecoderJointAlignStateLogProb,
+  IDecoderJointNoCtxLogProb, IDecoderAlignStateLogProb]
 
 
 class IDecoderLabelSyncRnn(nn.Module):
@@ -243,7 +397,6 @@ class IDecoderLabelSyncRnn(nn.Module):
   def __call__(self, *,
                prev_label: nn.Tensor,
                encoder_seq: nn.Tensor,
-               encoder_frame: nn.Tensor,
                state: nn.LayerState,
                ) -> Tuple[nn.Tensor, nn.LayerState]:
     raise NotImplementedError
@@ -301,7 +454,7 @@ def __call__(self, *,
 class IDecoderStepSyncRnn(nn.Module):
   """
   Represents FastRNN in Transducer.
-  Otherwise in general this runs step-synchronous,
+  Otherwise, in general this runs step-synchronous,
   which is alignment-synchronous or time-synchronous for RNN-T/RNA/CTC,
   or label-synchronous for att-enc-dec.
   """