 from .. import nn
 
 
-class SelfAttention(nn.Module):
+# noinspection PyAbstractClass
+class SelfAttentionBase(nn.Module):
   """
-  Classic self attention
+  Shared base class for self attention
   """
   def __init__(self, *, key_dim_total: nn.Dim, value_dim_total: nn.Dim, num_heads: Union[int, nn.Dim],
                att_dropout: float = 0.):
     super().__init__()
-    if not isinstance(num_heads, nn.Dim):
-      num_heads = nn.SpatialDim("num_heads", num_heads)
     self.key_dim_total = key_dim_total
     self.key_dim_per_head = key_dim_total // num_heads
     self.value_dim_total = value_dim_total
@@ -26,6 +25,16 @@ def __init__(self, *, key_dim_total: nn.Dim, value_dim_total: nn.Dim, num_heads:
     self.expand_dim = nn.SpatialDim("self_att_expand_dim")
     self.att_dropout = att_dropout
 
+
+class SelfAttention(SelfAttentionBase):
+  """
+  Classic self attention
+  """
+  def __init__(self, *, key_dim_total: nn.Dim, value_dim_total: nn.Dim, num_heads: Union[int, nn.Dim],
+               att_dropout: float = 0.):
+    super().__init__(
+      key_dim_total=key_dim_total, value_dim_total=value_dim_total, num_heads=num_heads, att_dropout=att_dropout)
+
   def forward(self, source: nn.LayerRef, *, axis: nn.Dim) -> nn.Layer:
     """forward"""
     # noinspection DuplicatedCode
@@ -49,21 +58,18 @@ def forward(self, source: nn.LayerRef, *, axis: nn.Dim) -> nn.Layer:
     return output
 
 
-class SelfAttentionStep(nn.Module):
+class CausalSelfAttention(SelfAttentionBase):
+  pass  # TODO
+
+
+class CausalSelfAttentionStep(SelfAttentionBase):
   """
-  Auto-regressive self-attention
+  Causal auto-regressive self-attention
   """
   def __init__(self, *, key_dim_total: nn.Dim, value_dim_total: nn.Dim, num_heads: Union[int, nn.Dim],
                att_dropout: float = 0.):
-    super().__init__()
-    self.key_dim_total = key_dim_total
-    self.key_dim_per_head = key_dim_total // num_heads
-    self.value_dim_total = value_dim_total
-    self.value_dim_per_head = value_dim_total // num_heads
-    self.num_heads = num_heads
-    self.qkv = nn.Linear(key_dim_total * 2 + value_dim_total)
-    self.expand_dim = nn.DimensionTag(kind=nn.DimensionTag.Types.Spatial, description="self_att_expand_dim")
-    self.att_dropout = att_dropout
+    super().__init__(
+      key_dim_total=key_dim_total, value_dim_total=value_dim_total, num_heads=num_heads, att_dropout=att_dropout)
 
   def forward(self, source: nn.LayerRef, *, state: nn.LayerState) -> Tuple[nn.Layer, nn.LayerState]:
     """forward"""