Commit 6b673e6

[RWKV7] use fast fused_addcmul_rwkv7 op
1 parent cdc805e commit 6b673e6

File tree: 4 files changed, +65 −28 lines

4 files changed

+65
-28
lines changed

fla/layers/rwkv7.py (+10 −13)

@@ -14,6 +14,7 @@
 from fla.modules import GroupNorm
 from fla.modules.l2norm import l2_norm
 from fla.ops.rwkv7 import chunk_rwkv7, fused_recurrent_rwkv7
+from fla.ops.rwkv7.fused_addcmul import fused_addcmul_rwkv7

 if TYPE_CHECKING:
     from fla.models.utils import Cache
@@ -36,7 +37,6 @@ def __init__(
         layer_idx: int = None,
         fuse_norm: bool = False,
         value_dim: int = None,
-        wkv_precision: str = 'bfloat16',
         **kwargs
     ) -> RWKV7Attention:
         super().__init__()
@@ -65,8 +65,12 @@ def __init__(
         self.fuse_norm = fuse_norm

         self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
-
-        self.x_x = nn.Parameter(torch.zeros(6, hidden_size))
+        self.x_r = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.x_w = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.x_k = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.x_v = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.x_a = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.x_g = nn.Parameter(torch.zeros(1, 1, hidden_size))

         self.k_k = nn.Parameter(torch.zeros(self.key_dim))
         self.k_a = nn.Parameter(torch.zeros(self.key_dim))
@@ -99,15 +103,6 @@ def __init__(
             affine=elementwise_affine
         )

-        if wkv_precision == 'bfloat16':
-            self.precision = torch.bfloat16
-        elif wkv_precision == 'float16':
-            self.precision = torch.float16
-        elif wkv_precision == 'float32':
-            self.precision = torch.float32
-        else:
-            raise ValueError(f"""Unsupported wkv_precision `{wkv_precision}`.
-            Supported values are `bfloat16`, `float16`, and `float32`.""")
         self.apply(self._initialize_weights)

     def _initialize_weights(self, module: nn.Module):
@@ -162,7 +157,9 @@ def forward(

         # [batch_size, seq_len, hidden_size]
         delta = shifted - hidden_states
-        xr, xw, xk, xv, xa, xg = hidden_states.addcmul(delta, self.x_x.view(6, 1, 1, -1)).unbind(0)
+
+        xr, xw, xk, xv, xa, xg = fused_addcmul_rwkv7(hidden_states, delta, self.x_r, self.x_w,
+                                                     self.x_k, self.x_v, self.x_a, self.x_g)

         r = self.r_proj(xr)
         # w (-0.6065, 0)
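For reference, the removed line and the new call compute the same thing: each of the six branches is an elementwise `hidden_states + delta * x_*`, with the per-branch parameter broadcast over batch and time. The sketch below is not the fused kernel itself; it is a plain-PyTorch equivalent, assuming `fused_addcmul_rwkv7` keeps the argument order shown in the hunk above, and can serve as a toy numerical reference.

import torch

# Reference (unfused) version of the six addcmul branches replaced above.
# Shapes follow the hunk: hidden_states/delta are [B, T, H], each x_* is [1, 1, H].
def addcmul_rwkv7_reference(hidden_states, delta, x_r, x_w, x_k, x_v, x_a, x_g):
    # Each output equals hidden_states + delta * x_*, matching the old
    # `hidden_states.addcmul(delta, self.x_x.view(6, 1, 1, -1)).unbind(0)`.
    return tuple(hidden_states.addcmul(delta, x) for x in (x_r, x_w, x_k, x_v, x_a, x_g))

# Toy check under assumed shapes (B=2, T=3, H=4):
B, T, H = 2, 3, 4
hs, dt = torch.randn(B, T, H), torch.randn(B, T, H)
params = [torch.randn(1, 1, H) for _ in range(6)]
xr, xw, xk, xv, xa, xg = addcmul_rwkv7_reference(hs, dt, *params)
assert xr.shape == (B, T, H)

If the fused op exposes the same signature, comparing its outputs against this reference with `torch.allclose` is a quick sanity check.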

fla/models/rwkv7/configuration_rwkv7.py (−2)

@@ -39,7 +39,6 @@ def __init__(
         fuse_cross_entropy: bool = True,
         vocab_size: int = 32000,
         value_dim: Optional[Union[int, List[int]]] = None,
-        wkv_precision: Optional[str] = "bfloat16",
         **kwargs
     ):
         self.attn_mode = attn_mode
@@ -84,7 +83,6 @@ def __init__(
         self.fuse_norm = fuse_norm
         self.fuse_cross_entropy = fuse_cross_entropy
         self.vocab_size = vocab_size
-        self.wkv_precision = wkv_precision

         if attn is not None:
             if not isinstance(attn, Dict):

fla/models/rwkv7/modeling_rwkv7.py (+51 −1)

@@ -263,6 +263,56 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.embeddings = value

+    def load_state_dict(self, state_dict, strict=True, assign=False):
+        """
+        Override the load_state_dict method to handle migration from version 1 to version 2.
+        Handles hierarchical keys like 'model.layers.0.attn.x_x'.
+        """
+        # Collect all layer indices from the state_dict keys
+        layer_indices = set()
+        for key in state_dict.keys():
+            if key.startswith("model.layers."):
+                # Extract the layer index from the key
+                try:
+                    layer_idx = int(key.split(".")[2])  # Extract the number after 'model.layers.'
+                    layer_indices.add(layer_idx)
+                except ValueError:
+                    # Skip keys that don't match the expected format
+                    continue
+
+        # Sort the layer indices to process them in order
+        sorted_layer_indices = sorted(layer_indices)
+
+        # Migration logic for each layer
+        for layer_idx in sorted_layer_indices:
+            layer_prefix = f"model.layers.{layer_idx}"
+            attn_prefix = f"{layer_prefix}.attn"
+
+            # Check if the layer contains the old 'x_x' parameter
+            if f"{attn_prefix}.x_x" in state_dict:
+                logger.info(f"Migrating weights for layer {layer_idx} from RWKV7Attention version 1 to version 2...")
+                # Extract the x_x parameter
+                x_x = state_dict[f"{attn_prefix}.x_x"]
+                with torch.no_grad():
+                    # Create new parameters for version 2
+                    state_dict[f"{attn_prefix}.x_r"] = x_x[0].unsqueeze(0).unsqueeze(0)
+                    state_dict[f"{attn_prefix}.x_w"] = x_x[1].unsqueeze(0).unsqueeze(0)
+                    state_dict[f"{attn_prefix}.x_k"] = x_x[2].unsqueeze(0).unsqueeze(0)
+                    state_dict[f"{attn_prefix}.x_v"] = x_x[3].unsqueeze(0).unsqueeze(0)
+                    state_dict[f"{attn_prefix}.x_a"] = x_x[4].unsqueeze(0).unsqueeze(0)
+                    state_dict[f"{attn_prefix}.x_g"] = x_x[5].unsqueeze(0).unsqueeze(0)
+
+        # Call the parent method to load the modified state_dict
+        try:
+            super().load_state_dict(state_dict, strict=strict, assign=assign)
+        except TypeError:
+            # If the parent method does not support `assign`, fall back to strict loading
+            logger.warning(
+                "`assign` parameter is not supported by the parent `load_state_dict` method. "
+                "Falling back to default behavior."
+            )
+            super().load_state_dict(state_dict, strict=strict)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -349,7 +399,7 @@ def forward(
         )


-class RWKV7ForCausalLM(RWKV7PreTrainedModel, GenerationMixin):
+class RWKV7ForCausalLM(RWKV7Model, GenerationMixin):

     _tied_weights_keys = ["lm_head.weight"]
utils/convert_from_rwkv7.py (+4 −12)

@@ -90,9 +90,7 @@ def translate_into_fla(name):
         'ln1': 'attn_norm',
         'ln2': 'ffn_norm'
     }[name_compo[2]]
-    if name_compo[2] == 'attn' and re.match("x_[rwkvag]", name_compo[3]):
-        name_compo[3] = 'x_x'
-    elif re.match("[wvag][012]", name_compo[3]):
+    if re.match("[wvag][012]", name_compo[3]):
         typ, num = name_compo[3]
         name_compo[3] = f'{typ}_lora.lora.' + {
             '0': '2.bias',
@@ -121,15 +119,9 @@ def translate_into_fla(name):
     if shape1 == [1, 1, config.hidden_size]:
         weight.squeeze_()

-    # fix: fusing x_[rwkvag] to x_x
-    if fla_name.endswith('attn.x_x'):
-        model_dict[fla_name].data['rwkvag'.find(name[-1])].copy_(weight)
-        if fla_name in model_names:
-            model_names.remove(fla_name)
-    else:
-        assert model_dict[fla_name].shape == weight.shape
-        model_dict[fla_name].data.copy_(weight)
-        model_names.remove(fla_name)
+    assert model_dict[fla_name].shape == weight.shape
+    model_dict[fla_name].data.copy_(weight)
+    model_names.remove(fla_name)

 print("uninitialized parameters: ", model_names)
 for n in model_names:
