Skip to content

Commit 43f136e

Browse files
Committed: "Raise DeprecationWarning if head_first passed"
1 parent: 3a7ecbf · commit: 43f136e

26 files changed: +53 lines added, −78 lines removed

fla/ops/attn/parallel.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional
65

76
import torch
@@ -713,15 +712,15 @@ def parallel_attn(
713712
Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`.
714713
"""
715714
if head_first:
716-
warnings.warn(
715+
raise DeprecationWarning(
717716
"head_first is deprecated and will be removed in a future version. "
718717
"Please use head_first=False for now instead."
719718
)
720719
q, k, v = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v))
721720
if g is not None:
722721
g = rearrange(g, 'b h t ... -> b t h ...')
723722
if not head_first and q.shape[1] < q.shape[2]:
724-
warnings.warn(
723+
raise DeprecationWarning(
725724
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
726725
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
727726
"when head_first=False was specified. "

fla/ops/delta_rule/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional
65

76
import torch
@@ -280,13 +279,13 @@ def chunk_delta_rule(
280279
assert len(beta.shape) == 3, "beta must be of shape (batch size, num of head, seq len)."
281280

282281
if head_first:
283-
warnings.warn(
282+
raise DeprecationWarning(
284283
"head_first is deprecated and will be removed in a future version. "
285284
"Please use head_first=False for now instead."
286285
)
287286
q, k, v, beta = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, beta))
288287
if not head_first and q.shape[1] < q.shape[2]:
289-
warnings.warn(
288+
raise DeprecationWarning(
290289
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
291290
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
292291
"when head_first=False was specified. "

fla/ops/delta_rule/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -514,13 +513,13 @@ def fused_recurrent_delta_rule(
514513
>>> assert ht.allclose(ht_var)
515514
"""
516515
if head_first:
517-
warnings.warn(
516+
raise DeprecationWarning(
518517
"head_first is deprecated and will be removed in a future version. "
519518
"Please use head_first=False for now instead."
520519
)
521520
q, k, v, beta = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, beta))
522521
if not head_first and q.shape[1] < q.shape[2]:
523-
warnings.warn(
522+
raise DeprecationWarning(
524523
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
525524
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
526525
"when head_first=False was specified. "

fla/ops/forgetting_attn/parallel.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional
65

76
import torch
@@ -49,13 +48,13 @@ def parallel_forgetting_attn(
4948
if cu_seqlens is not None:
5049
assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided"
5150
if head_first:
52-
warnings.warn(
51+
raise DeprecationWarning(
5352
"head_first is deprecated and will be removed in a future version. "
5453
"Please use head_first=False for now instead."
5554
)
5655
q, k, v, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, g))
5756
if not head_first and q.shape[1] < q.shape[2]:
58-
warnings.warn(
57+
raise DeprecationWarning(
5958
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
6059
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
6160
"when head_first=False was specified. "

fla/ops/gated_delta_rule/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional
65

76
import torch
@@ -313,13 +312,13 @@ def chunk_gated_delta_rule(
313312
assert len(beta.shape) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
314313

315314
if head_first:
316-
warnings.warn(
315+
raise DeprecationWarning(
317316
"head_first is deprecated and will be removed in a future version. "
318317
"Please use head_first=False for now instead."
319318
)
320319
q, k, v, beta, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, beta, g))
321320
if not head_first and q.shape[1] < q.shape[2]:
322-
warnings.warn(
321+
raise DeprecationWarning(
323322
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
324323
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
325324
"when head_first=False was specified. "

fla/ops/gated_delta_rule/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -283,13 +282,13 @@ def fused_recurrent_gated_delta_rule(
283282
>>> assert ht.allclose(ht_var)
284283
"""
285284
if head_first:
286-
warnings.warn(
285+
raise DeprecationWarning(
287286
"head_first is deprecated and will be removed in a future version. "
288287
"Please use head_first=False for now instead."
289288
)
290289
q, k, v, beta, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, beta, g))
291290
if not head_first and q.shape[1] < q.shape[2]:
292-
warnings.warn(
291+
raise DeprecationWarning(
293292
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
294293
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
295294
"when head_first=False was specified. "

fla/ops/generalized_delta_rule/dplr/chunk.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional
65

76
import torch
@@ -318,20 +317,20 @@ def chunk_dplr_delta_rule(
318317
Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
319318
"""
320319
if head_first:
321-
warnings.warn(
320+
raise DeprecationWarning(
322321
"head_first is deprecated and will be removed in a future version. "
323322
"Please use head_first=False for now instead."
324323
)
325324
q, k, v, a, b, gk = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, a, b, gk))
326325
if not head_first and q.shape[1] < q.shape[2]:
327-
warnings.warn(
326+
raise DeprecationWarning(
328327
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
329328
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
330329
"when head_first=False was specified. "
331330
"Please verify your input tensor format matches the expected shape [B, T, H, ...]."
332331
)
333332
if q.dtype == torch.float32:
334-
warnings.warn(
333+
raise DeprecationWarning(
335334
"""ChunkDeltaRuleFunction does not support float32. Please use bfloat16.
336335
If you want to use float32, please solve the issue by yourself."""
337336
)

fla/ops/generalized_delta_rule/dplr/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -249,13 +248,13 @@ def fused_recurrent_dplr_delta_rule(
249248
Default: `False`.
250249
"""
251250
if head_first:
252-
warnings.warn(
251+
raise DeprecationWarning(
253252
"head_first is deprecated and will be removed in a future version. "
254253
"Please use head_first=False for now instead."
255254
)
256255
q, k, v, a, b, gk = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, a, b, gk))
257256
if not head_first and q.shape[1] < q.shape[2]:
258-
warnings.warn(
257+
raise DeprecationWarning(
259258
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
260259
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
261260
"when head_first=False was specified. "

fla/ops/generalized_delta_rule/iplr/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -462,13 +461,13 @@ def chunk_iplr_delta_rule(
462461
assert q.dtype != torch.float32, "ChunkDeltaRuleFunction does not support float32. Please use bfloat16."
463462

464463
if head_first:
465-
warnings.warn(
464+
raise DeprecationWarning(
466465
"head_first is deprecated and will be removed in a future version. "
467466
"Please use head_first=False for now instead."
468467
)
469468
q, k, v, a, b = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, a, b))
470469
if not head_first and q.shape[1] < q.shape[2]:
471-
warnings.warn(
470+
raise DeprecationWarning(
472471
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
473472
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
474473
"when head_first=False was specified. "

fla/ops/generalized_delta_rule/iplr/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2024-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -427,13 +426,13 @@ def fused_recurrent_iplr_delta_rule(
427426
428427
"""
429428
if head_first:
430-
warnings.warn(
429+
raise DeprecationWarning(
431430
"head_first is deprecated and will be removed in a future version. "
432431
"Please use head_first=False for now instead."
433432
)
434433
q, k, v, a, b = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, a, b))
435434
if not head_first and q.shape[1] < q.shape[2]:
436-
warnings.warn(
435+
raise DeprecationWarning(
437436
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
438437
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
439438
"when head_first=False was specified. "

fla/ops/gla/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -1290,13 +1289,13 @@ def chunk_gla(
12901289
>>> assert ht.allclose(ht_var)
12911290
"""
12921291
if head_first:
1293-
warnings.warn(
1292+
raise DeprecationWarning(
12941293
"head_first is deprecated and will be removed in a future version. "
12951294
"Please use head_first=False for now instead."
12961295
)
12971296
q, k, v, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, g))
12981297
if not head_first and q.shape[1] < q.shape[2]:
1299-
warnings.warn(
1298+
raise DeprecationWarning(
13001299
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
13011300
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
13021301
"when head_first=False was specified. "

fla/ops/gla/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2024, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -90,7 +89,7 @@ def fused_recurrent_gla(
9089
>>> assert ht.allclose(ht_var)
9190
"""
9291
if head_first:
93-
warnings.warn(
92+
raise DeprecationWarning(
9493
"head_first is deprecated and will be removed in a future version. "
9594
"Please use head_first=False for now instead."
9695
)
@@ -100,7 +99,7 @@ def fused_recurrent_gla(
10099
if gv is not None:
101100
gv = rearrange(gv, 'b h t ... -> b t h ...')
102101
if not head_first and q.shape[1] < q.shape[2]:
103-
warnings.warn(
102+
raise DeprecationWarning(
104103
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
105104
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
106105
"when head_first=False was specified. "

fla/ops/gsa/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -1082,13 +1081,13 @@ def chunk_gsa(
10821081
>>> assert hv.allclose(hv_var)
10831082
"""
10841083
if head_first:
1085-
warnings.warn(
1084+
raise DeprecationWarning(
10861085
"head_first is deprecated and will be removed in a future version. "
10871086
"Please use head_first=False for now instead."
10881087
)
10891088
q, k, v, s, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, s, g))
10901089
if not head_first and q.shape[1] < q.shape[2]:
1091-
warnings.warn(
1090+
raise DeprecationWarning(
10921091
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
10931092
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
10941093
"when head_first=False was specified. "

fla/ops/gsa/fused_recurrent.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2024, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -510,13 +509,13 @@ def fused_recurrent_gsa(
510509
>>> assert hv.allclose(hv_var)
511510
"""
512511
if head_first:
513-
warnings.warn(
512+
raise DeprecationWarning(
514513
"head_first is deprecated and will be removed in a future version. "
515514
"Please use head_first=False for now instead."
516515
)
517516
q, k, v, s, g = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, s, g))
518517
if not head_first and q.shape[1] < q.shape[2]:
519-
warnings.warn(
518+
raise DeprecationWarning(
520519
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
521520
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
522521
"when head_first=False was specified. "

fla/ops/linear_attn/chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Yu Zhang, Songlin Yang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -51,14 +50,14 @@ def chunk_linear_attn(
5150
if scale is None:
5251
scale = k.shape[-1] ** -0.5
5352
if head_first:
54-
warnings.warn(
53+
raise DeprecationWarning(
5554
"head_first is deprecated and will be removed in a future version. "
5655
"Please use head_first=False for now instead."
5756
)
5857
q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
5958
if not head_first:
6059
if q.shape[1] < q.shape[2]:
61-
warnings.warn(
60+
raise DeprecationWarning(
6261
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
6362
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
6463
"when head_first=False was specified. "

fla/ops/linear_attn/fused_chunk.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
33

4-
import warnings
54
from typing import Optional, Tuple
65

76
import torch
@@ -343,14 +342,14 @@ def fused_chunk_linear_attn(
343342
if scale is None:
344343
scale = q.shape[-1] ** -0.5
345344
if head_first:
346-
warnings.warn(
345+
raise DeprecationWarning(
347346
"head_first is deprecated and will be removed in a future version. "
348347
"Please use head_first=False for now instead."
349348
)
350349
q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
351350
if not head_first:
352351
if q.shape[1] < q.shape[2]:
353-
warnings.warn(
352+
raise DeprecationWarning(
354353
f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
355354
"This may indicate the inputs were passed in head-first format [B, H, T, ...] "
356355
"when head_first=False was specified. "

0 commit comments

Comments (0)