Commit 49ef97c

Committed by: Flax Authors
Merge pull request #5080 from google:improve-promote-dtype-support
PiperOrigin-RevId: 831005376
2 parents: b1f573b + 3a630ff

6 files changed (+199, -47 lines)

flax/nnx/nn/activations.py

Lines changed: 23 additions & 3 deletions

@@ -42,7 +42,8 @@
 from jax.numpy import tanh

 from flax import nnx
-from flax.typing import Array, Dtype
+from flax.nnx.nn import dtypes
+from flax.typing import Array, Dtype, PromoteDtypeFn


 __all__ = [

@@ -97,21 +98,40 @@ class PReLU(nnx.Module):

   Args:
     negative_slope_init: the value to initialize the negative slope (default 0.01).
+    dtype: the dtype of the computation (default: infer from input and params).
     param_dtype: the dtype passed to parameter initializers (default: float32).
+    promote_dtype: function to promote the dtype of all input array arguments
+      (including Variables accessed through ``self``) to the desired dtype. The
+      function should accept a tuple of ``(inputs, negative_slope)`` and a ``dtype``
+      keyword argument, and return a tuple of arrays with the promoted dtype.
   """
   def __init__(
     self,
     negative_slope_init: float = 0.01,
-    param_dtype: Dtype = jnp.float32
+    *,
+    dtype: Dtype | None = None,
+    param_dtype: Dtype = jnp.float32,
+    promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
   ):
     self.negative_slope = nnx.Param(
       jnp.asarray(negative_slope_init, dtype=param_dtype)
     )
+    self.dtype = dtype
     self.param_dtype = param_dtype
+    self.promote_dtype = promote_dtype

   def __call__(self, inputs: Array) -> Array:
+    negative_slope = self.negative_slope[...]
+    if self.dtype is not None:
+      inputs, negative_slope = self.promote_dtype(
+        (inputs, negative_slope), dtype=self.dtype
+      )
+    else:
+      # Match Linen behavior: cast parameter to input dtype
+      negative_slope = jnp.asarray(negative_slope, inputs.dtype)
+
     return jnp.where(
       inputs >= 0,
       inputs,
-      jnp.asarray(self.negative_slope[...], inputs.dtype) * inputs,
+      negative_slope * inputs,
     )

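For context, a minimal usage sketch of the new PReLU arguments (not part of the diff; the input shape and dtypes are illustrative, and it assumes a Flax version that includes this change):

import jax.numpy as jnp
from flax.nnx.nn import dtypes
from flax.nnx.nn.activations import PReLU

# All three arguments below are keyword-only after this change.
layer = PReLU(
  negative_slope_init=0.01,
  dtype=jnp.float32,                   # computation dtype; None keeps the old cast-to-input-dtype behavior
  param_dtype=jnp.float32,             # storage dtype of the negative_slope parameter
  promote_dtype=dtypes.promote_dtype,  # the default PromoteDtypeFn, spelled out explicitly
)

x = jnp.ones((4,), dtype=jnp.bfloat16)
y = layer(x)  # (inputs, negative_slope) are promoted to float32 before jnp.where
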
flax/nnx/nn/attention.py

Lines changed: 19 additions & 0 deletions

@@ -301,6 +301,15 @@ class MultiHeadAttention(Module):
       num_heads, value_channels]``
     decode: whether to prepare and use an autoregressive cache.
     normalize_qk: should QK normalization be applied (arxiv.org/abs/2302.05442).
+    qkv_promote_dtype: function to promote the dtype of all input array arguments
+      (including Variables accessed through ``self``) to the desired dtype for the
+      query, key, and value LinearGeneral submodules.
+    out_promote_dtype: function to promote the dtype of all input array arguments
+      (including Variables accessed through ``self``) to the desired dtype for the
+      output LinearGeneral submodule.
+    ln_promote_dtype: function to promote the dtype of all input array arguments
+      (including Variables accessed through ``self``) to the desired dtype for the
+      LayerNorm submodules (query_ln and key_ln) when normalize_qk=True.
     rngs: rng key.
     keep_rngs: whether to store the input rngs as attribute (i.e. `self.rngs = rngs`)
       (default: True). If rngs is stored, we should split the module as

@@ -330,6 +339,9 @@ def __init__(
     attention_fn: Callable[..., Array] = dot_product_attention,
     decode: bool | None = None,
     normalize_qk: bool = False,
+    qkv_promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
+    out_promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
+    ln_promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
     # Deprecated, will be removed.
     qkv_dot_general: DotGeneralT | None = None,
     out_dot_general: DotGeneralT | None = None,

@@ -359,6 +371,9 @@ def __init__(
     self.attention_fn = attention_fn
     self.decode = decode
     self.normalize_qk = normalize_qk
+    self.qkv_promote_dtype = qkv_promote_dtype
+    self.out_promote_dtype = out_promote_dtype
+    self.ln_promote_dtype = ln_promote_dtype
     self.qkv_dot_general = qkv_dot_general
     self.out_dot_general = out_dot_general
     self.qkv_dot_general_cls = qkv_dot_general_cls

@@ -381,6 +396,7 @@ def __init__(
       bias_init=bias_init,
       use_bias=self.use_bias,
       precision=self.precision,
+      promote_dtype=self.qkv_promote_dtype,
       dot_general=self.qkv_dot_general,
       dot_general_cls=self.qkv_dot_general_cls,
     )

@@ -400,13 +416,15 @@ def __init__(
         use_bias=False,
         dtype=self.dtype,
         param_dtype=self.param_dtype,
+        promote_dtype=self.ln_promote_dtype,
         rngs=rngs,
       )
       self.key_ln = LayerNorm(
         self.head_dim,
         use_bias=False,
         dtype=self.dtype,
         param_dtype=self.param_dtype,
+        promote_dtype=self.ln_promote_dtype,
         rngs=rngs,
       )
     else:

@@ -423,6 +441,7 @@ def __init__(
       dtype=self.dtype,
       param_dtype=self.param_dtype,
       precision=self.precision,
+      promote_dtype=self.out_promote_dtype,
       dot_general=self.out_dot_general,
       dot_general_cls=self.out_dot_general_cls,
       rngs=rngs,

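As an illustration only (not from the diff), a custom PromoteDtypeFn can be routed to just the q/k/v projections; the f32_qkv_promote helper and all shapes below are hypothetical:

import jax.numpy as jnp
from flax import nnx
from flax.nnx.nn import dtypes

def f32_qkv_promote(args, dtype=None):
  # Hypothetical PromoteDtypeFn: ignore the requested dtype and keep the
  # query/key/value projections in float32.
  return dtypes.promote_dtype(args, dtype=jnp.float32)

attn = nnx.MultiHeadAttention(
  num_heads=4,
  in_features=16,
  dtype=jnp.bfloat16,                 # module-wide computation dtype
  qkv_promote_dtype=f32_qkv_promote,  # applied only to the q/k/v LinearGeneral submodules
  decode=False,
  rngs=nnx.Rngs(0),
)

x = jnp.ones((2, 3, 16), dtype=jnp.bfloat16)
y = attn(x)  # self-attention; out_promote_dtype and ln_promote_dtype keep their defaults
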
flax/nnx/nn/lora.py

Lines changed: 34 additions & 25 deletions

@@ -17,10 +17,9 @@

 from flax.nnx import rnglib, variablelib
 from flax.nnx.module import Module
-from flax.nnx.nn import initializers
+from flax.nnx.nn import initializers, dtypes
 from flax.nnx.nn.linear import Linear
-from flax.nnx.nn.dtypes import promote_dtype
-from flax.typing import Dtype, Initializer
+from flax.typing import Dtype, Initializer, PromoteDtypeFn
 import jax
 import jax.numpy as jnp


@@ -75,6 +74,11 @@ class LoRA(Module):
     b_initializer: initializer function for the fan-out matrices. Default to
       `zero initializer`.
     lora_param_type: the type of the LoRA params.
+    promote_dtype: function to promote the dtype of all input array arguments
+      (including Variables accessed through ``self``) to the desired dtype. The
+      function should accept a tuple of ``(inputs, lora_a, lora_b)`` and a ``dtype``
+      keyword argument, and return a tuple of arrays with the promoted dtype.
+    rngs: rng key.
   """

   def __init__(

@@ -89,6 +93,7 @@ def __init__(
     a_initializer: Initializer = default_a_initializer,
     b_initializer: Initializer = default_b_initializer,
     lora_param_type: tp.Type[variablelib.Variable] = LoRAParam,
+    promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
     rngs: rnglib.Rngs,
   ):
     self.in_features = in_features

@@ -97,6 +102,7 @@ def __init__(
     self.param_dtype = param_dtype
     self.lora_param_type = lora_param_type
     self.base_module = base_module
+    self.promote_dtype = promote_dtype

     self.lora_a = lora_param_type(
       a_initializer(rngs.params(), (in_features, lora_rank), param_dtype)

@@ -106,7 +112,7 @@ def __init__(
     )

   def __call__(self, x: jax.Array):
-    x, lora_a, lora_b = promote_dtype(
+    x, lora_a, lora_b = self.promote_dtype(
       (x, self.lora_a[...], self.lora_b[...]), dtype=self.dtype
     )
     out = x @ lora_a @ lora_b

@@ -154,33 +160,36 @@ class LoRALinear(Linear):
     b_initializer: initializer function for the fan-out matrices. Default to
       `zero initializer`.
     lora_param_type: the type of the LoRA params.
+    lora_promote_dtype: function to promote the dtype for the LoRA submodule.
   """

   def __init__(
-      self,
-      in_features: int,
-      out_features: int,
-      *,
-      lora_rank: int,
-      lora_dtype: tp.Optional[Dtype] = None,
-      lora_param_dtype: Dtype = jnp.float32,
-      a_initializer: Initializer = default_a_initializer,
-      b_initializer: Initializer = default_b_initializer,
-      lora_param_type: tp.Type[variablelib.Variable] = LoRAParam,
-      rngs: rnglib.Rngs,
-      **kwargs,
+    self,
+    in_features: int,
+    out_features: int,
+    *,
+    lora_rank: int,
+    lora_dtype: tp.Optional[Dtype] = None,
+    lora_param_dtype: Dtype = jnp.float32,
+    a_initializer: Initializer = default_a_initializer,
+    b_initializer: Initializer = default_b_initializer,
+    lora_param_type: tp.Type[variablelib.Variable] = LoRAParam,
+    lora_promote_dtype: PromoteDtypeFn = dtypes.promote_dtype,
+    rngs: rnglib.Rngs,
+    **kwargs,
   ):
     super().__init__(in_features, out_features, rngs=rngs, **kwargs)
     self.lora = LoRA(
-        in_features,
-        lora_rank,
-        out_features,
-        dtype=lora_dtype,
-        param_dtype=lora_param_dtype,
-        a_initializer=a_initializer,
-        b_initializer=b_initializer,
-        lora_param_type=lora_param_type,
-        rngs=rngs,
+      in_features,
+      lora_rank,
+      out_features,
+      dtype=lora_dtype,
+      param_dtype=lora_param_dtype,
+      a_initializer=a_initializer,
+      b_initializer=b_initializer,
+      lora_param_type=lora_param_type,
+      promote_dtype=lora_promote_dtype,
+      rngs=rngs,
     )

   def __call__(self, x: jax.Array):

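Again purely as a sketch (not part of the diff), the new argument threads through LoRALinear to its LoRA submodule; the passthrough_promote helper and the feature sizes are made up:

import jax.numpy as jnp
from flax import nnx
from flax.nnx.nn import dtypes

def passthrough_promote(args, dtype=None):
  # Hypothetical PromoteDtypeFn: receives (x, lora_a, lora_b) plus the dtype
  # keyword and simply delegates to the default promotion rule.
  return dtypes.promote_dtype(args, dtype=dtype)

linear = nnx.LoRALinear(
  in_features=8,
  out_features=4,
  lora_rank=2,
  lora_dtype=jnp.float32,
  lora_promote_dtype=passthrough_promote,  # forwarded to the LoRA submodule as promote_dtype
  rngs=nnx.Rngs(0),
)

y = linear(jnp.ones((1, 8)))  # base Linear output plus x @ lora_a @ lora_b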