| 
59 | 59 |             ],  | 
60 | 60 |         },  | 
61 | 61 |     ),  | 
 | 62 | +    # target down_proj and gate_up_proj on the same module  | 
 | 63 | +    (  | 
 | 64 | +        LoraConfig,  | 
 | 65 | +        {  | 
 | 66 | +            "task_type": "CAUSAL_LM",  | 
 | 67 | +            "r": 8,  | 
 | 68 | +            "lora_alpha": 32,  | 
 | 69 | +            "target_modules": None,  | 
 | 70 | +            "lora_dropout": 0.0,  | 
 | 71 | +            "bias": "none",  | 
 | 72 | +            "target_parameters": [  | 
 | 73 | +                "feed_forward.experts.down_proj",  | 
 | 74 | +                "feed_forward.experts.gate_up_proj",  | 
 | 75 | +            ],  | 
 | 76 | +        },  | 
 | 77 | +    ),  | 
62 | 78 |     # target q_proj, v_proj as modules, and down_proj as parameter  | 
63 | 79 |     (  | 
64 | 80 |         LoraConfig,  | 
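For reference, the new parametrize entry above covers a single adapter targeting two nn.Parameters that live on the same expert module (Llama4-style experts keep gate_up_proj and down_proj as plain nn.Parameters, so `target_modules` does not reach them). The following is a minimal standalone sketch of that usage, mirroring the config and model id from this diff; it assumes a PEFT version that supports multiple `target_parameters` on one module, which is what this change adds.

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Expert weights are nn.Parameters, so they are reached via `target_parameters`
# instead of `target_modules` (paths copied from the test above).
model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Llama4ForCausalLM")
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    target_modules=None,
    target_parameters=[
        "feed_forward.experts.gate_up_proj",
        "feed_forward.experts.down_proj",
    ],
)
peft_model = get_peft_model(model, config)

x = torch.arange(10).view(2, 5)
with torch.inference_mode():
    out = peft_model(x)  # both targeted parameters pass through their LoRA proxies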
@@ -314,38 +330,75 @@ def test_targeting_module_and_targeting_param_equivalent(self):  | 
314 | 330 |             # LoRA outputs should be the same  | 
315 | 331 |             assert torch.allclose(out_lora_0, out_lora_1, atol=atol, rtol=rtol)  | 
316 | 332 | 
317 |  | -    def test_target_multiple_parameters_on_same_module(self):  | 
318 |  | -        # for now, it is not supported to target multiple parameters from the same module with the same adapter,  | 
319 |  | -        # however, it is possible to target multiple parameters from same module with different adapters  | 
 | 333 | +    def test_target_multiple_parameters_on_same_module(self, monkeypatch):  | 
 | 334 | +        # test that when we target multiple nn.Parameters on the same module, all of them are used during the  | 
 | 335 | +        # forward pass  | 
320 | 336 |         torch.manual_seed(0)  | 
321 |  | -        model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"  | 
 | 337 | +        model_id = "trl-internal-testing/tiny-Llama4ForCausalLM"  | 
322 | 338 |         with hub_online_once(model_id):  | 
323 |  | -            model = AutoModelForCausalLM.from_pretrained(model_id)  | 
324 | 339 |             x = torch.arange(10).view(2, 5)  | 
325 |  | -            with torch.inference_mode():  | 
326 |  | -                out_base = model(x, output_hidden_states=True).hidden_states[-1]  | 
 | 340 | +            model = MyAutoModelForCausalLM.from_pretrained(model_id)  | 
 | 341 | +            shape_gate_up_proj = model.model.layers[0].feed_forward.experts.gate_up_proj.shape  | 
 | 342 | +            shape_down_proj = model.model.layers[0].feed_forward.experts.down_proj.shape  | 
 | 343 | +            num_layers = len(model.model.layers)  | 
327 | 344 | 
328 |  | -            # targeting gate_up_proj  | 
329 |  | -            config0 = LoraConfig(target_parameters=["feed_forward.experts.gate_up_proj"], init_lora_weights=False)  | 
330 |  | -            model = get_peft_model(model, config0)  | 
331 |  | -            with torch.inference_mode():  | 
332 |  | -                out_lora_0 = model(x, output_hidden_states=True).hidden_states[-1]  | 
333 |  | -            atol, rtol = 1e-6, 1e-6  | 
334 |  | -            assert not torch.allclose(out_base, out_lora_0, atol=atol, rtol=rtol)  | 
 | 345 | +            target_parameters = ["feed_forward.experts.gate_up_proj", "feed_forward.experts.down_proj"]  | 
 | 346 | +            num_params = len(target_parameters)  | 
 | 347 | +            config = LoraConfig(target_parameters=target_parameters, init_lora_weights=False)  | 
 | 348 | +            model = get_peft_model(model, config)  | 
335 | 349 | 
336 |  | -            # targeting down_proj  | 
337 |  | -            config1 = LoraConfig(target_parameters=["feed_forward.experts.down_proj"], init_lora_weights=False)  | 
338 |  | -            model.add_adapter("other", config1)  | 
339 |  | -            model.set_adapter("other")  | 
340 |  | -            with torch.inference_mode():  | 
341 |  | -                out_lora_1 = model(x, output_hidden_states=True).hidden_states[-1]  | 
342 |  | -            assert not torch.allclose(out_base, out_lora_1, atol=atol, rtol=rtol)  | 
343 |  | -            assert not torch.allclose(out_lora_0, out_lora_1, atol=atol, rtol=rtol)  | 
 | 350 | +            # CHECK FORWARD CALLS  | 
 | 351 | + | 
 | 352 | +            # log the weights seen during the forward call  | 
 | 353 | +            weights = []  | 
 | 354 | + | 
 | 355 | +            def mock_forward(self, W):  | 
 | 356 | +                weights.append(W)  | 
 | 357 | +                return orig_forward(self, W)  | 
 | 358 | + | 
 | 359 | +            from peft.tuners.lora.layer import _LoraParameterProxy  | 
 | 360 | + | 
 | 361 | +            orig_forward = _LoraParameterProxy.forward  | 
 | 362 | +            monkeypatch.setattr(_LoraParameterProxy, "forward", mock_forward)  | 
344 | 363 | 
345 |  | -            # targeting both gate_up_proj and down_proj  | 
346 |  | -            model.base_model.set_adapter(["default", "other"])  | 
 | 364 | +            num_steps = 3  | 
347 | 365 |             with torch.inference_mode():  | 
348 |  | -                out_lora_01 = model(x, output_hidden_states=True).hidden_states[-1]  | 
349 |  | -            assert not torch.allclose(out_base, out_lora_01, atol=atol, rtol=rtol)  | 
350 |  | -            assert not torch.allclose(out_lora_0, out_lora_01, atol=atol, rtol=rtol)  | 
351 |  | -            assert not torch.allclose(out_lora_1, out_lora_01, atol=atol, rtol=rtol)  | 
 | 366 | +                for _ in range(num_steps):  | 
 | 367 | +                    out_base = model(x, output_hidden_states=True).hidden_states[-1]  | 
 | 368 | + | 
 | 369 | +            actual_call_count = len(weights)  | 
 | 370 | +            # Note: We call forward twice per step, once to create the parametrization and once for the actual forward  | 
 | 371 | +            # step. This may be a bit wasteful, but it's not clear how to prevent it, and the overhead is probably negligible.  | 
 | 372 | +            num_forward_per_step = 2  | 
 | 373 | +            expected_call_count = num_steps * num_layers * num_params * num_forward_per_step  | 
 | 374 | +            assert actual_call_count == expected_call_count  | 
 | 375 | + | 
 | 376 | +            actual_shapes = {W.shape for W in weights}  | 
 | 377 | +            expected_shapes = {shape_gate_up_proj, shape_down_proj}  | 
 | 378 | +            assert actual_shapes == expected_shapes  | 
 | 379 | + | 
 | 380 | +            # CHECK WEIGHT UPDATES  | 
 | 381 | + | 
 | 382 | +            lora_weights_before = {  | 
 | 383 | +                k: v.clone() for k, v in model.named_parameters() if "lora_A.default" in k or "lora_B.default" in k  | 
 | 384 | +            }  | 
 | 386 | +            # sanity check:  | 
 | 387 | +            assert len(lora_weights_before) == 2 * num_layers * num_params  | 
 | 388 | +            # train  | 
 | 389 | +            optim = torch.optim.SGD(model.parameters(), lr=0.01)  | 
 | 390 | +            for _ in range(10):  | 
 | 391 | +                optim.zero_grad()  | 
 | 392 | +                out = model(x)  | 
 | 393 | +                loss = out.logits.sum()  | 
 | 394 | +                loss.backward()  | 
 | 395 | +                optim.step()  | 
 | 396 | + | 
 | 398 | +            lora_weights_after = {  | 
 | 399 | +                k: v for k, v in model.named_parameters() if "lora_A.default" in k or "lora_B.default" in k  | 
 | 400 | +            }  | 
 | 401 | +            assert lora_weights_before.keys() == lora_weights_after.keys()  | 
 | 402 | +            atol, rtol = 0.1, 0.1  | 
 | 403 | +            for key in lora_weights_before.keys():  | 
 | 404 | +                assert not torch.allclose(lora_weights_before[key], lora_weights_after[key], atol=atol, rtol=rtol)  | 
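A note on the call-counting technique in the first half of the test: `_LoraParameterProxy.forward` is monkeypatched so that every weight tensor flowing through the LoRA parametrization gets recorded, which is what the call-count and shape assertions rely on. Below is a condensed sketch of that pattern; it uses the same private import path as the test (`peft.tuners.lora.layer._LoraParameterProxy` is internal API and may change), and the helper name `install_weight_logger` is illustrative.

from peft.tuners.lora.layer import _LoraParameterProxy  # private API, as used in the test


def install_weight_logger(monkeypatch):
    """Record every weight tensor seen by the LoRA parameter proxy (illustrative helper)."""
    weights = []
    orig_forward = _LoraParameterProxy.forward

    def logging_forward(self, W):
        weights.append(W)  # log the tensor, then defer to the original implementation
        return orig_forward(self, W)

    monkeypatch.setattr(_LoraParameterProxy, "forward", logging_forward)
    return weights

With the logger installed, `len(weights)` gives the number of proxy forward calls and `{W.shape for W in weights}` the set of wrapped parameter shapes, matching the assertions in the test.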