Commit dea5eab

updates to get_lowest_common_x

Signed-off-by: HDCharles <[email protected]>

1 parent 351568d commit dea5eab

2 files changed (+84, -48 lines)
src/llmcompressor/modifiers/awq/base.py

Lines changed: 44 additions & 23 deletions
@@ -320,21 +320,26 @@ def _set_resolved_mappings(self, model: Module) -> None:
         repeat for model.layer.1 and so on
         """
         resolved_mappings: list[ResolvedMapping] = []
-        module_to_name = {module: name for name, module in model.named_modules()}
-        for mapping_idx, mapping in enumerate(self.mappings):
-            num_skipped_mappings = 0
+
+        module_to_name = {}
+        for name, module in model.named_modules():
+            if module in module_to_name:
+                logger.info(
+                    f"Warning, {name} and {module_to_name[module]} both "
+                    "share the same module, "
+                    "may have trouble resolving mappings."
+                )
+            module_to_name[module] = name
+
+
+
+        for mapping in self.mappings:
 
-            # Use match_modules_set to find coherent sets of modules
             target_patterns = (mapping.smooth_layer, *mapping.balance_layers)
 
             for smooth_layer, *balance_layers in (
-                pbar := tqdm(match_modules_set(model, target_patterns, self.ignore))
+                match_modules_set(model, target_patterns, self.ignore)
             ):
-                pbar.set_description(
-                    f"Resolving mapping {mapping_idx+1}/{len(self.mappings)}"
-                    f" ({num_skipped_mappings} skipped)"
-                )
-
                 smooth_name = module_to_name.get(smooth_layer)
                 balance_names = [
                     module_to_name.get(balance_layer)
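
For context on the new module_to_name loop: two registered names can point at the same module object (a shared or tied submodule, for instance), and a dict keyed by module keeps only one name per object. Below is a minimal sketch of that aliasing with invented module names; it passes remove_duplicate=False because named_modules() yields each module object only once by default:

from torch.nn import Linear, ModuleDict

shared = Linear(4, 4)
model = ModuleDict({"a": shared, "b": shared})  # "b" aliases "a"

module_to_name = {}
# remove_duplicate=False also yields names that alias an already-seen module
for name, module in model.named_modules(remove_duplicate=False):
    if module in module_to_name:
        print(f"{name} and {module_to_name[module]} share the same module")
    module_to_name[module] = name
# prints: b and a share the same module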
@@ -347,14 +352,18 @@ def _set_resolved_mappings(self, model: Module) -> None:
 
                 # skip mapping if any of the balance layers are incompatible
                 if not all_compatible or len(balance_layers) == 0:
-                    num_skipped_mappings += 1
+                    logger.info(
+                        f"skipping AWQ for {smooth_name} for mapping {mapping}" + (
+                            " because found incompatible balance layers"
+                            if not all_compatible else
+                            " because no balance layers were found"
+                        )
+                    )
+
                     continue
-                elif len(balance_layers) == 1:
-                    # for single balance layer, parent is the balance layer
-                    parent_name, parent = balance_names[0], balance_layers[0]
                 else:
                     # for multiple balance layers, find lowest common parent
-                    parent_name, parent = get_lowest_common_parent(balance_names, model)
+                    parent_name, parent = get_lowest_common_module(balance_names, model)
 
                 resolved_mappings.append(
                     ResolvedMapping(
@@ -788,29 +797,41 @@ def _accumulate_mean(
     return (prev_sum + sum_added) / new_count, new_count
 
 
-def get_lowest_common_parent(names: list[str], module: Module) -> tuple[str, Module]:
+def get_lowest_common_module(names: list[str], module: Module) -> tuple[str, Module]:
     """
-    Given a list of names, returns the lowest-scope common parent.
+    Given a list of names, returns the lowest-scope common module.
 
-    NOTE: function excludes parents of type ModuleList, which don't play
+    NOTE: function excludes modules of type ModuleList, which don't play
     nicely with hooks because their forward method is never directly
     called for MoE models. See Qwen3MoeSparseMoeBlock for example, experts
     are selected based on router output and their forward method is called.
     https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L233
 
-    Returns name of parent and pointer to parent module
+    Returns name of module and pointer to module
 
     Implementation is a small alteration of os.path.commonprefix
     https://docs.python.org/3/library/os.path.html#os.path.commonprefix
     """
-    s1 = min(names)
-    s2 = max(names)
-    parent_name = ""
+    # adding "." before and after allows for handling a lot of corner
+    # cases which were previously mishandled ([case] -> prefix -> result)
+    # case 0: single module: [.abc.] -> .abc. -> abc
+    # case 1: substring modules: [.abc., .ab.] -> .ab -> ""
+    # case 2: parent & child: [.ab., .ab.a.] -> .ab. -> ab
+    s1 = min(names) + "."
+    s2 = max(names) + "."
+
+    # 1) find longest shared prefix
+    parent_name = "."
     for i, c in enumerate(s1):
         if c != s2[i]:
-            parent_name = s1[:i].rstrip(".")
             break
+        parent_name += c
+
+    # 2) throw away module name fragment and leading dot
+    # ".keep.thro" -> "keep"
+    parent_name = parent_name[1:parent_name.rfind(".")]
 
+    # 3) return first parent that is not a module list
     while True:
         if parent_name == "":
             return "", module

tests/llmcompressor/modifiers/awq/test_base.py

Lines changed: 40 additions & 25 deletions
@@ -2,9 +2,9 @@
 import torch
 from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
 from pydantic import ValidationError
-
+from torch.nn import Linear
 from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
-from llmcompressor.modifiers.awq.base import get_lowest_common_parent
+from llmcompressor.modifiers.awq.base import get_lowest_common_module
 from llmcompressor.modifiers.factory import ModifierFactory
 
 
@@ -40,16 +40,16 @@ def test_set_resolved_mappings():
     )
     self_attn = torch.nn.ModuleDict(
         {
-            "q_proj": torch.nn.Linear(4, 4),
-            "k_proj": torch.nn.Linear(4, 4),
-            "v_proj": torch.nn.Linear(4, 4),
-            "o_proj": torch.nn.Linear(4, 4),
+            "q_proj": Linear(4, 4),
+            "k_proj": Linear(4, 4),
+            "v_proj": Linear(4, 4),
+            "o_proj": Linear(4, 4),
         }
     )
     mlp = torch.nn.ModuleDict(
         {
-            "up_proj": torch.nn.Linear(4, 10),
-            "down_proj": torch.nn.Linear(10, 4),
+            "up_proj": Linear(4, 10),
+            "down_proj": Linear(10, 4),
         }
     )
     model = torch.nn.ModuleDict(
@@ -100,11 +100,11 @@ def test_set_resolved_mappings():
         {
             "self_attn": torch.nn.ModuleDict(
                 {
-                    "q_proj": torch.nn.Linear(4, 2),
-                    "k_proj": torch.nn.Linear(4, 2),
-                    "v_proj": torch.nn.Linear(4, 2),
-                    "z_proj": torch.nn.Linear(2, 4),
-                    "o_proj": torch.nn.Linear(4, 4),
+                    "q_proj": Linear(4, 2),
+                    "k_proj": Linear(4, 2),
+                    "v_proj": Linear(4, 2),
+                    "z_proj": Linear(2, 4),
+                    "o_proj": Linear(4, 4),
                 }
             )
         }
@@ -192,15 +192,15 @@ def test_validate():
 
 
 @pytest.mark.unit
-def test_get_lowest_common_parent():
+def test_get_lowest_common_module():
     mlp = torch.nn.ModuleDict(
         {
             "experts": torch.nn.ModuleList(
                 [
                     torch.nn.ModuleDict(
                         {
-                            "gate_proj": torch.nn.Linear(4, 2),
-                            "down_proj": torch.nn.Linear(4, 2),
+                            "gate_proj": Linear(4, 2),
+                            "down_proj": Linear(4, 2),
                         }
                     )
                     for _ in range(10)
@@ -210,15 +210,15 @@ def test_get_lowest_common_parent():
     )
     self_attn = torch.nn.ModuleDict(
         {
-            "q_proj": torch.nn.Linear(4, 2),
-            "k_proj": torch.nn.Linear(4, 2),
-            "v_proj": torch.nn.Linear(4, 2),
-            "o_proj": torch.nn.Linear(4, 4),
+            "q_proj": Linear(4, 2),
+            "k_proj": Linear(4, 2),
+            "v_proj": Linear(4, 2),
+            "o_proj": Linear(4, 4),
         }
     )
     model = torch.nn.ModuleDict(
         {
-            "embed_tokens": torch.nn.Linear(4, 2),
+            "embed_tokens": Linear(4, 2),
             "decoder": torch.nn.ModuleDict(
                 {
                     "self_attn": self_attn,
@@ -228,22 +228,37 @@ def test_get_lowest_common_parent():
         }
     )
 
-    parent_name, parent = get_lowest_common_parent(
+    parent_name, parent = get_lowest_common_module(
         ["decoder.mlp.experts.1.gate_proj", "decoder.mlp.experts.4.down_proj"], model
     )
     assert parent_name == "decoder.mlp" and parent == mlp
 
-    parent_name, parent = get_lowest_common_parent(
+    parent_name, parent = get_lowest_common_module(
         ["decoder.self_attn.q_proj", "decoder.self_attn.v_proj"], model
     )
    assert parent_name == "decoder.self_attn" and parent == self_attn
 
-    parent_name, parent = get_lowest_common_parent(
+    parent_name, parent = get_lowest_common_module(
         ["decoder.mlp.experts.1.gate_proj", "decoder.self_attn.v_proj"], model
     )
     assert parent_name == "decoder" and parent == model["decoder"]
 
-    parent_name, parent = get_lowest_common_parent(
+    parent_name, parent = get_lowest_common_module(
         ["embed_tokens", "decoder.self_attn.v_proj"], model
     )
     assert parent_name == "" and parent == model
+
+    m = torch.nn.ModuleDict(
+        {
+            "abc": Linear(3, 3),
+            "ab": torch.nn.ModuleDict({"a": Linear(3, 3)}),
+            "z": Linear(3, 3),
+        }
+    )
+    parent_name, parent = get_lowest_common_module(["abc", "ab"], m)
+    assert parent_name == ""
+    parent_name, parent = get_lowest_common_module(["ab", "ab.a"], m)
+    assert parent_name == "ab"
+    parent_name, parent = get_lowest_common_module(["z"], m)
+    assert parent_name == "z"
+
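
The new assertions at the bottom pin down exactly the inputs the old prefix scan mishandled: parent_name was assigned only inside the `if c != s2[i]` branch, so a single name, or one name that is a dotted prefix of another, never hit the break and fell through to the model root. A string-level comparison of the two versions (helper names here are invented for illustration):

def old_prefix(names):
    # pre-commit logic: assignment happens only when a mismatch is found
    s1, s2 = min(names), max(names)
    parent_name = ""
    for i, c in enumerate(s1):
        if c != s2[i]:
            parent_name = s1[:i].rstrip(".")
            break
    return parent_name


def new_prefix(names):
    # post-commit logic: accumulate matching characters, then trim at the last dot
    s1, s2 = min(names) + ".", max(names) + "."
    parent_name = "."
    for i, c in enumerate(s1):
        if c != s2[i]:
            break
        parent_name += c
    return parent_name[1:parent_name.rfind(".")]


assert old_prefix(["ab", "ab.a"]) == ""    # parent/child fell through to root
assert new_prefix(["ab", "ab.a"]) == "ab"
assert old_prefix(["z"]) == ""             # single module fell through to root
assert new_prefix(["z"]) == "z"
assert new_prefix(["abc", "ab"]) == ""     # substring names: root is correct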
