10 changes: 9 additions & 1 deletion fms_mo/quant/ptq.py
@@ -2140,14 +2140,22 @@ def get_blocks(model, model_type=None):
             None,
             "lm_head",
         ),
-        "granite": (
+        "granite_gptbigcode": (
             "transformer.h",
             "transformer.wte",
             "transformer.wpe",
             None,
             "transformer.ln_f",
             "lm_head",
         ),
+        "granite": (
+            "model.layers",
+            "model.embed_tokens",
+            "model.rotary_emb",
+            None,
+            "model.norm",
+            "lm_head",
+        ),
         "llama": (
             "model.layers",
             "model.embed_tokens",
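Reviewer note: the old "granite" entry pointed at GPT-BigCode-style module paths, so it is renamed to "granite_gptbigcode", and a new "granite" entry covers the Granite-3.x (LLaMA-style) layout. A minimal sketch of how those dotted paths are expected to resolve on a Hugging Face Granite checkpoint; the model id and the `get_submodule` calls are illustrative assumptions, not part of this PR:

```python
# Illustrative sketch (not from this PR): resolving the new "granite" tuple
# against a GraniteForCausalLM checkpoint. The model id is an assumption.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("ibm-granite/granite-3.0-2b-base")

# Each dotted name in the get_blocks() tuple addresses a submodule:
blocks = model.get_submodule("model.layers")        # decoder blocks to quantize
embeds = model.get_submodule("model.embed_tokens")  # input embeddings
rotary = model.get_submodule("model.rotary_emb")    # rotary position embeddings
norm = model.get_submodule("model.norm")            # final norm before lm_head
head = model.get_submodule("lm_head")               # output projection
```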
1 change: 0 additions & 1 deletion fms_mo/run_quant.py
@@ -155,7 +155,6 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
         v2_memory_device="cpu",
     )
 
-
     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
     for mtype, cls in custom_gptq_classes.items():
         if mtype in MODEL_MAP:
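For context: this hunk only removes a duplicate blank line, and the diff view truncates the registration loop it sits next to. A hedged sketch of the usual pattern, assuming `MODEL_MAP` is gptqmodel's model_type-to-class lookup table and that already-known types are skipped (the loop body is not shown in the diff):

```python
# Assumed continuation of the truncated loop above; not taken from this PR.
from gptqmodel.models.auto import MODEL_MAP  # import path is an assumption

for mtype, cls in custom_gptq_classes.items():
    if mtype in MODEL_MAP:
        continue  # gptqmodel already recognizes this model_type
    MODEL_MAP[mtype] = cls  # register the custom GPTQ wrapper class
```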
5 changes: 2 additions & 3 deletions fms_mo/training_args.py
@@ -139,8 +139,8 @@ class OptArguments(TypeChecker):
 
     quant_method: str = field(
         metadata={
-            "choices": ["gptq", "gptqv2", "fp8", "dq"],
-            "help": "Quantization technique"
+            "choices": ["gptq", "gptqv2", "fp8", "dq"],
+            "help": "Quantization technique",
         }
     )
     output_dir: str = field(
@@ -229,7 +229,6 @@ class GPTQArguments(TypeChecker):
     cache_examples_on_gpu: bool = True
 
 
-
 @dataclass
 class FP8Arguments(TypeChecker):
     """Dataclass for FP8 related arguments that will be used by llm-compressor."""
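Side note for reviewers: the functional content here is the `choices`/`help` metadata, which argparse-style parsers turn into CLI validation. A minimal sketch of how that field is typically consumed, assuming `HfArgumentParser` is used and that `quant_method` and `output_dir` are the only required fields (the rest of the dataclass is not shown in the diff):

```python
# Hedged sketch (assumption: fms_mo parses OptArguments with HfArgumentParser).
from transformers import HfArgumentParser

from fms_mo.training_args import OptArguments

parser = HfArgumentParser(OptArguments)
(opt_args,) = parser.parse_args_into_dataclasses(
    args=["--quant_method", "gptq", "--output_dir", "./quantized"]
)
# An unknown value such as "--quant_method fp4" would be rejected,
# because it is not in the "choices" list.
print(opt_args.quant_method)  # -> "gptq"
```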
7 changes: 7 additions & 0 deletions fms_mo/utils/eval_utils.py
@@ -98,6 +98,10 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
     logger.info("All blocks are computed for evaluation")
 
     nlls = []
+
+    # Required by Granite-3.X (and potentially other models) to scale their logits
+    logits_scaling = getattr(model.config, "logits_scaling", 1)
+
     # for i, data_mb in enumerate(dloader): #if using dloader.
     for i in tqdm(range(qcfg["n_samples"]), desc="Final Evaluating..."):
         hidden_states = qcfg["cached_input"][i].to(dev)
@@ -107,6 +111,9 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs): #
         lm_head.to(dev)
         lm_logits = lm_head(hidden_states)
 
+        # Scaling of the lm_head outputs to obtain the correct logits
+        lm_logits /= logits_scaling
+
         # Shift so that tokens < n predict n
         shift_logits = lm_logits[:, :-1, :].contiguous().float()
         shift_labels = test_dataset.input_ids[:, (i * seq_len) : ((i + 1) * seq_len)][
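For reviewers unfamiliar with the quirk: Granite-3.x checkpoints carry a `logits_scaling` divisor in their config, and the raw `lm_head` output must be divided by it before the loss is meaningful; the `getattr(..., 1)` default makes the change a no-op for models without that field. A hedged sketch of how the accumulated `nlls` typically become a perplexity score, assuming the standard sliding-window evaluation this function appears to implement (the aggregation code itself is truncated by the diff):

```python
# Hedged sketch of the surrounding perplexity math; variable names mirror the
# diff, but the aggregation is an assumption about the code the diff elides.
import torch
import torch.nn.functional as F

loss = F.cross_entropy(
    shift_logits.view(-1, shift_logits.size(-1)),  # [batch*(seq_len-1), vocab]
    shift_labels.view(-1),                         # [batch*(seq_len-1)]
)
nlls.append(loss * seq_len)  # negative log-likelihood for this window

# After the loop: average NLL per token, exponentiated into perplexity.
ppl = torch.exp(torch.stack(nlls).sum() / (qcfg["n_samples"] * seq_len))
```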