diff --git a/fms_mo/quant/ptq.py b/fms_mo/quant/ptq.py
index de2c572..b87ecf1 100644
--- a/fms_mo/quant/ptq.py
+++ b/fms_mo/quant/ptq.py
@@ -2140,7 +2140,7 @@ def get_blocks(model, model_type=None):
             None,
             "lm_head",
         ),
-        "granite": (
+        "granite_gptbigcode": (
             "transformer.h",
             "transformer.wte",
             "transformer.wpe",
@@ -2148,6 +2148,14 @@
             "transformer.ln_f",
             "lm_head",
         ),
+        "granite": (
+            "model.layers",
+            "model.embed_tokens",
+            "model.rotary_emb",
+            None,
+            "model.norm",
+            "lm_head",
+        ),
         "llama": (
             "model.layers",
             "model.embed_tokens",
diff --git a/fms_mo/run_quant.py b/fms_mo/run_quant.py
index f13ee0b..a497239 100644
--- a/fms_mo/run_quant.py
+++ b/fms_mo/run_quant.py
@@ -155,7 +155,6 @@ def run_gptq(model_args, data_args, opt_args, gptq_args):
         v2_memory_device="cpu",
     )
 
-
     # Add custom model_type mapping to gptqmodel LUT so GPTQModel can recognize them.
     for mtype, cls in custom_gptq_classes.items():
         if mtype in MODEL_MAP:
diff --git a/fms_mo/training_args.py b/fms_mo/training_args.py
index 9f7da8e..95f3842 100644
--- a/fms_mo/training_args.py
+++ b/fms_mo/training_args.py
@@ -139,8 +139,8 @@ class OptArguments(TypeChecker):
 
     quant_method: str = field(
         metadata={
-            "choices": ["gptq", "gptqv2", "fp8", "dq"],
-            "help": "Quantization technique"
+            "choices": ["gptq", "gptqv2", "fp8", "dq"],
+            "help": "Quantization technique",
         }
     )
     output_dir: str = field(
@@ -229,7 +229,6 @@
 
     cache_examples_on_gpu: bool = True
 
-
 @dataclass
 class FP8Arguments(TypeChecker):
     """Dataclass for FP8 related arguments that will be used by llm-compressor."""
diff --git a/fms_mo/utils/eval_utils.py b/fms_mo/utils/eval_utils.py
index cce9428..774d74e 100644
--- a/fms_mo/utils/eval_utils.py
+++ b/fms_mo/utils/eval_utils.py
@@ -98,6 +98,10 @@ def eval_llm_1GPU(qcfg, model, test_dataset, pre_cache_func=None, **kwargs):
 
     # logger.info("All blocks are computed for evaluation")
     nlls = []
+
+    # Required by Granite-3.X (and potentially other models) to scale their logits
+    logits_scaling = getattr(model.config, "logits_scaling", 1)
+
     # for i, data_mb in enumerate(dloader): #if using dloader.
     for i in tqdm(range(qcfg["n_samples"]), desc="Final Evaluating..."):
         hidden_states = qcfg["cached_input"][i].to(dev)
@@ -107,6 +111,9 @@
 
         # lm_head.to(dev)
         lm_logits = lm_head(hidden_states)
+        # Scaling of the lm_head outputs to obtain the correct logits
+        lm_logits /= logits_scaling
+
         # Shift so that tokens < n predict n
         shift_logits = lm_logits[:, :-1, :].contiguous().float()
         shift_labels = test_dataset.input_ids[:, (i * seq_len) : ((i + 1) * seq_len)][
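
Notes on the changes above.

`get_blocks` in `fms_mo/quant/ptq.py` keys a lookup table on the HuggingFace `model_type`; each tuple appears to list the dotted attribute paths for the decoder blocks, the token embeddings, the positional or rotary embedding, an optional extra module, the final norm, and the LM head. Granite 3.x checkpoints use the llama-style layout (`model.layers`, `model.embed_tokens`, ...), while the older GPTBigCode-based Granite models keep the `transformer.*` layout, so the old entry is renamed to `granite_gptbigcode` and a new `granite` entry is added. As a rough illustration only (this helper is hypothetical, not fms_mo's actual lookup code), dotted paths like these can be resolved against a loaded model as follows:

```python
from operator import attrgetter

def resolve_path(model, dotted_path):
    """Resolve one dotted attribute path from a get_blocks-style tuple.

    Hypothetical helper for illustration: some slots are None (the
    architecture has no such module), which is passed through unchanged.
    """
    if dotted_path is None:
        return None
    return attrgetter(dotted_path)(model)

# Example: resolve_path(model, "model.layers") returns the ModuleList of
# decoder blocks on a Granite-3.x / llama-style checkpoint.
```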
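The `run_quant.py` hunk itself only drops a stray blank line, but the loop in its context is what ties the new mapping in: custom per-`model_type` GPTQ classes are inserted into GPTQModel's `MODEL_MAP` lookup table so the loader can recognize them. The diff context is truncated at the `if`, so the sketch below fills in the loop body under a stated assumption (types GPTQModel already supports are skipped):

```python
# Hedged sketch of the (truncated) registration loop in run_gptq().
# custom_gptq_classes maps an HF model_type string to a GPTQModel model
# class; the branch taken when the type already exists is an assumption.
for mtype, cls in custom_gptq_classes.items():
    if mtype in MODEL_MAP:
        continue  # assumed: keep GPTQModel's built-in class for known types
    MODEL_MAP[mtype] = cls
```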
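On the `eval_utils.py` change: `GraniteForCausalLM.forward` in transformers divides the LM-head output by `config.logits_scaling`, so an evaluation loop that applies `lm_head` by hand must replicate that division, otherwise the perplexity is computed on mis-scaled logits. Reading the attribute with `getattr(model.config, "logits_scaling", 1)` makes the division a no-op for models that do not define it. A minimal self-contained sketch of the patched per-chunk NLL computation (the function name and argument layout are illustrative, not fms_mo's API):

```python
import torch

def chunk_nll(lm_head, hidden_states, labels, logits_scaling=1.0):
    """Next-token NLL for one cached chunk, with Granite-style logit scaling.

    logits_scaling defaults to 1.0 so non-Granite models are unaffected,
    mirroring getattr(model.config, "logits_scaling", 1) in the diff.
    """
    lm_logits = lm_head(hidden_states) / logits_scaling
    # Shift so that tokens < n predict n.
    shift_logits = lm_logits[:, :-1, :].contiguous().float()
    shift_labels = labels[:, 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    return loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    )
```

Dividing once, right after the manual `lm_head` call, matches the logits that `model.forward` would have produced end to end.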