[SW-212057] Enable QDQ (#101) #1704

Draft pull request: wants to merge 1 commit into base branch main.
@@ -0,0 +1,9 @@
+{
+    "method": "HOOKS",
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "maxabs_hw",
+    "scale_format": "SCALAR",
+    "dump_stats_path": "./hqt_output/measure",
+    "use_qdq": "True"
+}
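
This new file is an HQT-style quantization config for Intel Gaudi: statistics are gathered with a maxabs observer, scales use the hardware-aligned maxabs_hw method in SCALAR format, and use_qdq switches the flow to quantize-dequantize. As a rough illustration of what QDQ means here, the sketch below fake-quantizes a tensor through FP8 with a maxabs scale; `qdq_maxabs` is a hypothetical helper for this illustration, not the toolkit's API.

```python
import torch

def qdq_maxabs(t: torch.Tensor, fp8_dtype: torch.dtype = torch.float8_e4m3fn) -> torch.Tensor:
    """Fake-quantize `t` through FP8 and return it in its original dtype (illustrative only)."""
    fp8_max = torch.finfo(fp8_dtype).max           # largest representable FP8 magnitude
    scale = t.abs().max().float() / fp8_max        # maxabs scale, mirroring "observer": "maxabs"
    q = (t.float() / scale).to(fp8_dtype)          # quantize: rescale, then cast down to FP8
    return (q.float() * scale).to(t.dtype)         # dequantize: cast back up and undo the scale

x = torch.randn(4, 4, dtype=torch.bfloat16)
y = qdq_maxabs(x)  # same shape and dtype as x, with FP8 rounding error baked in
```

The key property of QDQ mode is that tensors carry FP8 rounding error but remain in a high-precision dtype, so the actual matmuls run in that dtype; this is why the modeling change below reports a high-precision dtype for the QDQ branch.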
optimum/habana/transformers/models/llama/modeling_llama.py (4 additions, 0 deletions)
@@ -490,6 +490,10 @@ def get_k_proj_weight_dtype(self):
         Scales tensor gets the weight dtype."""
         if hasattr(self.k_proj, "qweight"):
             return self.k_proj.scales.dtype
+        elif hasattr(self.k_proj, "use_qdq") and self.k_proj.use_qdq:
+            return self.k_proj.dequant_weights.hp_dtype
+        elif isinstance(self.k_cache, KVCache) and "float8" in str(self.k_proj.weight.dtype):
+            return self.k_proj.scale_weight.dtype
         return self.k_proj.weight.dtype
 
     def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
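
The modeling change extends get_k_proj_weight_dtype with a QDQ branch: when k_proj carries use_qdq, its weights are dequantized back to high precision before the matmul, so the KV cache should follow dequant_weights.hp_dtype rather than an FP8 dtype. Below is a minimal sketch of the resulting dispatch order; the standalone function and its arguments are assumptions for illustration, while the attribute names come from the diff.

```python
import torch

def k_proj_weight_dtype(k_proj, k_cache_is_kvcache: bool) -> torch.dtype:
    # Sketch of the branch order after this PR; stub arguments, real attribute names.
    if hasattr(k_proj, "qweight"):                  # packed int weights (e.g. GPTQ):
        return k_proj.scales.dtype                  # report the scales' dtype instead
    if getattr(k_proj, "use_qdq", False):           # QDQ: weights round-trip through FP8,
        return k_proj.dequant_weights.hp_dtype      # but compute happens in the hp dtype
    if k_cache_is_kvcache and "float8" in str(k_proj.weight.dtype):
        return k_proj.scale_weight.dtype            # native FP8 weights: use the scale dtype
    return k_proj.weight.dtype                      # plain bf16/fp32 path
```

Ordering matters here: the QDQ check must come before the float8 check, since QDQ weights may also report an FP8 storage dtype while computation still happens in high precision.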