Skip to content

Commit 3de75d2

Browse files
author
Mohit Soni
committed
Onboarding Molmo Model
Signed-off-by: Mohit Soni <[email protected]>
1 parent b7775c9 commit 3de75d2

File tree

5 files changed

+949
-5
lines changed

5 files changed

+949
-5
lines changed

QEfficient/transformers/models/modeling_auto.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,9 @@ def model_name(self) -> str:
479479

480480
@property
481481
def get_model_config(self) -> dict:
482-
return self.model.model.vision_model.config.__dict__
482+
if hasattr(self.model.model, "vision_model"):
483+
return self.model.model.vision_model.config.__dict__
484+
return self.model.model.config.__dict__
483485

484486

485487
class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
@@ -536,7 +538,9 @@ def model_name(self) -> str:
536538

537539
@property
538540
def get_model_config(self) -> dict:
539-
return self.model.language_model.config.__dict__
541+
if hasattr(self.model, "language_model"):
542+
return self.model.language_model.config.__dict__
543+
return self.model.config.__dict__
540544

541545

542546
class _QEffAutoModelForImageTextToTextDualQPC:
@@ -652,7 +656,11 @@ def compile(
652656

653657
custom_io_vision = {}
654658
kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
659+
molmo = hasattr(self.model.config, "model_type") and self.model.config.model_type == "molmo"
660+
if molmo:
661+
custom_io_vision["image_masks"] = "float16"
655662
custom_io_vision["pixel_values"] = "float16"
663+
656664
for output_name in output_names["vision"]:
657665
if output_name.startswith("past_"):
658666
custom_io_vision[output_name] = kv_cache_dtype
@@ -804,11 +812,18 @@ def kv_offload_generate(
804812
inputs[k] = np.array(v)
805813

806814
vision_inputs = {
807-
k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"}
815+
k: v
816+
for k, v in inputs.items()
817+
if k
818+
in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"}
808819
}
809820

821+
molmo = hasattr(self.model.config, "model_type") and self.model.config.model_type == "molmo"
822+
810823
if vision_inputs:
811824
vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16")
825+
if molmo:
826+
vision_inputs["image_masks"] = vision_inputs["image_masks"].astype("float16")
812827
vision_start = perf_counter()
813828

814829
vision_outputs = {}
@@ -923,7 +938,10 @@ def __init__(
923938
self.model.config.llm_config._attn_implementation = "eager"
924939
self.model.config.vision_config.use_flash_attn = "false"
925940
else:
926-
self.model.config.text_config.use_cache = True
941+
if hasattr(self.model.config, "text_config"):
942+
self.model.config.text_config.use_cache = True
943+
else:
944+
self.model.config.use_cache = True
927945
self.hash_params["qeff_auto_class"] = self.__class__.__name__
928946

929947
@classmethod
@@ -1292,7 +1310,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
12921310
return cls(model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
12931311

12941312

1295-
MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText}
1313+
MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {
1314+
"InternVLChatModel": QEFFAutoModelForImageTextToText,
1315+
"MolmoForCausalLM": QEFFAutoModelForImageTextToText,
1316+
}
12961317

12971318

12981319
class QEFFAutoModelForCausalLM(QEFFBaseModel):
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# -----------------------------------------------------------------------------
2+
#
3+
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
#
6+
# -----------------------------------------------------------------------------

0 commit comments

Comments
 (0)