@@ -479,7 +479,9 @@ def model_name(self) -> str:
479479
@property
def get_model_config(self) -> dict:
    """Return the attribute dictionary of the vision-side configuration.

    When the wrapped inner model (``self.model.model``) exposes a
    ``vision_model`` sub-module, that sub-module's config is used.
    Otherwise the inner model's own ``config`` is used.

    Returns:
        dict: The selected config object's ``__dict__``. This is the live
        dictionary, not a copy, so mutating it mutates the config.
    """
    inner_model = self.model.model
    # Fall back to the inner model itself when it has no dedicated
    # vision tower attribute.
    config_owner = getattr(inner_model, "vision_model", inner_model)
    return config_owner.config.__dict__
483485
484486
485487class QEffCausalLMForTextImageToTextModel (QEFFBaseModel ):
@@ -536,7 +538,9 @@ def model_name(self) -> str:
536538
@property
def get_model_config(self) -> dict:
    """Return the attribute dictionary of the language-side configuration.

    When the wrapped model (``self.model``) exposes a ``language_model``
    sub-module, that sub-module's config is used. Otherwise the wrapped
    model's own ``config`` is used.

    Returns:
        dict: The selected config object's ``__dict__``. This is the live
        dictionary, not a copy, so mutating it mutates the config.
    """
    wrapped = self.model
    # Fall back to the wrapped model itself when it has no separate
    # language_model attribute.
    config_owner = getattr(wrapped, "language_model", wrapped)
    return config_owner.config.__dict__
540544
541545
542546class _QEffAutoModelForImageTextToTextDualQPC :
@@ -652,7 +656,11 @@ def compile(
652656
653657 custom_io_vision = {}
654658 kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
659+ molmo = hasattr (self .model .config , "model_type" ) and self .model .config .model_type == "molmo"
660+ if molmo :
661+ custom_io_vision ["image_masks" ] = "float16"
655662 custom_io_vision ["pixel_values" ] = "float16"
663+
656664 for output_name in output_names ["vision" ]:
657665 if output_name .startswith ("past_" ):
658666 custom_io_vision [output_name ] = kv_cache_dtype
@@ -804,11 +812,18 @@ def kv_offload_generate(
804812 inputs [k ] = np .array (v )
805813
806814 vision_inputs = {
807- k : v for k , v in inputs .items () if k in {"pixel_values" , "aspect_ratio_ids" , "aspect_ratio_mask" }
815+ k : v
816+ for k , v in inputs .items ()
817+ if k
818+ in {"pixel_values" , "image_masks" , "image_input_idx" , "valid_idx" , "aspect_ratio_ids" , "aspect_ratio_mask" }
808819 }
809820
821+ molmo = hasattr (self .model .config , "model_type" ) and self .model .config .model_type == "molmo"
822+
810823 if vision_inputs :
811824 vision_inputs ["pixel_values" ] = vision_inputs ["pixel_values" ].astype ("float16" )
825+ if molmo :
826+ vision_inputs ["image_masks" ] = vision_inputs ["image_masks" ].astype ("float16" )
812827 vision_start = perf_counter ()
813828
814829 vision_outputs = {}
@@ -923,7 +938,10 @@ def __init__(
923938 self .model .config .llm_config ._attn_implementation = "eager"
924939 self .model .config .vision_config .use_flash_attn = "false"
925940 else :
926- self .model .config .text_config .use_cache = True
941+ if hasattr (self .model .config , "text_config" ):
942+ self .model .config .text_config .use_cache = True
943+ else :
944+ self .model .config .use_cache = True
927945 self .hash_params ["qeff_auto_class" ] = self .__class__ .__name__
928946
929947 @classmethod
@@ -1292,7 +1310,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
12921310 return cls (model , kv_offload = kv_offload , pretrained_model_name_or_path = pretrained_model_name_or_path , ** kwargs )
12931311
12941312
# Model class names that should be handled by QEFFAutoModelForImageTextToText.
# NOTE(review): presumably keyed by the loaded model's class name and consulted
# by the causal-LM loader -- confirm against the lookup site.
MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {
    "InternVLChatModel": QEFFAutoModelForImageTextToText,
    "MolmoForCausalLM": QEFFAutoModelForImageTextToText,
}
12961317
12971318
12981319class QEFFAutoModelForCausalLM (QEFFBaseModel ):
0 commit comments