diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py index 86dd22ef1..abfb97933 100644 --- a/optimum/commands/export/neuronx.py +++ b/optimum/commands/export/neuronx.py @@ -179,6 +179,13 @@ def parse_args_neuronx(parser: "ArgumentParser"): type=float, help="List of scaling factors for the lora adapters.", ) + optional_group.add_argument( + "--output_attentions", + action="store_true", + help="Whether the traced model should return the attention tensors of all attention layers.", + ) + + # Diffusion Only optional_group.add_argument( "--controlnet_ids", default=None, @@ -186,12 +193,39 @@ nargs="*", type=str, help="List of model ids (eg. `thibaud/controlnet-openpose-sdxl-1.0`) of ControlNet models.", ) - optional_group.add_argument( - "--output_attentions", - action="store_true", - help="Whether or not for the traced model to return the attentions tensors of all attention layers.", + ip_adapter_group = parser.add_argument_group("IP adapters") + ip_adapter_group.add_argument( + "--ip_adapter_id", + default=None, + nargs="*", + type=str, + help=( + "Model ids (e.g. `h94/IP-Adapter`) of IP-Adapter models hosted on the Hub or paths to local directories containing the IP-Adapter weights." + ), + ) + ip_adapter_group.add_argument( + "--ip_adapter_subfolder", + default=None, + nargs="*", + type=str, + help="The subfolder location of a model file within a larger model repository on the Hub or locally. If a list is passed, it should have the same length as `ip_adapter_weight_name`.", + ) + ip_adapter_group.add_argument( + "--ip_adapter_weight_name", + default=None, + nargs="*", + type=str, + help="The name of the weight file to load. If a list is passed, it should have the same length as `ip_adapter_subfolder`.", + ) + ip_adapter_group.add_argument( + "--ip_adapter_scale", + default=None, + nargs="*", + type=float, + help="Scaling factors for the IP-Adapters.", ) + # Static Input Shapes input_group = parser.add_argument_group("Input shapes") doc_input = "that the Neuronx-cc compiler exported model will be able to take as input." input_group.add_argument( @@ -262,6 +296,7 @@ help=f"Audio tasks only. 
Audio sequence length {doc_input}", ) + # Optimization Level level_group = parser.add_mutually_exclusive_group() level_group.add_argument( "-O1", diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 43d9f6626..aa234dfb1 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -18,6 +18,7 @@ import inspect import os from argparse import ArgumentParser +from dataclasses import fields from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union @@ -36,6 +37,10 @@ DIFFUSION_MODEL_VAE_ENCODER_NAME, ENCODER_NAME, NEURON_FILE_NAME, + ImageEncoderArguments, + InputShapesArguments, + IPAdapterArguments, + LoRAAdapterArguments, is_neuron_available, is_neuronx_available, is_transformers_neuronx_available, @@ -278,6 +283,26 @@ def infer_stable_diffusion_shapes_from_diffusers( "encoder_hidden_size": encoder_hidden_size, } + # Image encoder + if getattr(model, "image_encoder", None): + input_shapes["image_encoder"] = { + "batch_size": input_shapes[unet_or_transformer_name]["batch_size"], + "num_channels": model.image_encoder.config.num_channels, + "width": model.image_encoder.config.image_size, + "height": model.image_encoder.config.image_size, + } + # IP-Adapter: add image_embeds as input for unet/transformer + # unet has `ip_adapter_image_embeds` with shape [batch_size, 1, (self.image_encoder.config.image_size//patch_size)**2+1, self.image_encoder.config.hidden_size] as input + if getattr(model.unet.config, "encoder_hid_dim_type", None) == "ip_image_proj": + input_shapes[unet_or_transformer_name]["image_encoder_shapes"] = ImageEncoderArguments( + sequence_length=model.image_encoder.vision_model.embeddings.position_embedding.weight.shape[0], + hidden_size=model.image_encoder.vision_model.embeddings.position_embedding.weight.shape[1], + projection_dim=getattr(model.image_encoder.config, "projection_dim", None), + ) + + # Format with `InputShapesArguments` + for sub_model_name in input_shapes.keys(): + input_shapes[sub_model_name] = InputShapesArguments(**input_shapes[sub_model_name]) return input_shapes @@ -294,11 +319,8 @@ def get_submodels_and_neuron_configs( submodels: Optional[Dict[str, Union[Path, str]]] = None, output_attentions: bool = False, output_hidden_states: bool = False, - lora_model_ids: Optional[Union[str, List[str]]] = None, - lora_weight_names: Optional[Union[str, List[str]]] = None, - lora_adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[Union[float, List[float]]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, + lora_args: Optional[LoRAAdapterArguments] = None, ): is_encoder_decoder = ( getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False @@ -315,11 +337,8 @@ def get_submodels_and_neuron_configs( dynamic_batch_size=dynamic_batch_size, submodels=submodels, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, controlnet_ids=controlnet_ids, + lora_args=lora_args, ) elif is_encoder_decoder: optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states} @@ -346,7 +365,10 @@ def get_submodels_and_neuron_configs( library_name=library_name, ) input_shapes = check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes) - neuron_config = neuron_config_constructor(model.config, 
dynamic_batch_size=dynamic_batch_size, **input_shapes) + input_shapes = InputShapesArguments(**input_shapes) + neuron_config = neuron_config_constructor( + model.config, dynamic_batch_size=dynamic_batch_size, input_shapes=input_shapes + ) model_name = getattr(model, "name_or_path", None) or model_name_or_path model_name = model_name.split("/")[-1] if model_name else model.config.model_type output_model_names = {model_name: "model.neuron"} @@ -355,26 +377,6 @@ def get_submodels_and_neuron_configs( return models_and_neuron_configs, output_model_names -def _normalize_lora_params(lora_model_ids, lora_weight_names, lora_adapter_names, lora_scales): - if isinstance(lora_model_ids, str): - lora_model_ids = [ - lora_model_ids, - ] - if isinstance(lora_weight_names, str): - lora_weight_names = [ - lora_weight_names, - ] - if isinstance(lora_adapter_names, str): - lora_adapter_names = [ - lora_adapter_names, - ] - if isinstance(lora_scales, float): - lora_scales = [ - lora_scales, - ] - return lora_model_ids, lora_weight_names, lora_adapter_names, lora_scales - - def _get_submodels_and_neuron_configs_for_stable_diffusion( model: Union["PreTrainedModel", "DiffusionPipeline"], input_shapes: Dict[str, int], @@ -382,11 +384,8 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( dynamic_batch_size: bool = False, submodels: Optional[Dict[str, Union[Path, str]]] = None, output_hidden_states: bool = False, - lora_model_ids: Optional[Union[str, List[str]]] = None, - lora_weight_names: Optional[Union[str, List[str]]] = None, - lora_adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[Union[float, List[float]]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, + lora_args: Optional[LoRAAdapterArguments] = None, ): check_compiler_compatibility_for_stable_diffusion() model = replace_stable_diffusion_submodels(model, submodels) @@ -412,9 +411,6 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) model.save_config(output) - lora_model_ids, lora_weight_names, lora_adapter_names, lora_scales = _normalize_lora_params( - lora_model_ids, lora_weight_names, lora_adapter_names, lora_scales - ) models_and_neuron_configs = get_diffusion_models_for_export( pipeline=model, text_encoder_input_shapes=input_shapes["text_encoder"], @@ -422,14 +418,12 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( transformer_input_shapes=input_shapes.get("transformer", None), vae_encoder_input_shapes=input_shapes["vae_encoder"], vae_decoder_input_shapes=input_shapes["vae_decoder"], + lora_args=lora_args, dynamic_batch_size=dynamic_batch_size, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, controlnet_ids=controlnet_ids, controlnet_input_shapes=input_shapes.get("controlnet", None), + image_encoder_input_shapes=input_shapes.get("image_encoder", None), ) output_model_names = { DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME), @@ -449,6 +443,8 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( output_model_names[DIFFUSION_MODEL_TRANSFORMER_NAME] = os.path.join( DIFFUSION_MODEL_TRANSFORMER_NAME, NEURON_FILE_NAME ) + if getattr(model, "image_encoder", None) is not None: + output_model_names["image_encoder"] = os.path.join("image_encoder", NEURON_FILE_NAME) # ControlNet models if controlnet_ids: @@ -515,13 
+511,11 @@ def load_models_and_neuron_configs( local_files_only: bool, token: Optional[Union[bool, str]], submodels: Optional[Dict[str, Union[Path, str]]], - lora_model_ids: Optional[Union[str, List[str]]], - lora_weight_names: Optional[Union[str, List[str]]], - lora_adapter_names: Optional[Union[str, List[str]]], - lora_scales: Optional[Union[float, List[float]]], torch_dtype: Optional[Union[str, torch.dtype]] = None, tensor_parallel_size: int = 1, controlnet_ids: Optional[Union[str, List[str]]] = None, + lora_args: Optional[LoRAAdapterArguments] = None, + ip_adapter_args: Optional[IPAdapterArguments] = None, output_attentions: bool = False, output_hidden_states: bool = False, **input_shapes, @@ -542,6 +536,14 @@ def load_models_and_neuron_configs( } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) + # Load IP-Adapter if it exists + if ip_adapter_args is not None and not all( + getattr(ip_adapter_args, field.name) is None for field in fields(ip_adapter_args) + ): + model.load_ip_adapter( + ip_adapter_args.model_id, subfolder=ip_adapter_args.subfolder, weight_name=ip_adapter_args.weight_name + ) + model.set_ip_adapter_scale(scale=ip_adapter_args.scale) models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs( model=model, @@ -556,11 +558,8 @@ def load_models_and_neuron_configs( submodels=submodels, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, controlnet_ids=controlnet_ids, + lora_args=lora_args, ) return models_and_neuron_configs, output_model_names @@ -592,11 +591,9 @@ def main_export( output_attentions: bool = False, output_hidden_states: bool = False, library_name: Optional[str] = None, - lora_model_ids: Optional[Union[str, List[str]]] = None, - lora_weight_names: Optional[Union[str, List[str]]] = None, - lora_adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[Union[float, List[float]]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, + lora_args: Optional[LoRAAdapterArguments] = None, + ip_adapter_args: Optional[IPAdapterArguments] = None, **input_shapes, ): output = Path(output) @@ -627,12 +624,10 @@ def main_export( local_files_only=local_files_only, token=token, submodels=submodels, + lora_args=lora_args, + ip_adapter_args=ip_adapter_args, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, controlnet_ids=controlnet_ids, **input_shapes, ) @@ -640,7 +635,6 @@ def main_export( _, neuron_outputs = export_models( models_and_neuron_configs=models_and_neuron_configs, output_dir=output, - torch_dtype=torch_dtype, disable_neuron_cache=disable_neuron_cache, compiler_workdir=compiler_workdir, inline_weights_to_neff=inline_weights_to_neff, @@ -752,6 +746,18 @@ def main(): compiler_kwargs = infer_compiler_kwargs(args) optional_outputs = customize_optional_outputs(args) optlevel = parse_optlevel(args) + lora_args = LoRAAdapterArguments( + model_ids=getattr(args, "lora_model_ids", None), + weight_names=getattr(args, "lora_weight_names", None), + adapter_names=getattr(args, "lora_adapter_names", None), + scales=getattr(args, "lora_scales", None), + ) + ip_adapter_args = IPAdapterArguments( + model_id=getattr(args, "ip_adapter_id", None), + subfolder=getattr(args, 
"ip_adapter_subfolder", None), + weight_name=getattr(args, "ip_adapter_weight_name", None), + scale=getattr(args, "ip_adapter_scale", None), + ) main_export( model_name_or_path=args.model, @@ -772,11 +778,9 @@ def main(): do_validation=not args.disable_validation, submodels=submodels, library_name=library_name, - lora_model_ids=getattr(args, "lora_model_ids", None), - lora_weight_names=getattr(args, "lora_weight_names", None), - lora_adapter_names=getattr(args, "lora_adapter_names", None), - lora_scales=getattr(args, "lora_scales", None), controlnet_ids=getattr(args, "controlnet_ids", None), + lora_args=lora_args, + ip_adapter_args=ip_adapter_args, **optional_outputs, **input_shapes, ) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index d2af5db95..8a132c166 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -16,6 +16,7 @@ import re from abc import ABC, abstractmethod +from dataclasses import fields, is_dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -23,7 +24,7 @@ from optimum.utils import logging from ...exporters.base import ExportConfig -from ...neuron.utils import is_neuron_available +from ...neuron.utils import ImageEncoderArguments, InputShapesArguments, is_neuron_available if TYPE_CHECKING: @@ -119,6 +120,7 @@ class NeuronDefaultConfig(NeuronExportConfig, ABC): DUMMY_INPUT_GENERATOR_CLASSES = () ATOL_FOR_VALIDATION: Union[float, Dict[str, float]] = 1e-5 MODEL_TYPE = None + CUSTOM_MODEL_WRAPPER = None _TASK_TO_COMMON_OUTPUTS = { "depth-estimation": ["predicted_depth"], @@ -144,32 +146,15 @@ def __init__( self, config: "PretrainedConfig", task: str, + input_shapes: InputShapesArguments, compiler_type: Optional[str] = None, compiler_version: Optional[str] = None, tensor_parallel_size: int = 1, - batch_size: Optional[int] = None, - text_batch_size: Optional[int] = None, - image_batch_size: Optional[int] = None, dynamic_batch_size: bool = False, - sequence_length: Optional[int] = None, - num_choices: Optional[int] = None, - width: Optional[int] = None, - height: Optional[int] = None, - image_size: Optional[int] = None, - patch_size: Optional[int] = None, - num_channels: Optional[int] = None, - feature_size: Optional[int] = None, - nb_max_frames: Optional[int] = None, - audio_sequence_length: Optional[int] = None, - point_batch_size: Optional[int] = None, - nb_points_per_image: Optional[int] = None, - num_beams: Optional[int] = None, - vae_scale_factor: Optional[int] = None, - encoder_hidden_size: Optional[int] = None, output_attentions: bool = False, output_hidden_states: bool = False, - int_dtype: Union[str, torch.dtype] = "int64", - float_dtype: Union[str, torch.dtype] = "fp32", + int_dtype: Union[str, torch.dtype] = "int64", # Int dtype of dummy inputs used for tracing + float_dtype: Union[str, torch.dtype] = "fp32", # Float dtype of dummy inputs used for tracing ): self._config = config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) @@ -184,34 +169,45 @@ def __init__( if self.dynamic_batch_size is True and is_neuron_available(): logger.info("Overwriting batch size to 1 for neuron dynamic batch size support.") batch_size = 1 + else: + batch_size = input_shapes.batch_size # To avoid using **kwargs. 
axes_values = { "batch_size": batch_size, - "text_batch_size": text_batch_size, - "image_batch_size": image_batch_size, - "sequence_length": sequence_length, - "num_choices": num_choices, - "width": width, - "height": height, - "num_channels": num_channels or getattr(self._config, "num_channels", None), - "feature_size": feature_size, - "nb_max_frames": nb_max_frames, - "audio_sequence_length": audio_sequence_length, - "point_batch_size": point_batch_size, - "nb_points_per_image": nb_points_per_image, - "num_beams": num_beams, - "image_size": image_size or getattr(self._config, "image_size", None), - "patch_size": patch_size or getattr(self._config, "patch_size", None), - "vae_scale_factor": vae_scale_factor, - "encoder_hidden_size": encoder_hidden_size, + "text_batch_size": input_shapes.text_batch_size, + "image_batch_size": input_shapes.image_batch_size, + "sequence_length": input_shapes.sequence_length, + "num_choices": input_shapes.num_choices, + "width": input_shapes.width, + "height": input_shapes.height, + "num_channels": input_shapes.num_channels or getattr(self._config, "num_channels", None), + "feature_size": input_shapes.feature_size, + "nb_max_frames": input_shapes.nb_max_frames, + "audio_sequence_length": input_shapes.audio_sequence_length, + "point_batch_size": input_shapes.point_batch_size, + "nb_points_per_image": input_shapes.nb_points_per_image, + "num_beams": input_shapes.num_beams, + "image_size": input_shapes.image_size or getattr(self._config, "image_size", None), + "patch_size": input_shapes.patch_size or getattr(self._config, "patch_size", None), + "vae_scale_factor": input_shapes.vae_scale_factor, + "encoder_hidden_size": input_shapes.encoder_hidden_size, + "image_encoder_shapes": ImageEncoderArguments( + sequence_length=getattr(input_shapes.image_encoder_shapes, "sequence_length", None), + hidden_size=getattr(input_shapes.image_encoder_shapes, "hidden_size", None), + projection_dim=getattr(input_shapes.image_encoder_shapes, "projection_dim", None), + ), } - input_shapes = {} + valid_input_shapes = {} for name, value in axes_values.items(): if value is not None: - input_shapes[name] = value + is_empty_dataclass = is_dataclass(value) and all( + getattr(value, field.name) is None for field in fields(value) + ) + if not is_empty_dataclass: + valid_input_shapes[name] = value setattr(self, name, value) - setattr(self, "input_shapes", input_shapes) + setattr(self, "input_shapes", valid_input_shapes) setattr(self, "output_attentions", output_attentions) setattr(self, "output_hidden_states", output_hidden_states) setattr(self, "compiler_type", compiler_type) @@ -425,4 +421,7 @@ def forward(self, *input): return outputs - return ModelWrapper(model, list(dummy_inputs.keys())) + if self.CUSTOM_MODEL_WRAPPER is None: + return ModelWrapper(model, list(dummy_inputs.keys())) + else: + return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index a30a7ba47..10b263266 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -193,15 +193,14 @@ def validate_model_outputs( ref_outputs = reference_model(**ref_inputs) neuron_inputs = tuple(config.flatten_inputs(inputs).values()) elif "AutoencoderKL" in getattr(config._config, "_class_name", "") or getattr( - reference_model.config, "is_encoder_decoder", False + config._config, "is_encoder_decoder", False ): # VAE components for stable diffusion or Encoder-Decoder models ref_inputs = 
tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) neuron_inputs = tuple(inputs.values()) - elif any( - pattern in getattr(config._config, "_class_name", "").lower() for pattern in ["controlnet", "transformer"] - ): + elif config.CUSTOM_MODEL_WRAPPER is not None: + ref_inputs = config.flatten_inputs(inputs) reference_model = config.patch_model_for_export(reference_model, ref_inputs) neuron_inputs = ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) @@ -298,7 +297,6 @@ def export_models( str, Tuple[Union["PreTrainedModel", "ModelMixin", torch.nn.Module], "NeuronDefaultConfig"] ], output_dir: Path, - torch_dtype: Optional[Union[str, torch.dtype]] = None, disable_neuron_cache: Optional[bool] = False, compiler_workdir: Optional[Path] = None, inline_weights_to_neff: bool = True, @@ -315,8 +313,6 @@ A dictionary containing the models to export and their corresponding neuron configs. output_dir (`Path`): Output directory to store the exported Neuron models. - torch_dtype (`Optional[Union[str, torch.dtype]]`, defaults to `None`): - Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the dtype will be automatically derived from the model's weights. disable_neuron_cache (`Optional[bool]`, defaults to `False`): Whether to disable automatic caching of AOT compiled models (not applicable for JIT compilation). compiler_workdir (`Optional[Path]`, defaults to `None`): diff --git a/optimum/exporters/neuron/model_configs/traced_configs.py b/optimum/exporters/neuron/model_configs/traced_configs.py index fc1005a3c..5d785346d 100644 --- a/optimum/exporters/neuron/model_configs/traced_configs.py +++ b/optimum/exporters/neuron/model_configs/traced_configs.py @@ -41,6 +41,7 @@ ASTDummyAudioInputGenerator, DummyBeamValuesGenerator, DummyControNetInputGenerator, + DummyIPAdapterInputGenerator, DummyMaskedPosGenerator, is_neuronx_distributed_available, ) @@ -52,6 +53,7 @@ VisionNeuronConfig, ) from ..model_wrappers import ( + CLIPVisionModelNeuronWrapper, ControlNetNeuronWrapper, NoCacheModelWrapper, PixartTransformerNeuronWrapper, @@ -146,9 +148,6 @@ class PhiNeuronConfig(ElectraNeuronConfig): def inputs(self) -> List[str]: return ["input_ids", "attention_mask"] - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - @register_in_tasks_manager("roformer", *COMMON_TEXT_TASKS) class RoFormerNeuronConfig(ElectraNeuronConfig): @@ -235,15 +234,29 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["token_embeddings", "sentence_embedding"] - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - class CLIPNormalizedConfig(NormalizedTextAndVisionConfig): TEXT_CONFIG = "text_config" VISION_CONFIG = "vision_config" +@register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="diffusers") +class CLIPVisionModelNeuronConfig(VisionNeuronConfig): + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + CUSTOM_MODEL_WRAPPER = CLIPVisionModelNeuronWrapper + + @property + def inputs(self) -> List[str]: + return ["pixel_values"] + + @property + def outputs(self) -> List[str]: + common_outputs = ["image_embeds", "last_hidden_state"] + if self.output_hidden_states: + common_outputs.append("hidden_states") + return common_outputs + + @register_in_tasks_manager("clip", *["feature-extraction", "zero-shot-image-classification"]) class 
CLIPNeuronConfig(TextAndVisionNeuronConfig): NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig @@ -311,9 +324,6 @@ class SentenceTransformersCLIPNeuronConfig(CLIPNeuronConfig): def outputs(self) -> List[str]: return ["text_embeds", "image_embeds"] - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - def _create_dummy_input_generator_classes(self, **kwargs) -> List["DummyInputGenerator"]: for name, axis_dim in self._axes.items(): self._axes[name] = kwargs.pop(name, axis_dim) @@ -598,6 +608,7 @@ class UNetNeuronConfig(VisionNeuronConfig): DummyTimestepInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummyControNetInputGenerator, + DummyIPAdapterInputGenerator, ) @property @@ -616,6 +627,13 @@ def inputs(self) -> List[str]: # outputs of controlnet common_inputs += ["down_block_additional_residuals", "mid_block_additional_residual"] + if self.with_ip_adapter: + # add output of image encoder + if self.image_encoder_output_hidden_states: + common_inputs += ["image_enc_hidden_states"] + else: + common_inputs += ["image_embeds"] + return common_inputs @property @@ -648,9 +666,6 @@ def generate_dummy_inputs(self, return_tuple: bool = False, **kwargs): else: return dummy_inputs - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - @property def is_sdxl(self) -> bool: return self._is_sdxl @@ -667,6 +682,17 @@ def with_controlnet(self) -> bool: def with_controlnet(self, with_controlnet: bool): self._with_controlnet = with_controlnet + @property + def with_ip_adapter(self) -> bool: + return self._with_ip_adapter + + @with_ip_adapter.setter + def with_ip_adapter(self, with_ip_adapter: bool): + self._with_ip_adapter = with_ip_adapter + if with_ip_adapter: + self.mandatory_axes += ("image_encoder_shapes",) + setattr(self, "image_encoder_shapes", self.input_shapes["image_encoder_shapes"]) + @register_in_tasks_manager("pixart-transformer-2d", *["semantic-segmentation"], library_name="diffusers") class PixartTransformerNeuronConfig(VisionNeuronConfig): @@ -707,9 +733,6 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["out_hidden_states"] - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - @register_in_tasks_manager("controlnet", *["semantic-segmentation"], library_name="diffusers") class ControlNetNeuronConfig(VisionNeuronConfig): @@ -755,9 +778,6 @@ def inputs(self) -> List[str]: def outputs(self) -> List[str]: return ["down_block_res_samples", "mid_block_res_sample"] - def patch_model_for_export(self, model, dummy_inputs): - return self.CUSTOM_MODEL_WRAPPER(model, list(dummy_inputs.keys())) - @register_in_tasks_manager("vae-encoder", *["semantic-segmentation"], library_name="diffusers") class VaeEncoderNeuronConfig(VisionNeuronConfig): diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 2e1c15639..e8fac13f0 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -40,7 +40,7 @@ def forward(self, *inputs): if len(inputs) != len(self.input_names): raise ValueError( f"The model needs {len(self.input_names)} inputs: {self.input_names}." - f" But only {len(input)} inputs are passed." + f" But only {len(inputs)} inputs are passed." 
) ordered_inputs = dict(zip(self.input_names, inputs)) @@ -48,6 +48,8 @@ added_cond_kwargs = { "text_embeds": ordered_inputs.pop("text_embeds", None), "time_ids": ordered_inputs.pop("time_ids", None), + "image_embeds": ordered_inputs.pop("image_embeds", ordered_inputs.pop("image_enc_hidden_states", None)), } sample = ordered_inputs.pop("sample", None) timestep = ordered_inputs.pop("timestep").float().expand((sample.shape[0],)) @@ -568,6 +570,32 @@ return out_tuple["token_embeddings"], out_tuple["sentence_embedding"] +class CLIPVisionModelNeuronWrapper(torch.nn.Module): + def __init__( + self, + model, + input_names: List[str], + output_hidden_states: bool = True, + ): + super().__init__() + self.model = model + self.input_names = input_names + self.output_hidden_states = output_hidden_states + + def forward(self, pixel_values): + vision_outputs = self.model.vision_model( + pixel_values=pixel_values, output_hidden_states=self.output_hidden_states + ) + pooled_output = vision_outputs[1] + image_embeds = self.model.visual_projection(pooled_output) + + outputs = (image_embeds, vision_outputs.last_hidden_state) + + if self.output_hidden_states: + outputs += (vision_outputs.hidden_states,) + return outputs + + class SentenceTransformersCLIPNeuronWrapper(torch.nn.Module): def __init__(self, model, input_names: List[str]): super().__init__() diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 248de9bb7..556c1dc6a 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -18,7 +18,7 @@ import os from collections import OrderedDict from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -32,6 +32,8 @@ DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, ENCODER_NAME, + InputShapesArguments, + LoRAAdapterArguments, get_attention_scores_sd, get_attention_scores_sdxl, neuron_scaled_dot_product_attention, @@ -64,6 +66,7 @@ StableDiffusionXLPipeline, UNet2DConditionModel, ) + from diffusers.models import ImageProjection from diffusers.models.attention_processor import Attention @@ -116,19 +119,17 @@ def build_stable_diffusion_components_mandatory_shapes( def get_diffusion_models_for_export( pipeline: "DiffusionPipeline", - text_encoder_input_shapes: Dict[str, int], - unet_input_shapes: Dict[str, int], - transformer_input_shapes: Dict[str, int], - vae_encoder_input_shapes: Dict[str, int], - vae_decoder_input_shapes: Dict[str, int], + text_encoder_input_shapes: Dict[str, Any], + unet_input_shapes: Dict[str, Any], + transformer_input_shapes: Dict[str, Any], + vae_encoder_input_shapes: Dict[str, Any], + vae_decoder_input_shapes: Dict[str, Any], + lora_args: LoRAAdapterArguments, dynamic_batch_size: Optional[bool] = False, output_hidden_states: bool = False, - lora_model_ids: Optional[List[str]] = None, - lora_weight_names: Optional[List[str]] = None, - lora_adapter_names: Optional[List[str]] = None, - lora_scales: Optional[List[float]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, - controlnet_input_shapes: Optional[Dict[str, int]] = None, + controlnet_input_shapes: Optional[Dict[str, Any]] = None, + image_encoder_input_shapes: Optional[Dict[str, Any]] = None, ) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "NeuronDefaultConfig"]]: """ Returns the components of a Stable Diffusion model 
and their corresponding neuron configs. @@ -139,33 +140,29 @@ Args: pipeline ([`"DiffusionPipeline"`]): The model to export. - text_encoder_input_shapes (`Dict[str, int]`): + text_encoder_input_shapes (`Dict[str, Any]`): Static shapes used for compiling text encoder. - unet_input_shapes (`Dict[str, int]`): + unet_input_shapes (`Dict[str, Any]`): Static shapes used for compiling unet. - transformer_input_shapes (`Dict[str, int]`): + transformer_input_shapes (`Dict[str, Any]`): Static shapes used for compiling diffusion transformer. - vae_encoder_input_shapes (`Dict[str, int]`): + vae_encoder_input_shapes (`Dict[str, Any]`): Static shapes used for compiling vae encoder. - vae_decoder_input_shapes (`Dict[str, int]`): + vae_decoder_input_shapes (`Dict[str, Any]`): Static shapes used for compiling vae decoder. + lora_args (`LoRAAdapterArguments`): + Arguments for fetching the lora adapters, including `model_ids`, `weight_names`, `adapter_names` and `scales`. dynamic_batch_size (`bool`, defaults to `False`): Whether the Neuron compiled model supports dynamic batch size. output_hidden_states (`bool`, defaults to `False`): Whether or not for the traced text encoders to return the hidden states of all layers. - lora_model_ids (`Optional[List[str]]`, defaults to `None`): - List of model ids (eg. `ostris/super-cereal-sdxl-lora`) of pretrained lora models hosted on the Hub or paths to local directories containing the lora weights. - lora_weight_names (`Optional[List[str]]`, defaults to `None`): - List of lora weights file names. - lora_adapter_names (`Optional[List[str]]`, defaults to `None`): - List of adapter names to be used for referencing the loaded adapter models. - lora_scales (`Optional[List[float]]`, defaults to `None`): - List of scaling factors for lora adapters. controlnet_ids (`Optional[Union[str, List[str]]]`, defaults to `None`): Model ID of one or multiple ControlNets providing additional conditioning to the `unet` during the denoising process. If you set multiple ControlNets as a list, the outputs from each ControlNet are added together to create one combined additional conditioning. - controlnet_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`): + controlnet_input_shapes (`Optional[Dict[str, Any]]`, defaults to `None`): Static shapes used for compiling ControlNets. + image_encoder_input_shapes (`Optional[Dict[str, Any]]`, defaults to `None`): + Static shapes used for compiling the image encoder. 
Returns: `Dict[str, Tuple[Union[`PreTrainedModel`, `ModelMixin`], `NeuronDefaultConfig`]`: A Dict containing the model and @@ -173,10 +170,7 @@ """ models_for_export = get_submodels_for_export_diffusion( pipeline=pipeline, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, + lora_args=lora_args, controlnet_ids=controlnet_ids, ) library_name = "diffusers" @@ -195,7 +189,7 @@ task="feature-extraction", dynamic_batch_size=dynamic_batch_size, output_hidden_states=output_hidden_states, - **text_encoder_input_shapes, + input_shapes=text_encoder_input_shapes, ) models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = (text_encoder, text_encoder_neuron_config) @@ -213,7 +207,7 @@ task="feature-extraction", dynamic_batch_size=dynamic_batch_size, output_hidden_states=output_hidden_states, - **text_encoder_input_shapes, + input_shapes=text_encoder_input_shapes, ) models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2) @@ -232,14 +226,14 @@ task="semantic-segmentation", dynamic_batch_size=dynamic_batch_size, float_dtype=unet.dtype, - **unet_input_shapes, + input_shapes=unet_input_shapes, ) is_stable_diffusion_xl = isinstance( pipeline, (StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline) ) unet_neuron_config.is_sdxl = is_stable_diffusion_xl unet_neuron_config.with_controlnet = True if controlnet_ids else False + unet_neuron_config.with_ip_adapter = getattr(unet.config, "encoder_hid_dim_type", None) == "ip_image_proj" models_for_export[DIFFUSION_MODEL_UNET_NAME] = (unet, unet_neuron_config) @@ -261,7 +255,7 @@ task="semantic-segmentation", dynamic_batch_size=dynamic_batch_size, float_dtype=transformer.dtype, - **transformer_input_shapes, + input_shapes=transformer_input_shapes, ) models_for_export[DIFFUSION_MODEL_TRANSFORMER_NAME] = (transformer, transformer_neuron_config) @@ -279,7 +273,7 @@ task="semantic-segmentation", dynamic_batch_size=dynamic_batch_size, float_dtype=vae_encoder.dtype, - **vae_encoder_input_shapes, + input_shapes=vae_encoder_input_shapes, ) models_for_export[DIFFUSION_MODEL_VAE_ENCODER_NAME] = (vae_encoder, vae_encoder_neuron_config) @@ -297,7 +291,7 @@ task="semantic-segmentation", dynamic_batch_size=dynamic_batch_size, float_dtype=transformer.dtype if transformer else vae_decoder.dtype, - **vae_decoder_input_shapes, + input_shapes=vae_decoder_input_shapes, ) models_for_export[DIFFUSION_MODEL_VAE_DECODER_NAME] = (vae_decoder, vae_decoder_neuron_config) @@ -320,54 +314,57 @@ task="semantic-segmentation", dynamic_batch_size=dynamic_batch_size, float_dtype=controlnet.dtype, - **controlnet_input_shapes, + input_shapes=controlnet_input_shapes, ) models_for_export[controlnet_name] = ( controlnet, controlnet_neuron_config, ) + # IP-Adapter: need to compile the image encoder + if "image_encoder" in models_for_export: + image_encoder = models_for_export["image_encoder"] + output_hidden_states = not isinstance(unet.encoder_hid_proj.image_projection_layers[0], ImageProjection) + image_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=image_encoder, + exporter="neuron", 
task="feature-extraction", + model_type="clip-vision-model", + library_name=library_name, + ) + image_encoder_neuron_config = image_encoder_config_constructor( + image_encoder.config, + task="feature-extraction", + dynamic_batch_size=dynamic_batch_size, + output_hidden_states=output_hidden_states, + input_shapes=image_encoder_input_shapes, + ) + models_for_export["image_encoder"] = (image_encoder, image_encoder_neuron_config) + models_for_export[DIFFUSION_MODEL_UNET_NAME][1].image_encoder_output_hidden_states = output_hidden_states + return models_for_export -def _load_lora_weights_to_pipeline( - pipeline: "DiffusionPipeline", - lora_model_ids: Optional[Union[str, List[str]]] = None, - weight_names: Optional[Union[str, List[str]]] = None, - adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[Union[float, List[float]]] = None, -): - if isinstance(lora_model_ids, str): - lora_model_ids = [ - lora_model_ids, - ] - if isinstance(weight_names, str): - weight_names = [ - weight_names, - ] - if isinstance(adapter_names, str): - adapter_names = [ - adapter_names, - ] - if isinstance(lora_scales, float): - lora_scales = [ - lora_scales, - ] - if lora_model_ids and weight_names: - if len(lora_model_ids) == 1: - pipeline.load_lora_weights(lora_model_ids[0], weight_name=weight_names[0]) +def _load_lora_weights_to_pipeline(pipeline: "DiffusionPipeline", lora_args: Optional[LoRAAdapterArguments]): + if lora_args is None: + lora_args = LoRAAdapterArguments() + if lora_args.model_ids and lora_args.weight_names: + if len(lora_args.model_ids) == 1: + pipeline.load_lora_weights(lora_args.model_ids[0], weight_name=lora_args.weight_names[0]) # For tracing the lora weights, we need to use PEFT to fuse adapters directly into the model weights. It won't work by passing the lora scale to the Neuron pipeline during the inference. - pipeline.fuse_lora(lora_scale=lora_scales[0] if lora_scales else 1.0) - elif len(lora_model_ids) > 1: - if not len(lora_model_ids) == len(weight_names) == len(adapter_names): + pipeline.fuse_lora(lora_scale=lora_args.scales[0] if lora_args.scales else 1.0) + elif len(lora_args.model_ids) > 1: + if not len(lora_args.model_ids) == len(lora_args.weight_names) == len(lora_args.adapter_names): raise ValueError( - f"weight_name and lora_scale are required to fuse more than one lora. You have {len(lora_model_ids)} lora models to fuse, but you have {len(weight_names)} lora weight names and {len(adapter_names)} adapter names." + f"weight_name and lora_scale are required to fuse more than one lora. You have {len(lora_args.model_ids)} lora models to fuse, but you have {len(lora_args.weight_names)} lora weight names and {len(lora_args.adapter_names)} adapter names." 
) - for model_id, weight_name, adapter_name in zip(lora_model_ids, weight_names, adapter_names): + for model_id, weight_name, adapter_name in zip( + lora_args.model_ids, lora_args.weight_names, lora_args.adapter_names + ): pipeline.load_lora_weights(model_id, weight_name=weight_name, adapter_name=adapter_name) - if lora_scales: - pipeline.set_adapters(adapter_names, adapter_weights=lora_scales) + if lora_args.scales: + pipeline.set_adapters(lora_args.adapter_names, adapter_weights=lora_args.scales) pipeline.fuse_lora() return pipeline @@ -386,11 +383,8 @@ def load_controlnets(controlnet_ids: Optional[Union[str, List[str]]] = None): def get_submodels_for_export_diffusion( pipeline: "DiffusionPipeline", + lora_args: LoRAAdapterArguments, output_hidden_states: bool = False, - lora_model_ids: Optional[Union[str, List[str]]] = None, - lora_weight_names: Optional[Union[str, List[str]]] = None, - lora_adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[List[float]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: """ @@ -401,13 +395,7 @@ def get_submodels_for_export_diffusion( ) # Lora - pipeline = _load_lora_weights_to_pipeline( - pipeline=pipeline, - lora_model_ids=lora_model_ids, - weight_names=lora_weight_names, - adapter_names=lora_adapter_names, - lora_scales=lora_scales, - ) + pipeline = _load_lora_weights_to_pipeline(pipeline=pipeline, lora_args=lora_args) models_for_export = [] @@ -494,6 +482,11 @@ def attention_wrapper(query, key, value, attn_mask=None, dropout_p=None, is_caus controlnet.config.time_cond_proj_dim = pipeline.unet.config.time_cond_proj_dim models_for_export.append((DIFFUSION_MODEL_CONTROLNET_NAME + "_" + str(idx), controlnet)) + # Image Encoder + image_encoder = getattr(pipeline, "image_encoder", None) + if image_encoder is not None: + models_for_export.append(("image_encoder", copy.deepcopy(image_encoder))) + return OrderedDict(models_for_export) @@ -598,12 +591,13 @@ def get_encoder_decoder_models_for_export( library_name="transformers", ) check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) + input_shape_args = InputShapesArguments(**input_shapes) encoder_neuron_config = encoder_config_constructor( config=model.config, task=task, dynamic_batch_size=dynamic_batch_size, tensor_parallel_size=tensor_parallel_size, - **input_shapes, + input_shapes=input_shape_args, ) if not tensor_parallel_size > 1: models_for_export[ENCODER_NAME] = (model, encoder_neuron_config) @@ -623,7 +617,7 @@ def get_encoder_decoder_models_for_export( task=task, library_name="transformers", ) - check_mandatory_input_shapes(encoder_config_constructor, task, input_shapes) + check_mandatory_input_shapes(decoder_config_constructor, task, input_shapes) decoder_neuron_config = decoder_config_constructor( config=model.config, task=task, @@ -631,7 +625,7 @@ def get_encoder_decoder_models_for_export( tensor_parallel_size=tensor_parallel_size, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - **input_shapes, + input_shapes=input_shape_args, ) if not tensor_parallel_size > 1: models_for_export[DECODER_NAME] = (model, decoder_neuron_config) diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 944d76ebe..703c1297e 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -22,12 +22,14 @@ import shutil from abc import abstractmethod from collections import OrderedDict +from 
dataclasses import asdict from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union import torch from huggingface_hub import snapshot_download +from torch.nn import ModuleList from transformers import CLIPFeatureExtractor, CLIPTokenizer, PretrainedConfig, T5Tokenizer from transformers.modeling_outputs import ModelOutput @@ -43,6 +45,7 @@ from .modeling_traced import NeuronTracedModel from .utils import ( DIFFUSION_MODEL_CONTROLNET_NAME, + DIFFUSION_MODEL_IMAGE_ENCODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_TRANSFORMER_NAME, @@ -51,6 +54,7 @@ DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME, DiffusersPretrainedConfig, + NeuronArgumentParser, check_if_weights_replacable, is_neuronx_available, replace_weights, @@ -93,6 +97,7 @@ from diffusers.image_processor import PixArtImageProcessor, VaeImageProcessor from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution from diffusers.models.controlnet import ControlNetOutput + from diffusers.models.embeddings import ImageProjection, IPAdapterFullImageProjection from diffusers.models.modeling_outputs import AutoencoderKLOutput from diffusers.pipelines.controlnet import MultiControlNetModel from diffusers.pipelines.pipeline_utils import DiffusionPipeline @@ -137,6 +142,7 @@ class NeuronDiffusionPipelineBase(NeuronTracedModel): "transformer", "feature_extractor", ] + encoder_hid_proj = None # A dummy stand-in for the unet/transformer's `encoder_hid_proj` module, used when they take the outputs of the image encoder. def __init__( self, @@ -327,7 +333,16 @@ self.scheduler = LCMScheduler.from_config(self.scheduler.config) self.feature_extractor = feature_extractor - self.image_encoder = image_encoder # TODO: implement the class `NeuronImageEncoder`. + self.image_encoder = ( + NeuronModelImageEncoder( + image_encoder, + self, + self.configs[DIFFUSION_MODEL_IMAGE_ENCODER_NAME], + self.neuron_configs[DIFFUSION_MODEL_IMAGE_ENCODER_NAME], + ) + if image_encoder is not None and not isinstance(image_encoder, NeuronModelImageEncoder) + else image_encoder + ) self.safety_checker = safety_checker # TODO: implement the class `NeuronStableDiffusionSafetyChecker`. all_possible_init_args = { @@ -379,6 +394,8 @@ self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False ) + # Create dummy objects for inference with IP adapters + self._maybe_create_dummy_image_proj_layers() @staticmethod def is_lcm(unet_config): @@ -392,6 +409,7 @@ def load_model( data_parallel_mode: Optional[Literal["none", "unet", "transformer", "all"]], text_encoder_path: Optional[Union[str, Path]] = None, text_encoder_2_path: Optional[Union[str, Path]] = None, + image_encoder_path: Optional[Union[str, Path]] = None, unet_path: Optional[Union[str, Path]] = None, transformer_path: Optional[Union[str, Path]] = None, vae_encoder_path: Optional[Union[str, Path]] = None, @@ -412,6 +430,8 @@ Path of the compiled text encoder. text_encoder_2_path (`Optional[Union[str, Path]]`, defaults to `None`): Path of the compiled second frozen text encoder. SDXL only. + image_encoder_path (`Optional[Union[str, Path]]`, defaults to `None`): + Path of the compiled image encoder. unet_path (`Optional[Union[str, Path]]`, defaults to `None`): Path of the compiled U-NET. 
transformer_path (`Optional[Union[str, Path]]`, defaults to `None`): @@ -436,6 +456,7 @@ def load_model( "vae_encoder": vae_encoder_path, "vae_decoder": vae_decoder_path, "controlnet": controlnet_paths, + "image_encoder": image_encoder_path, } def _load_models_to_neuron(submodels, models_on_both_cores=None, models_on_a_single_core=None): @@ -517,7 +538,15 @@ def _load_models_to_neuron(submodels, models_on_both_cores=None, models_on_a_sin def replace_weights(self, weights: Optional[Union[Dict[str, torch.Tensor], torch.nn.Module]] = None): check_if_weights_replacable(self.configs, weights) - model_names = ["text_encoder", "text_encoder_2", "unet", "transformer", "vae_decoder", "vae_encoder"] + model_names = [ + "text_encoder", + "text_encoder_2", + "unet", + "transformer", + "vae_decoder", + "vae_encoder", + "image_encoder", + ] for name in model_names: model = getattr(self, name, None) weight = getattr(weights, name, None) @@ -552,6 +581,7 @@ def _save_pretrained( vae_encoder_file_name: str = NEURON_FILE_NAME, vae_decoder_file_name: str = NEURON_FILE_NAME, controlnet_file_name: str = NEURON_FILE_NAME, + image_encoder_file_name: str = NEURON_FILE_NAME, ): """ Saves the model to the serialized format optimized for Neuron devices. @@ -576,6 +606,7 @@ def _remove_submodel_if_non_exist(model_names): DIFFUSION_MODEL_UNET_NAME, DIFFUSION_MODEL_TRANSFORMER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, + DIFFUSION_MODEL_IMAGE_ENCODER_NAME, ] ) @@ -604,6 +635,9 @@ def _remove_submodel_if_non_exist(model_names): DIFFUSION_MODEL_VAE_DECODER_NAME: save_directory / DIFFUSION_MODEL_VAE_DECODER_NAME / vae_decoder_file_name, + DIFFUSION_MODEL_IMAGE_ENCODER_NAME: save_directory + / DIFFUSION_MODEL_IMAGE_ENCODER_NAME + / image_encoder_file_name, } dst_paths[DIFFUSION_MODEL_CONTROLNET_NAME] = [ save_directory / (DIFFUSION_MODEL_CONTROLNET_NAME + f"_{str(idx)}") / controlnet_file_name @@ -662,6 +696,7 @@ def _from_pretrained( vae_encoder_file_name: Optional[str] = NEURON_FILE_NAME, vae_decoder_file_name: Optional[str] = NEURON_FILE_NAME, controlnet_file_name: Optional[str] = NEURON_FILE_NAME, + image_encoder_file_name: Optional[str] = NEURON_FILE_NAME, local_files_only: bool = False, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, data_parallel_mode: Optional[Literal["none", "unet", "transformer", "all"]] = None, @@ -683,6 +718,7 @@ def _from_pretrained( vae_encoder_file_name, vae_decoder_file_name, controlnet_file_name, + image_encoder_file_name, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name, @@ -723,6 +759,10 @@ def _from_pretrained( new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_NAME / text_encoder_2_file_name, new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_NAME / cls.sub_component_config_name, ), + "image_encoder": ( + new_model_save_dir / DIFFUSION_MODEL_IMAGE_ENCODER_NAME / image_encoder_file_name, + new_model_save_dir / DIFFUSION_MODEL_IMAGE_ENCODER_NAME / cls.sub_component_config_name, + ), "unet": ( new_model_save_dir / DIFFUSION_MODEL_UNET_NAME / unet_file_name, new_model_save_dir / DIFFUSION_MODEL_UNET_NAME / cls.sub_component_config_name, @@ -782,6 +822,7 @@ def _from_pretrained( vae_decoder_path=model_and_config_save_paths["vae_decoder"][0], vae_encoder_path=model_and_config_save_paths["vae_encoder"][0], text_encoder_2_path=model_and_config_save_paths["text_encoder_2"][0], + image_encoder_path=model_and_config_save_paths["image_encoder"][0], controlnet_paths=model_and_config_save_paths["controlnet"][0], 
dynamic_batch_size=neuron_configs[DIFFUSION_MODEL_TEXT_ENCODER_NAME].dynamic_batch_size, to_neuron=not inline_weights_to_neff, @@ -798,6 +839,7 @@ vae_encoder=pipe.get("vae_encoder"), vae_decoder=pipe.get("vae_decoder"), controlnet=pipe.get("controlnet"), + image_encoder=pipe.get("image_encoder"), config=config, tokenizer=sub_models.get("tokenizer", None), tokenizer_2=sub_models.get("tokenizer_2", None), @@ -841,12 +883,8 @@ def _export( dynamic_batch_size: bool = False, output_hidden_states: bool = False, data_parallel_mode: Optional[Literal["none", "unet", "transformer", "all"]] = None, - lora_model_ids: Optional[Union[str, List[str]]] = None, - lora_weight_names: Optional[Union[str, List[str]]] = None, - lora_adapter_names: Optional[Union[str, List[str]]] = None, - lora_scales: Optional[Union[float, List[float]]] = None, controlnet_ids: Optional[Union[str, List[str]]] = None, - **kwargs_shapes, + **kwargs, ) -> "NeuronDiffusionPipelineBase": """ Args: @@ -919,9 +957,21 @@ Lora adapters scaling factors. controlnet_ids (`Optional[Union[str, List[str]]]`, defaults to `None`): List of ControlNet model ids (eg. `thibaud/controlnet-openpose-sdxl-1.0`)." - kwargs_shapes (`Dict[str, int]`): - Shapes to use during inference. This argument allows to override the default shapes used during the export. + ip_adapter_id (`Optional[Union[str, List[str]]]`, defaults to `None`): + Model ids (e.g. `h94/IP-Adapter`) of IP-Adapter models hosted on the Hub or paths to local directories containing the IP-Adapter weights. + ip_adapter_subfolder (`Optional[Union[str, List[str]]]`, defaults to `None`): + The subfolder location of a model file within a larger model repository on the Hub or locally. If a list is passed, it should have the same length as `ip_adapter_weight_name`. + ip_adapter_weight_name (`Optional[Union[str, List[str]]]`, defaults to `None`): + The name of the weight file to load. If a list is passed, it should have the same length as `ip_adapter_subfolder`. + ip_adapter_scale (`Optional[Union[float, List[float]]]`, defaults to `None`): + Scaling factors for the IP-Adapters. 
""" + # Parse kwargs to their dataclass + parser = NeuronArgumentParser(**kwargs) + lora_args = parser.lora_args + ip_adapter_args = parser.ip_adapter_args + kwargs_shapes = asdict(parser.input_shapes) + if task is None: if cls.task is not None: task = cls.task @@ -976,11 +1026,9 @@ def _export( local_files_only=local_files_only, token=token, submodels=submodels, + lora_args=lora_args, + ip_adapter_args=ip_adapter_args, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, torch_dtype=torch_dtype, controlnet_ids=controlnet_ids, **input_shapes_copy, @@ -1038,6 +1086,8 @@ def _export( model_name_or_path=model_id, output=save_dir_path, compiler_kwargs=compiler_kwargs, + lora_args=lora_args, + ip_adapter_args=ip_adapter_args, torch_dtype=torch_dtype, task=task, dynamic_batch_size=dynamic_batch_size, @@ -1055,10 +1105,6 @@ def _export( do_validation=False, submodels={"unet": unet_id}, output_hidden_states=output_hidden_states, - lora_model_ids=lora_model_ids, - lora_weight_names=lora_weight_names, - lora_adapter_names=lora_adapter_names, - lora_scales=lora_scales, controlnet_ids=controlnet_ids, library_name=cls.library_name, **input_shapes, @@ -1112,6 +1158,10 @@ def do_classifier_free_guidance(self): ) ) + def _maybe_create_dummy_image_proj_layers(self): + if all([self.image_encoder, self.encoder_hid_proj]): + self.unet.encoder_hid_proj = self.encoder_hid_proj + def __call__(self, *args, **kwargs): # Height and width to unet/transformer (static shapes) unet_or_transformer = self.unet or self.transformer @@ -1204,6 +1254,42 @@ def modules(self): return [] +class NeuronModelImageEncoder(_NeuronDiffusionModelPart): + def __init__( + self, + model: torch.jit._script.ScriptModule, + parent_pipeline: NeuronDiffusionPipelineBase, + config: Optional[DiffusersPretrainedConfig] = None, + neuron_config: Optional[Dict[str, str]] = None, + ): + super().__init__(model, parent_pipeline, config, neuron_config, DIFFUSION_MODEL_IMAGE_ENCODER_NAME) + + def forward( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = True, + ): + inputs = (pixel_values,) + + outputs = self.model(*inputs) + + if return_dict: + outputs = ModelOutput(dict(zip(self.neuron_config.outputs, outputs))) + + return outputs + + # Create a dummy parameters to be compatible with `https://github.com/huggingface/diffusers/blob/c14057c8dbc32847bac9082bcc0ae00c9a19357d/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L514` + def parameters(self): + class DummyObject: + def __init__(self): + self.dtype = None + + return iter([DummyObject()]) + + class NeuronModelUnet(_NeuronDiffusionModelPart): def __init__( self, @@ -1240,9 +1326,15 @@ def forward( for idx in range(len(down_block_additional_residuals)): inputs = inputs + (down_block_additional_residuals[idx],) if added_cond_kwargs: - text_embeds = added_cond_kwargs.pop("text_embeds", None) - time_ids = added_cond_kwargs.pop("time_ids", None) - inputs = inputs + (text_embeds, time_ids) + optional_inputs_names = ["text_embeds", "time_ids", "image_embeds"] + for optional_input_name in optional_inputs_names: + optional_input = added_cond_kwargs.get(optional_input_name, None) + if isinstance(optional_input, List): + optional_input = ( + torch.stack(optional_input, dim=0) if len(optional_input) > 1 
else optional_input[0] + ) + if optional_input is not None: + inputs = inputs + (optional_input,) outputs = self.model(*inputs) if return_dict: @@ -1467,6 +1559,14 @@ class NeuronStableDiffusionPipeline(NeuronDiffusionPipelineBase, StableDiffusion main_input_name = "prompt" auto_model_class = StableDiffusionPipeline + class DummyEncoderHidProj: + def __init__(self): + self.image_projection_layers = ModuleList( + [IPAdapterFullImageProjection()] + ) # TODO: support multiple IP adapters + + encoder_hid_proj = DummyEncoderHidProj() + class NeuronStableDiffusionImg2ImgPipeline(NeuronDiffusionPipelineBase, StableDiffusionImg2ImgPipeline): main_input_name = "image" @@ -1522,6 +1622,12 @@ class NeuronStableDiffusionXLPipeline( main_input_name = "prompt" auto_model_class = StableDiffusionXLPipeline + class DummyEncoderHidProj: + def __init__(self): + self.image_projection_layers = ModuleList([ImageProjection()]) # TODO: support multiple IP adapters + + encoder_hid_proj = DummyEncoderHidProj() + class NeuronStableDiffusionXLImg2ImgPipeline( NeuronStableDiffusionXLPipelineMixin, NeuronDiffusionPipelineBase, StableDiffusionXLImg2ImgPipeline diff --git a/optimum/neuron/modeling_traced.py b/optimum/neuron/modeling_traced.py index ef92da0b0..0673cff99 100644 --- a/optimum/neuron/modeling_traced.py +++ b/optimum/neuron/modeling_traced.py @@ -32,6 +32,7 @@ from .modeling_base import NeuronModel from .utils import ( NEURON_FILE_NAME, + InputShapesArguments, check_if_weights_replacable, is_neuron_available, replace_weights, @@ -463,13 +464,15 @@ def _neuron_config_init(cls, config: "PretrainedConfig") -> "NeuronDefaultConfig library_name=cls.library_name, ) + compile_shapes = InputShapesArguments(**compile_shapes) return neuron_config_constructor( config, dynamic_batch_size=neuron_config.get("dynamic_batch_size", False), compiler_type=compiler_type, compiler_version=compiler_version, tensor_parallel_size=tensor_parallel_size, - **compile_shapes, + input_shapes=compile_shapes, + output_hidden_states=neuron_config.get("output_hidden_states", False), ) @classmethod diff --git a/optimum/neuron/pipelines/__init__.py b/optimum/neuron/pipelines/__init__.py index d45c704ff..bd342cf34 100644 --- a/optimum/neuron/pipelines/__init__.py +++ b/optimum/neuron/pipelines/__init__.py @@ -24,11 +24,13 @@ "NeuronStableDiffusionXLPipelineMixin", "NeuronStableDiffusionControlNetPipelineMixin", "NeuronStableDiffusionXLControlNetPipelineMixin", + "NeuronIPAdapterMixin", ], } if TYPE_CHECKING: from .diffusers import ( + NeuronIPAdapterMixin, NeuronStableDiffusionControlNetPipelineMixin, NeuronStableDiffusionXLControlNetPipelineMixin, NeuronStableDiffusionXLPipelineMixin, diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index 1a4179f26..704c117bb 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -19,11 +19,20 @@ _import_structure = { - "argument_utils": ["convert_neuronx_compiler_args_to_neuron", "store_compilation_config"], + "argument_utils": [ + "LoRAAdapterArguments", + "IPAdapterArguments", + "ImageEncoderArguments", + "InputShapesArguments", + "NeuronArgumentParser", + "convert_neuronx_compiler_args_to_neuron", + "store_compilation_config", + ], "constant": [ "DECODER_NAME", "DIFFUSION_MODEL_TEXT_ENCODER_2_NAME", "DIFFUSION_MODEL_TEXT_ENCODER_NAME", + "DIFFUSION_MODEL_IMAGE_ENCODER_NAME", "DIFFUSION_MODEL_UNET_NAME", "DIFFUSION_MODEL_TRANSFORMER_NAME", "DIFFUSION_MODEL_VAE_DECODER_NAME", @@ -54,6 +63,7 @@ "DummyMaskedPosGenerator", 
"DummyControNetInputGenerator", "ASTDummyAudioInputGenerator", + "DummyIPAdapterInputGenerator", ], "misc": [ "DiffusersPretrainedConfig", @@ -87,10 +97,19 @@ } if TYPE_CHECKING: - from .argument_utils import convert_neuronx_compiler_args_to_neuron, store_compilation_config + from .argument_utils import ( + ImageEncoderArguments, + InputShapesArguments, + IPAdapterArguments, + LoRAAdapterArguments, + NeuronArgumentParser, + convert_neuronx_compiler_args_to_neuron, + store_compilation_config, + ) from .constant import ( DECODER_NAME, DIFFUSION_MODEL_CONTROLNET_NAME, + DIFFUSION_MODEL_IMAGE_ENCODER_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, DIFFUSION_MODEL_TEXT_ENCODER_NAME, DIFFUSION_MODEL_TRANSFORMER_NAME, @@ -121,6 +140,7 @@ ASTDummyAudioInputGenerator, DummyBeamValuesGenerator, DummyControNetInputGenerator, + DummyIPAdapterInputGenerator, DummyMaskedPosGenerator, ) from .misc import ( diff --git a/optimum/neuron/utils/argument_utils.py b/optimum/neuron/utils/argument_utils.py index 71f723eb0..a64261d0b 100644 --- a/optimum/neuron/utils/argument_utils.py +++ b/optimum/neuron/utils/argument_utils.py @@ -15,6 +15,7 @@ """Utilities related to CLI arguments.""" import os +from dataclasses import asdict, dataclass, fields, is_dataclass from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from ...utils import logging @@ -29,6 +30,108 @@ DISABLE_STRICT_MODE = os.environ.get("OPTIMUM_DISABLE_STRICT_MODE", "0") +@dataclass +class LoRAAdapterArguments: + model_ids: Optional[Union[str, List[str]]] = None + weight_names: Optional[Union[str, List[str]]] = None + adapter_names: Optional[Union[str, List[str]]] = None + scales: Optional[Union[float, List[float]]] = None + + def __post_init__(self): + if isinstance(self.model_ids, str): + self.model_ids = [ + self.model_ids, + ] + if isinstance(self.weight_names, str): + self.weight_names = [ + self.weight_names, + ] + if isinstance(self.adapter_names, str): + self.adapter_names = [ + self.adapter_names, + ] + if isinstance(self.scales, float): + self.scales = [ + self.scales, + ] + + +@dataclass +class IPAdapterArguments: + model_id: Optional[Union[str, List[str]]] = None + subfolder: Optional[Union[str, List[str]]] = None + weight_name: Optional[Union[str, List[str]]] = None + scale: Optional[Union[float, List[float]]] = None + + +@dataclass +class ImageEncoderArguments: + sequence_length: Optional[int] = None + hidden_size: Optional[int] = None + projection_dim: Optional[int] = None + + +@dataclass +class InputShapesArguments: + batch_size: Optional[int] = None + text_batch_size: Optional[int] = None + image_batch_size: Optional[int] = None + sequence_length: Optional[int] = None + num_choices: Optional[int] = None + width: Optional[int] = None + height: Optional[int] = None + image_size: Optional[int] = None + patch_size: Optional[int] = None + num_channels: Optional[int] = None + feature_size: Optional[int] = None + nb_max_frames: Optional[int] = None + audio_sequence_length: Optional[int] = None + point_batch_size: Optional[int] = None + nb_points_per_image: Optional[int] = None + num_beams: Optional[int] = None + vae_scale_factor: Optional[int] = None + encoder_hidden_size: Optional[int] = None + image_encoder_shapes: Optional[ImageEncoderArguments] = None + + +class DataclassParser: + def __init__(self, **kwargs): + for name, cls in self.__class__.__annotations__.items(): + if is_dataclass(cls): + parsed_kwargs = {k: v for k, v in kwargs.items() if k in {f.name for f in fields(cls)}} + setattr(self, f"{name}", 
diff --git a/optimum/neuron/utils/constant.py b/optimum/neuron/utils/constant.py
index dbf600bd7..5dc9a3bec 100644
--- a/optimum/neuron/utils/constant.py
+++ b/optimum/neuron/utils/constant.py
@@ -24,5 +24,6 @@
 DIFFUSION_MODEL_VAE_ENCODER_NAME = "vae_encoder"
 DIFFUSION_MODEL_VAE_DECODER_NAME = "vae_decoder"
 DIFFUSION_MODEL_CONTROLNET_NAME = "controlnet"
+DIFFUSION_MODEL_IMAGE_ENCODER_NAME = "image_encoder"
 
 NEURON_BINARIES_PATH = "/opt/aws/neuron/bin"
diff --git a/optimum/neuron/utils/input_generators.py b/optimum/neuron/utils/input_generators.py
index c3cceddd3..85eb43544 100644
--- a/optimum/neuron/utils/input_generators.py
+++ b/optimum/neuron/utils/input_generators.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Dummy input generation classes."""
 
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -27,6 +27,10 @@
 )
 
 
+if TYPE_CHECKING:
+    from .argument_utils import ImageEncoderArguments
+
+
 class DummyBeamValuesGenerator(DummyInputGenerator):
     """
     Generates dummy beam search inputs.
@@ -166,6 +170,49 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
 
 
+class DummyIPAdapterInputGenerator(DummyInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        # Unet extra inputs
+        "image_embeds",  # If `unet.encoder_hid_proj.image_projection_layers` are instances of `IPAdapterFullImageProjection`, e.g. sd.
+        "image_enc_hidden_states",  # If `unet.encoder_hid_proj.image_projection_layers` are instances of `ImageProjection`, e.g. sdxl.
+        "ip_adapter_masks",
+    )
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int,
+        image_encoder_shapes: Optional["ImageEncoderArguments"] = None,
+        **kwargs,
+    ):
+        self.task = task
+        self.normalized_config = normalized_config
+        self.batch_size = batch_size
+        self.image_encoder_shapes = image_encoder_shapes
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "image_enc_hidden_states":
+            shape = [
+                self.batch_size,
+                1,
+                self.image_encoder_shapes.sequence_length,
+                self.image_encoder_shapes.hidden_size,
+            ]
+            return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+        elif input_name == "image_embeds":
+            shape = [self.batch_size, 1, self.image_encoder_shapes.projection_dim]
+            return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+        elif input_name == "ip_adapter_masks":
+            shape = [
+                self.batch_size,
+                1,
+                self.image_encoder_shapes.sequence_length,
+                self.image_encoder_shapes.hidden_size,
+            ]
+            return self.random_int_tensor(shape, framework=framework, dtype=int_dtype)
+
+
 # copied from https://github.com/huggingface/optimum/blob/171020c775cec6ff77826c3f5f5e5c1498b23f81/optimum/exporters/onnx/model_configs.py#L1363C1-L1368C111
 class ASTDummyAudioInputGenerator(DummyAudioInputGenerator):
     def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
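# A minimal sketch (not part of the patch) of the dummy tensors the generator
# above produces. The encoder shapes mimic a CLIP-ViT-H-like image encoder
# (257 patch positions, hidden size 1280, projection dim 1024) and are
# illustrative only; `normalized_config` is unused by `generate`, so `None`
# is passed here for brevity.
from optimum.neuron.utils import ImageEncoderArguments
from optimum.neuron.utils.input_generators import DummyIPAdapterInputGenerator

dummy_gen = DummyIPAdapterInputGenerator(
    task="text-to-image",
    normalized_config=None,
    batch_size=1,
    image_encoder_shapes=ImageEncoderArguments(sequence_length=257, hidden_size=1280, projection_dim=1024),
)
assert tuple(dummy_gen.generate("image_enc_hidden_states").shape) == (1, 1, 257, 1280)
assert tuple(dummy_gen.generate("image_embeds").shape) == (1, 1, 1024)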
+ "ip_adapter_masks", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int, + image_encoder_shapes: Optional["ImageEncoderArguments"] = None, + **kwargs, + ): + self.task = task + self.normalized_config = normalized_config + self.batch_size = batch_size + self.image_encoder_shapes = image_encoder_shapes + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "image_enc_hidden_states": + shape = [ + self.batch_size, + 1, + self.image_encoder_shapes.sequence_length, + self.image_encoder_shapes.hidden_size, + ] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + elif input_name == "image_embeds": + shape = [self.batch_size, 1, self.image_encoder_shapes.projection_dim] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + elif input_name == "ip_adapter_masks": + shape = [ + self.batch_size, + 1, + self.image_encoder_shapes.sequence_length, + self.image_encoder_shapes.hidden_size, + ] + return self.random_int_tensor(shape, framework=framework, dtype=int_dtype) + + # copied from https://github.com/huggingface/optimum/blob/171020c775cec6ff77826c3f5f5e5c1498b23f81/optimum/exporters/onnx/model_configs.py#L1363C1-L1368C111 class ASTDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py index d71d065d2..70e43f244 100644 --- a/tests/cache/test_neuronx_cache.py +++ b/tests/cache/test_neuronx_cache.py @@ -96,7 +96,7 @@ def export_stable_diffusion_model(model_id): batch_size = 1 height = 64 width = 64 - num_images_per_prompt = 4 + num_images_per_prompt = 1 return NeuronStableDiffusionPipeline.from_pretrained( model_id, export=True, @@ -113,7 +113,7 @@ def export_stable_diffusion_xl_model(model_id): batch_size = 1 height = 64 width = 64 - num_images_per_prompt = 4 + num_images_per_prompt = 1 return NeuronStableDiffusionXLPipeline.from_pretrained( model_id, export=True, @@ -143,7 +143,7 @@ def check_encoder_inference(model, tokenizer): def check_stable_diffusion_inference(model): prompts = ["sailing ship in storm by Leonardo da Vinci"] - image = model(prompts, num_images_per_prompt=4).images[0] + image = model(prompts, num_images_per_prompt=1).images[0] assert isinstance(image, PIL.Image.Image) diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 167ea2d6c..2d7f7e33d 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -35,7 +35,7 @@ from optimum.exporters.neuron.__main__ import get_submodels_and_neuron_configs from optimum.exporters.neuron.model_configs import * # noqa: F403 from optimum.exporters.tasks import TasksManager -from optimum.neuron.utils import is_neuron_available +from optimum.neuron.utils import InputShapesArguments, LoRAAdapterArguments, is_neuron_available from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available, logging from optimum.utils.testing_utils import require_diffusers, require_sentence_transformers @@ -84,13 +84,14 @@ def _get_models_to_test( for model_name, tasks in model_tasks.items(): for task in tasks: default_shapes = dict(DEFAULT_DUMMY_SHAPES) + default_shapes = InputShapesArguments(**default_shapes) neuron_config_constructor = 
diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py
index 167ea2d6c..2d7f7e33d 100644
--- a/tests/exporters/test_export.py
+++ b/tests/exporters/test_export.py
@@ -35,7 +35,7 @@
 from optimum.exporters.neuron.__main__ import get_submodels_and_neuron_configs
 from optimum.exporters.neuron.model_configs import *  # noqa: F403
 from optimum.exporters.tasks import TasksManager
-from optimum.neuron.utils import is_neuron_available
+from optimum.neuron.utils import InputShapesArguments, LoRAAdapterArguments, is_neuron_available
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available, logging
 from optimum.utils.testing_utils import require_diffusers, require_sentence_transformers
@@ -84,13 +84,14 @@ def _get_models_to_test(
         for model_name, tasks in model_tasks.items():
             for task in tasks:
                 default_shapes = dict(DEFAULT_DUMMY_SHAPES)
+                default_shapes = InputShapesArguments(**default_shapes)
                 neuron_config_constructor = TasksManager.get_exporter_config_constructor(
                     model_type=model_type,
                     exporter="neuron",
                     library_name=library_name,
                     task=task,
                     model_name=model_name,
-                    exporter_config_kwargs={**default_shapes},
+                    exporter_config_kwargs={"input_shapes": default_shapes},
                 )
 
                 models_to_test.append(
@@ -143,8 +144,12 @@ def _neuronx_export(
             name: DEFAULT_DUMMY_SHAPES.get(name) or EXTREA_DEFAULT_DUMMY_SHAPES.get(name)
             for name in neuron_config_constructor.func.get_mandatory_axes_for_task(task)
         }
+        mandatory_shapes = InputShapesArguments(**mandatory_shapes)
         neuron_config = neuron_config_constructor(
-            config=config, task=task, dynamic_batch_size=dynamic_batch_size, **mandatory_shapes
+            config=config,
+            task=task,
+            dynamic_batch_size=dynamic_batch_size,
+            input_shapes=mandatory_shapes,
         )
 
         atol = neuron_config.ATOL_FOR_VALIDATION
@@ -218,8 +223,9 @@ def test_export_for_stable_diffusion_models(self, model_id):
         # prepare neuron config / models
         model = StableDiffusionPipeline.from_pretrained(model_id)
         input_shapes = build_stable_diffusion_components_mandatory_shapes(
-            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4}
+            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 1}
         )
+        compiler_kwargs = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
 
         with TemporaryDirectory() as tmpdirname:
             models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs(
@@ -234,6 +240,7 @@ def test_export_for_stable_diffusion_models(self, model_id):
                 models_and_neuron_configs=models_and_neuron_configs,
                 output_dir=Path(tmpdirname),
                 output_file_names=output_model_names,
+                compiler_kwargs=compiler_kwargs,
             )
             validate_models_outputs(
                 models_and_neuron_configs=models_and_neuron_configs,
@@ -249,8 +256,9 @@ def test_export_for_stable_diffusion_xl_models(self, model_id):
         # prepare neuron config / models
         model = StableDiffusionXLPipeline.from_pretrained(model_id)
         input_shapes = build_stable_diffusion_components_mandatory_shapes(
-            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4}
+            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 1}
         )
+        compiler_kwargs = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
 
         with TemporaryDirectory() as tmpdirname:
             models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs(
@@ -265,6 +273,7 @@ def test_export_for_stable_diffusion_xl_models(self, model_id):
                 models_and_neuron_configs=models_and_neuron_configs,
                 output_dir=Path(tmpdirname),
                 output_file_names=output_model_names,
+                compiler_kwargs=compiler_kwargs,
             )
             validate_models_outputs(
                 models_and_neuron_configs=models_and_neuron_configs,
@@ -281,8 +290,15 @@ def test_export_sd_with_fused_lora_weights(self):
         # prepare neuron config / models
         model = StableDiffusionPipeline.from_pretrained(model_id)
         input_shapes = build_stable_diffusion_components_mandatory_shapes(
-            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 4}
+            **{"batch_size": 1, "height": 64, "width": 64, "num_images_per_prompt": 1}
         )
+        lora_args = LoRAAdapterArguments(
+            model_ids=lora_params[0],
+            weight_names=lora_params[1],
+            adapter_names=lora_params[2],
+            scales=0.9,
+        )
+        compiler_kwargs = {"auto_cast": "matmul", "auto_cast_type": "bf16"}
 
         with TemporaryDirectory() as tmpdirname:
             models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs(
@@ -292,15 +308,13 @@ def test_export_sd_with_fused_lora_weights(self):
                 library_name="diffusers",
                 output=Path(tmpdirname),
                 model_name_or_path=model_id,
-                lora_model_ids=lora_params[0],
-                lora_weight_names=lora_params[1],
-                lora_adapter_names=lora_params[2],
-                lora_scales=0.9,
+                lora_args=lora_args,
             )
             _, neuron_outputs = export_models(
                 models_and_neuron_configs=models_and_neuron_configs,
                 output_dir=Path(tmpdirname),
                 output_file_names=output_model_names,
+                compiler_kwargs=compiler_kwargs,
             )
             validate_models_outputs(
                 models_and_neuron_configs=models_and_neuron_configs,
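# A minimal sketch (not part of the patch) of the normalization the LoRA test
# above relies on: `LoRAAdapterArguments.__post_init__` wraps scalar values
# into single-element lists, so one adapter and several adapters go through
# the same code path. The ids and names below are hypothetical placeholders.
from optimum.neuron.utils import LoRAAdapterArguments

lora_args = LoRAAdapterArguments(
    model_ids="hypothetical/lora-repo",
    weight_names="pytorch_lora_weights.safetensors",
    adapter_names="style",
    scales=0.9,
)
assert lora_args.model_ids == ["hypothetical/lora-repo"]
assert lora_args.scales == [0.9]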
diff --git a/tests/inference/test_stable_diffusion_pipeline.py b/tests/inference/test_stable_diffusion_pipeline.py
index 616e31ce4..d44cc80e8 100644
--- a/tests/inference/test_stable_diffusion_pipeline.py
+++ b/tests/inference/test_stable_diffusion_pipeline.py
@@ -70,7 +70,7 @@ class NeuronStableDiffusionPipelineIntegrationTest(unittest.TestCase):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES, skip_on_empty=True)
     def test_export_and_inference_non_dyn(self, model_arch):
-        num_images_per_prompt = 4
+        num_images_per_prompt = 1
         input_shapes = copy.deepcopy(self.STATIC_INPUTS_SHAPES)
         input_shapes.update({"num_images_per_prompt": num_images_per_prompt})
         neuron_pipeline = self.NEURON_MODEL_CLASS.from_pretrained(
@@ -169,7 +169,7 @@ def test_lcm_export_and_inference(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES, skip_on_empty=True)
     def test_export_and_inference_with_fused_lora(self, model_arch):
-        num_images_per_prompt = 4
+        num_images_per_prompt = 1
         input_shapes = copy.deepcopy(self.STATIC_INPUTS_SHAPES)
         input_shapes.update({"num_images_per_prompt": num_images_per_prompt})
         lora_params = LORA_WEIGHTS_TINY[model_arch]
@@ -295,7 +295,7 @@ class NeuronStableDiffusionXLPipelineIntegrationTest(unittest.TestCase):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES, skip_on_empty=True)
     def test_export_and_inference_non_dyn(self, model_arch):
-        num_images_per_prompt = 4
+        num_images_per_prompt = 1
         input_shapes = copy.deepcopy(self.STATIC_INPUTS_SHAPES)
         input_shapes.update({"num_images_per_prompt": num_images_per_prompt})
         neuron_pipeline = self.NEURON_MODEL_CLASS.from_pretrained(