diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py
index df5c2498ef..654f9a649e 100644
--- a/optimum/exporters/onnx/__main__.py
+++ b/optimum/exporters/onnx/__main__.py
@@ -24,6 +24,7 @@
 from ...commands.export.onnx import parse_args_onnx
 from ...utils import DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, logging
+from ...utils.modeling_utils import MODEL_TO_PATCH_FOR_PAST
 from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
 from ..error_utils import AtolError, OutputMatchError, ShapeError
 from ..tasks import TasksManager
@@ -83,16 +84,12 @@ def _get_submodels_and_onnx_configs(
         onnx_config_constructor = TasksManager.get_exporter_config_constructor(
             model=model, exporter="onnx", task=task
         )
-        onnx_config_kwargs = {}
-        if task.startswith("text-generation") and legacy:
-            onnx_config_kwargs["no_position_ids"] = legacy
-
         onnx_config = onnx_config_constructor(
             model.config,
             int_dtype=int_dtype,
             float_dtype=float_dtype,
             preprocessors=preprocessors,
-            **onnx_config_kwargs,
+            legacy=legacy,
         )
         onnx_config.variant = _variant
@@ -317,13 +314,6 @@ def main_export(
             model_name_or_path, subfolder=subfolder, library_name=library_name
         )

-    # get the shapes to be used to generate dummy inputs
-    input_shapes = {}
-    for input_name in DEFAULT_DUMMY_SHAPES.keys():
-        input_shapes[input_name] = (
-            kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name]
-        )
-
     torch_dtype = None if fp16 is False else torch.float16

     if task == "auto":
@@ -382,6 +372,25 @@
     is_stable_diffusion = "stable-diffusion" in task
     model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-")

+    # For MODEL_TO_PATCH_FOR_PAST architectures, when exporting the model with an input of sequence length of 1, a tracer that does not handle
+    # control flow will incorrectly trace the mask generation, resulting in incorrect attention masks for other sequence lengths.
+    # Reference: https://github.com/huggingface/transformers/blob/af3de8d87c717c4bb090f037d0d89413c195a42f/src/transformers/modeling_attn_mask_utils.py#L94
+    input_shapes = {}
+    for input_name in DEFAULT_DUMMY_SHAPES.keys():
+        input_shapes[input_name] = (
+            kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name]
+        )
+
+        # TODO: this may rather be moved to the OnnxConfig to avoid bloating this script.
+        if (
+            model_type in MODEL_TO_PATCH_FOR_PAST
+            and input_name == "sequence_length"
+            and kwargs_shapes.get(input_name) == 1
+        ):
+            raise ValueError(
+                f"Exporting a {model_type} model with a sequence length of 1 is not supported and can yield unexpected results."
+            )
+
     if legacy and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and task.startswith("text-generation"):
         logger.warning(
             f"legacy=True was specified in the ONNX export, although the model {model_name_or_path} (model type {model_type}) requires position_ids for batched inference. Passing `legacy=True` is strongly discouraged, and this option will be removed in a future release. Reference: https://github.com/huggingface/optimum/pull/1381"
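# Illustration (not part of the patch): why a dummy sequence length of 1 is rejected for
# MODEL_TO_PATCH_FOR_PAST architectures. The causal-mask helpers in transformers sit behind a
# Python `if seq_len > 1` branch, and ONNX export records a single traced path, so a length-1
# dummy would freeze the "no causal mask" branch into the exported graph. The toy function below
# (a stand-in, not optimum code) shows the effect with torch.jit.trace.
import torch


def build_causal_mask(input_ids: torch.Tensor) -> torch.Tensor:
    seq_len = input_ids.shape[1]
    if seq_len > 1:  # data-dependent control flow, invisible to the tracer
        return torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
    return torch.zeros(seq_len, seq_len)


traced = torch.jit.trace(build_causal_mask, torch.zeros(1, 1, dtype=torch.long))
print(traced(torch.zeros(1, 4, dtype=torch.long)).shape)  # torch.Size([1, 1]): the masking logic was lost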
diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py
index b623d3bd22..6765f3310c 100644
--- a/optimum/exporters/onnx/base.py
+++ b/optimum/exporters/onnx/base.py
@@ -200,6 +200,7 @@ def __init__(
         preprocessors: Optional[List[Any]] = None,
         int_dtype: str = "int64",
         float_dtype: str = "fp32",
+        legacy: bool = False,
     ):
         self.task = task
         self.int_dtype = int_dtype
@@ -209,6 +210,7 @@
         self._preprocessors = preprocessors
         self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
         self.variant = "default"
+        self.legacy = legacy

     def _create_dummy_input_generator_classes(self, **kwargs) -> List[DummyInputGenerator]:
         """
@@ -565,6 +567,7 @@ def __init__(
         use_past: bool = False,
         use_past_in_inputs: bool = False,
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         self.use_past = use_past
         self.use_past_in_inputs = use_past_in_inputs
@@ -572,7 +575,12 @@
         self.is_merged = False
         self.use_cache_branch = None
         super().__init__(
-            config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+            legacy=legacy,
         )

     @property
@@ -628,11 +636,11 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
             and "attention_mask" in dummy_inputs
         ):
             # Obtain the past sequence length from the value instead of the key (Bloom).
-            past_length = dummy_inputs["past_key_values"][0][1].shape[-2]
+            past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[-2]

             dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim(
                 dummy_inputs["attention_mask"],
-                desired_length=past_length + 1,
+                desired_length=past_present_length,
                 dim=1,
                 dtype=dummy_inputs["attention_mask"].dtype,
             )
@@ -658,11 +666,15 @@ def overwrite_shape_and_generate_input(

         # models from TextSeq2SeqOnnxConfig use decoder_input_ids as input name
         # while models from TextDecoderOnnxConfig use input_ids, hence the check for both
+
+        # TODO: The check `self.task != "text-generation" and self.legacy` was added following the use of a single ONNX, without subgraphs, for both the with and without KV cache cases.
+        # This overwrite may be moved to OnnxSeq2SeqConfigWithPast, but doing so may break encoder-decoder models.
         if (
             self.use_past
             and self.use_past_in_inputs
             and self.use_cache_branch is not False
             and input_name in ["decoder_input_ids", "input_ids", "position_ids"]
+            and ((self.task == "text-generation" and self.legacy) or self.task != "text-generation")
         ):
             sequence_length = dummy_input_gen.sequence_length
             # Use a sequence length of 1 when the KV cache is already populated.
@@ -768,6 +780,7 @@ def __init__(
         use_past_in_inputs: bool = False,
         behavior: ConfigBehavior = ConfigBehavior.MONOLITH,
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         super().__init__(
             config=config,
@@ -777,6 +790,7 @@
             use_past=use_past,
             use_past_in_inputs=use_past_in_inputs,
             preprocessors=preprocessors,
+            legacy=legacy,
         )
         self._behavior = behavior

@@ -816,6 +830,7 @@ def with_behavior(
             use_past_in_inputs=use_past_in_inputs,
             behavior=behavior,
             preprocessors=self._preprocessors,
+            legacy=self.legacy,
         )
         onnx_config.variant = self.variant
         return onnx_config
@@ -1003,7 +1018,7 @@ class OnnxConfigWithLoss(OnnxConfig, ABC):

     DUMMY_EXTRA_INPUT_GENERATOR_CLASSES = (DummyLabelsGenerator,)

-    def __init__(self, config: OnnxConfig, int_dtype: str = "int64", float_dtype: str = "fp32"):
+    def __init__(self, config: OnnxConfig, int_dtype: str = "int64", float_dtype: str = "fp32", legacy: bool = False):
         self._onnx_config = config
         self.task = self._onnx_config.task
         self.int_dtype = int_dtype
@@ -1011,6 +1026,7 @@ def __init__(self, config: OnnxConfig, int_dtype: str = "int64", float_dtype: st
         self._normalized_config = self._onnx_config._normalized_config
         self.PATCHING_SPECS = self._onnx_config.PATCHING_SPECS
         self.variant = "default"
+        self.legacy = legacy

     @classmethod
     def from_onnx_config(cls, config: OnnxConfig) -> "OnnxConfigWithLoss":
@@ -1037,7 +1053,11 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
         batch_size = dummy_inputs[input_name].shape[0]

         # TODO: doesn't this break attention_mask generation?
-        if isinstance(self._onnx_config, OnnxConfigWithPast) and self._onnx_config.use_past_in_inputs is True:
+        if (
+            isinstance(self._onnx_config, OnnxConfigWithPast)
+            and self._onnx_config.use_past_in_inputs is True
+            and self.task != "text-generation"
+        ):
             kwargs["sequence_length"] = 1
         else:
             for input_name, dynamic_axes in self._tasks_to_extra_inputs[self.task].items():
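# Illustration (not part of the patch): the dummy attention_mask length computed above. With the
# merged, subgraph-free decoder export, the dummy input_ids keep their full sequence length even
# when past_key_values are provided, so the mask must cover past + present tokens instead of
# past + 1. The shapes below are arbitrary examples.
import torch

input_ids = torch.zeros(2, 16, dtype=torch.long)  # sequence_length = 16
past_value = torch.zeros(2, 4, 16, 64)  # one past value tensor, past length 16 on dim -2
past_present_length = input_ids.shape[1] + past_value.shape[-2]
attention_mask = torch.ones(2, past_present_length, dtype=torch.long)
print(attention_mask.shape)  # torch.Size([2, 32])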
diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py
index 7b7d8b19a5..2eaa78d85e 100644
--- a/optimum/exporters/onnx/config.py
+++ b/optimum/exporters/onnx/config.py
@@ -35,11 +35,14 @@
 )
 from .base import ConfigBehavior, OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
+from .model_patcher import DecoderModelPatcher


 if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel

+    from .model_patcher import ModelPatcher
+
     if is_tf_available():
         from transformers import TFPreTrainedModel

@@ -75,7 +78,7 @@ def __init__(
         use_past: bool = False,
         use_past_in_inputs: bool = False,
         preprocessors: Optional[List[Any]] = None,
-        no_position_ids: bool = False,
+        legacy: bool = False,
     ):
         super().__init__(
             config=config,
@@ -85,9 +88,8 @@
             use_past=use_past,
             use_past_in_inputs=use_past_in_inputs,
             preprocessors=preprocessors,
+            legacy=legacy,
         )
-        # TODO: remove no_position_ids once optimum is sufficiently above 1.13
-        self.no_position_ids = no_position_ids

     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
@@ -154,6 +156,12 @@ def post_process_exported_models(

         return models_and_onnx_configs, onnx_files_subpaths

+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        # Refer to DecoderModelPatcher.
+        return DecoderModelPatcher(self, model, model_kwargs=model_kwargs)
+

 class TextDecoderWithPositionIdsOnnxConfig(TextDecoderOnnxConfig):
     @property
@@ -163,7 +171,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
         # Decoders based on GPT2 require a position_ids input to avoid
         # generating wrong position_ids in the model itself:
         # https://github.com/huggingface/transformers/blob/v4.33.1/src/transformers/models/gpt2/modeling_gpt2.py#L802
-        if not self.no_position_ids and self.task in ["text-generation", "feature-extraction"]:
+        if not self.legacy and self.task in ["text-generation", "feature-extraction"]:
             common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"}

         return common_inputs
@@ -316,6 +324,7 @@ def __init__(
         use_past_in_inputs: bool = False,
         behavior: ConfigBehavior = ConfigBehavior.MONOLITH,
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         super().__init__(
             config=config,
@@ -326,6 +335,7 @@
             use_past_in_inputs=use_past_in_inputs,
             behavior=behavior,
             preprocessors=preprocessors,
+            legacy=legacy,
         )

         from ..tasks import TasksManager
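# Sketch (not part of the patch) of how the patcher returned by patch_model_for_export is used.
# It assumes optimum with this change and transformers>=4.35 installed; "gpt2" is only an example
# checkpoint, and the export call itself is elided.
from transformers import AutoModelForCausalLM

from optimum.exporters.tasks import TasksManager

model = AutoModelForCausalLM.from_pretrained("gpt2")
constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task="text-generation")
onnx_config = constructor(model.config, legacy=False)

# Entering the context swaps AttentionMaskConverter._make_causal_mask for the ONNX-friendly
# version defined in model_patcher.py; exiting restores the original implementation.
with onnx_config.patch_model_for_export(model):
    ...  # torch.onnx.export(...) would run here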
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 2da3f5bea6..b5d67e5040 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -61,12 +61,7 @@
     VisionOnnxConfig,
 )
 from .model_patcher import (
-    BartModelPatcher,
-    BloomModelPatcher,
     FalconModelPatcher,
-    LlamaModelPatcher,
-    MistralModelPatcher,
-    OPTModelPatcher,
     SAMModelPatcher,
     SpeechT5ModelPatcher,
     VisionEncoderDecoderPatcher,
@@ -230,11 +225,6 @@ class OPTOnnxConfig(TextDecoderOnnxConfig):
     DEFAULT_ONNX_OPSET = 13
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return OPTModelPatcher(self, model, model_kwargs=model_kwargs)
-

 class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator)
@@ -242,13 +232,11 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DEFAULT_ONNX_OPSET = 13
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)
-

 class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
+    # This is because of the patching of torch.triu in AttentionMaskConverter, which is only available from transformers>=4.35
+    MIN_TRANSFORMERS_VERSION = version.parse("4.34.99")
+
     # The ONNX export of this architecture needs the Trilu operator support, available since opset 14
     DEFAULT_ONNX_OPSET = 14
     DUMMY_INPUT_GENERATOR_CLASSES = (
@@ -257,11 +245,6 @@ class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True)

-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return MistralModelPatcher(self, model, model_kwargs=model_kwargs)
-

 class MPTOnnxConfig(TextDecoderOnnxConfig):
     # MPT does not require position_ids input.
@@ -270,11 +253,6 @@ class MPTOnnxConfig(TextDecoderOnnxConfig):
         num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers"
     )

-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return BloomModelPatcher(self, model, model_kwargs=model_kwargs)
-

 class BloomOnnxConfig(TextDecoderOnnxConfig):
     # Bloom does not require position_ids input.
@@ -305,11 +283,6 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire
                 1: decoder_sequence_name,
             }

-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return BloomModelPatcher(self, model, model_kwargs=model_kwargs)
-

 class GPTBigCodeOnnxConfig(TextDecoderWithPositionIdsOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (
@@ -341,6 +314,9 @@ def flatten_past_key_values(self, flattened_output, name, idx, t):


 class FalconOnnxConfig(TextDecoderOnnxConfig):
+    # This is because the export patching uses _prepare_4d_causal_attention_mask, which is only available from transformers>=4.35
+    MIN_TRANSFORMERS_VERSION = version.parse("4.34.99")
+
     DUMMY_INPUT_GENERATOR_CLASSES = (
         MultiQueryPastKeyValuesGenerator,
     ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
@@ -357,7 +333,7 @@ def __init__(
         use_past: bool = False,
         use_past_in_inputs: bool = False,
         preprocessors: Optional[List[Any]] = None,
-        no_position_ids: bool = False,
+        legacy: bool = False,
     ):
         super().__init__(
             config=config,
@@ -367,7 +343,7 @@
             use_past=use_past,
             use_past_in_inputs=use_past_in_inputs,
             preprocessors=preprocessors,
-            no_position_ids=no_position_ids,
+            legacy=legacy,
         )
         # For some reason Falcon config.num_kv_heads can not be trusted, see in Transformers:
         # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/models/falcon/modeling_falcon.py#L337
@@ -381,11 +357,7 @@
     def inputs(self) -> Dict[str, Dict[int, str]]:
         common_inputs = super().inputs

-        if (
-            not self.no_position_ids
-            and not self._config.alibi
-            and self.task in ["text-generation", "feature-extraction"]
-        ):
+        if not self.legacy and not self._config.alibi and self.task in ["text-generation", "feature-extraction"]:
             # When alibi is used, position_ids are not used in Falcon.
             # Reference: https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/models/falcon/modeling_falcon.py#L1116
             common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"}
@@ -655,10 +627,7 @@ def flatten_past_key_values(self, flattened_output, name, idx, t):


 class BartOnnxConfig(M2M100OnnxConfig):
-    def patch_model_for_export(
-        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
-    ) -> "ModelPatcher":
-        return BartModelPatcher(self, model, model_kwargs=model_kwargs)
+    pass


 class MBartOnnxConfig(BartOnnxConfig):
@@ -1033,9 +1002,15 @@ def __init__(
         int_dtype: str = "int64",
         float_dtype: str = "fp32",
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         super().__init__(
-            config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+            legacy=legacy,
         )
         if task == "zero-shot-object-detection":
             logger.warning(
@@ -1174,9 +1149,15 @@ def __init__(
         int_dtype: str = "int64",
         float_dtype: str = "fp32",
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         super().__init__(
-            config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+            legacy=legacy,
         )
         self.is_generating_dummy_inputs = False
@@ -1351,6 +1332,7 @@ def __init__(
         behavior: ConfigBehavior = ConfigBehavior.MONOLITH,
         preprocessors: Optional[List[Any]] = None,
         is_postnet_and_vocoder: bool = False,
+        legacy: bool = False,
     ):
         super().__init__(
             config=config,
@@ -1361,6 +1343,7 @@
             use_past_in_inputs=use_past_in_inputs,
             behavior=behavior,
             preprocessors=preprocessors,
+            legacy=legacy,
         )
         if float_dtype == "fp16":
             raise ValueError(
@@ -1595,9 +1578,15 @@ def __init__(
         variant: str = "split",
         vision_encoder: Optional[bool] = None,
         preprocessors: Optional[List[Any]] = None,
+        legacy: bool = False,
     ):
         super().__init__(
-            config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+            legacy=legacy,
         )
         self.variant = variant
         self.vision_encoder = vision_encoder
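# Presumably the reason MIN_TRANSFORMERS_VERSION above is written as "4.34.99" rather than
# "4.35.0": with packaging.version, a development or pre-release build of 4.35 compares as older
# than the final 4.35.0 release but still satisfies a "> 4.34.99" bound. Illustration only.
from packaging import version

assert version.parse("4.35.0.dev0") > version.parse("4.34.99")
assert version.parse("4.35.0.dev0") < version.parse("4.35.0")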
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 4a5f4d1ace..09cdddc95f 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -18,26 +18,26 @@
 import types
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

-import transformers
+from packaging import version
 from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
-from transformers.models.falcon.modeling_falcon import FalconModel, build_alibi_tensor
+from transformers.models.falcon.modeling_falcon import build_alibi_tensor
 from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet
 from transformers.utils import is_torch_available

-from ...utils.modeling_utils import (
-    _falcon_prepare_attn_mask,
-    _prepare_attn_mask,
-    _prepare_decoder_attention_mask,
-    _prepare_decoder_sliding_window_attention_mask,
-)
-

 if is_torch_available():
     import torch

+from ...configuration_utils import _transformers_version
 from ...utils import logging


+if _transformers_version > version.parse("4.34.99"):
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
+else:
+    _prepare_4d_causal_attention_mask = None
+    AttentionMaskConverter = None
+

 if TYPE_CHECKING:
     from transformers import PreTrainedModel, TFPreTrainedModel
@@ -249,31 +249,6 @@ def __init__(
         model.decoder.model.decoder.config.use_cache = True


-def _make_causal_mask_falcon_patched(
-    input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
-) -> torch.BoolTensor:
-    """
-    Make causal mask used for self-attention. This mask does not take the existing attention mask into account - it
-    just blocks tokens from attending forwards in the sequence. The output shape will be `[batch_size, 1,
-    target_length, target_length+past_key_values_length]`.
-    """
-    batch_size, target_length = input_ids_shape
-
-    # NOTE: ONNX Runtime is not able to run ONNX Trilu node with bool input. As a workaround, we pass a float input
-    # and cast to bool here. Reference: https://github.com/microsoft/onnxruntime/issues/16189
-    mask = torch.triu(torch.ones((target_length, target_length), dtype=torch.float, device=device), diagonal=1).to(
-        torch.bool
-    )
-
-    # If past_key_values_length is 0 this is an empty tensor and the concatenation is a no-op.
-    # This code style is an unfortunate consequence of getting your TF engineer to port models; doing it this
-    # way avoids a data-dependent conditional, which will help me when I have to port this to XLA later.
-    past_mask = torch.zeros((target_length, past_key_values_length), dtype=torch.bool, device=device)
-    mask = torch.cat([past_mask, mask], dim=-1)
-    expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
-    return expanded_mask
-
-
 def falcon_model_forward_without_kv_reformatting(
     self,
     input_ids: Optional[torch.LongTensor] = None,
@@ -287,6 +262,8 @@ def falcon_model_forward_without_kv_reformatting(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
 ):
+    # TODO: We may remove this patch once https://github.com/huggingface/transformers/pull/26933 is merged & released in Transformers.
+
     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
@@ -346,10 +323,9 @@ def falcon_model_forward_without_kv_reformatting(
     else:
         position_ids = position_ids.view(-1, seq_length).long()

-    causal_mask = self._prepare_attn_mask(
-        attention_mask,
-        input_shape=(batch_size, seq_length),
-        past_key_values_length=past_key_values_length,
+    # 4d mask is passed through the layers
+    attention_mask = _prepare_4d_causal_attention_mask(
+        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
     )

     for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -359,7 +335,7 @@ def falcon_model_forward_without_kv_reformatting(
         outputs = block(
             hidden_states,
             layer_past=layer_past,
-            attention_mask=causal_mask,
+            attention_mask=attention_mask,
             position_ids=position_ids,
             head_mask=head_mask[i],
             use_cache=use_cache,
@@ -393,26 +369,77 @@
     )


-class FalconModelPatcher(ModelPatcher):
+def _make_causal_mask_patched(
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
+    sliding_window: Optional[int] = None,
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    # NOTE: This function is assigned through `staticmethod` below because `self._make_causal_mask` is called elsewhere in the class definition, despite `_make_causal_mask` being a staticmethod.
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+
+    # add lower triangular sliding window mask if necessary
+    if sliding_window is not None:
+        diagonal = past_key_values_length - sliding_window + 1
+
+        # NOTE: adding dtype=torch.int64 here for triu to be supported by ORT: https://github.com/microsoft/onnxruntime/issues/16189
+        context_mask = 1 - torch.triu(torch.ones_like(mask, dtype=torch.int64), diagonal=diagonal)
+        mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min)
+
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+_make_causal_mask_patched = staticmethod(_make_causal_mask_patched)
+
+
+class DecoderModelPatcher(ModelPatcher):
     def __enter__(self):
-        self.patch_ops()
+        # TODO: Remove this `if` once transformers is sufficiently above 4.35.
+        if AttentionMaskConverter is not None:
+            AttentionMaskConverter._make_causal_mask = _make_causal_mask_patched

-        transformers.models.falcon.modeling_falcon._make_causal_mask = _make_causal_mask_falcon_patched
+    def __exit__(self, exc_type, exc_value, traceback):
+        # TODO: Remove this `if` once transformers is sufficiently above 4.35.
+        if AttentionMaskConverter is not None:
+            AttentionMaskConverter._make_causal_mask = self.original_make_causal
+
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+
+        # TODO: Remove this `if` once transformers is sufficiently above 4.35.
+        if AttentionMaskConverter is not None:
+            self.original_make_causal = AttentionMaskConverter._make_causal_mask
+
+
+class FalconModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        self.patch_ops()

         if self.real_config.task == "text-generation":
             self._model.transformer.forward = types.MethodType(
                 falcon_model_forward_without_kv_reformatting, self._model.transformer
             )

-        # In order to use a single decoder, we need to patch the _prepare_attn_mask function to behave independently of the sequence length.
-        if isinstance(self._model, FalconModel):
-            self._model._prepare_attn_mask = _falcon_prepare_attn_mask
-        else:
-            self._model.transformer._prepare_attn_mask = _falcon_prepare_attn_mask
-
-        setattr(self._model, self.orig_forward_name, self.patched_forward)
-
     def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
         self.restore_ops()

         setattr(self._model, self.orig_forward_name, self.orig_forward)
@@ -422,14 +449,6 @@ def __exit__(self, exc_type, exc_value, traceback):
                 self.original_model_transformer_forward, self._model.transformer
             )

-        transformers.models.falcon.modeling_falcon._make_causal_mask = self.original_make_causal
-
-        # In order to use a single decoder, we need to patch the _prepare_attn_mask function to behave independently of the sequence length.
-        if isinstance(self._model, FalconModel):
-            self._model._prepare_attn_mask = self.original_falcon_prepare_attn_mask
-        else:
-            self._model.transformer._prepare_attn_mask = self.original_falcon_prepare_attn_mask
-
     def __init__(
         self,
         config: "OnnxConfig",
@@ -441,13 +460,6 @@ def __init__(
         model: Union["PreTrainedModel", "TFPreTrainedModel"],
         model_kwargs: Optional[Dict[str, Any]] = None,
     ):
         super().__init__(config, model, model_kwargs)

         if config.task == "text-generation":
             self.original_model_transformer_forward = model.transformer.forward

-        self.original_make_causal = transformers.models.falcon.modeling_falcon._make_causal_mask
-
-        if isinstance(model, FalconModel):
-            self.original_falcon_prepare_attn_mask = model._prepare_attn_mask
-        else:
-            self.original_falcon_prepare_attn_mask = model.transformer._prepare_attn_mask
-
         self._model = model
         self.orig_forward_name = "forward" if hasattr(self._model, "forward") else "call"
@@ -763,103 +775,3 @@
             return filterd_outputs

         self.patched_forward = patched_forward
-
-
-class CausalAttentionMaskModelPatcher(ModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-        self.patch = self.real_config.task == "text-generation" and self.real_config.use_past
-
-    def __enter__(self):
-        super().__enter__()
-        if self.patch:
-            setattr(self._model_to_patch, self._orig_func_name, self._patch_func.__get__(self._model_to_patch))
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        if self.patch:
-            setattr(self._model_to_patch, self._orig_func_name, self._orig_func.__get__(self._model_to_patch))
-
-
-class BloomModelPatcher(CausalAttentionMaskModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-        if self.patch:
-            self._model_to_patch = model.transformer
-            self._patch_func = _prepare_attn_mask
-            self._orig_func_name = "_prepare_attn_mask"
-            self._orig_func = self._model_to_patch._prepare_attn_mask
-
-
-class OPTModelPatcher(CausalAttentionMaskModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-
-        if self.patch:
-            self._model_to_patch = model.model.decoder
-            self._patch_func = _prepare_decoder_attention_mask
-            self._orig_func_name = "_prepare_decoder_attention_mask"
-            self._orig_func = self._model_to_patch._prepare_decoder_attention_mask
-
-
-class LlamaModelPatcher(CausalAttentionMaskModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-
-        if self.patch:
-            self._model_to_patch = model.model
-            self._patch_func = _prepare_decoder_attention_mask
-            self._orig_func_name = "_prepare_decoder_attention_mask"
-            self._orig_func = self._model_to_patch._prepare_decoder_attention_mask
-
-
-class MistralModelPatcher(CausalAttentionMaskModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-
-        if self.patch:
-            self._model_to_patch = model.model
-            self._patch_func = _prepare_decoder_sliding_window_attention_mask
-            self._orig_func_name = "_prepare_decoder_attention_mask"
-            self._orig_func = self._model_to_patch._prepare_decoder_attention_mask
-
-
-class BartModelPatcher(CausalAttentionMaskModelPatcher, Seq2SeqModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-
-        if self.patch:
-            self._model_to_patch = model.model.decoder
-            self._patch_func = _prepare_decoder_attention_mask
-            self._orig_func_name = "_prepare_decoder_attention_mask"
-            self._orig_func = self._model_to_patch._prepare_decoder_attention_mask
diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py
index ef6206e8d0..c1737fc087 100644
--- a/optimum/exporters/onnx/utils.py
+++ b/optimum/exporters/onnx/utils.py
@@ -30,7 +30,6 @@
     logging,
 )
 from ...utils.import_utils import _diffusers_version
-from ...utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask  # noqa: F401
 from ..tasks import TasksManager
 from .constants import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME

@@ -254,9 +253,12 @@ def get_decoder_models_for_export(

     models_for_export = _get_submodels_for_export_decoder(model, use_past=config.use_past, legacy=legacy)

-    onnx_kwargs = {"task": config.task, "float_dtype": config.float_dtype, "int_dtype": config.int_dtype}
-    if model.config.model_type.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS:
-        onnx_kwargs["no_position_ids"] = config.no_position_ids
+    onnx_kwargs = {
+        "task": config.task,
+        "float_dtype": config.float_dtype,
+        "int_dtype": config.int_dtype,
+        "legacy": legacy,
+    }

     if legacy:
         onnx_config = config.__class__(
@@ -389,14 +391,14 @@ def get_sam_models_for_export(model: Union["PreTrainedModel", "TFPreTrainedModel
     models_for_export = _get_submodels_for_export_sam(model, config.variant)

     if config.variant == "monolith":
-        onnx_config = config.__class__(model.config, task=config.task)
+        onnx_config = config.__class__(model.config, task=config.task, legacy=config.legacy)
         models_for_export["model"] = (models_for_export["model"], onnx_config)
     else:
         vision_encoder_onnx_config = config.__class__(
-            model.config, task=config.task, variant=config.variant, vision_encoder=True
+            model.config, task=config.task, variant=config.variant, vision_encoder=True, legacy=config.legacy
         )
         prompt_encoder_mask_decoder_onnx_config = config.__class__(
-            model.config, task=config.task, variant=config.variant, vision_encoder=False
+            model.config, task=config.task, variant=config.variant, vision_encoder=False, legacy=config.legacy
         )
         models_for_export["vision_encoder"] = (models_for_export["vision_encoder"], vision_encoder_onnx_config)
         models_for_export["prompt_encoder_mask_decoder"] = (
@@ -454,6 +456,7 @@ def get_speecht5_models_for_export(
         behavior=config._behavior,  # Irrelevant here.
         preprocessors=config._preprocessors,
         is_postnet_and_vocoder=True,
+        legacy=config.legacy,
     )
     postnet_and_vocoder_onnx_config.variant = config.variant
     models_for_export["decoder_postnet_and_vocoder"] = (
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
index 13aef3546a..94418a96af 100644
--- a/optimum/onnxruntime/modeling_decoder.py
+++ b/optimum/onnxruntime/modeling_decoder.py
@@ -473,7 +473,7 @@ def _from_pretrained(

         if file_name == ONNX_DECODER_WITH_PAST_NAME and config.model_type in MODEL_TO_PATCH_FOR_PAST:
             raise ValueError(
-                f"{ONNX_DECODER_WITH_PAST_NAME} not supported for the following architecture : {', '.join(MODEL_TO_PATCH_FOR_PAST)}. Please re-export your model or set use_cache=False."
+                f"ONNX Runtime inference using {ONNX_DECODER_WITH_PAST_NAME} has been deprecated for the {config.model_type} architecture. Please re-export your model with optimum>=1.14.0 or set use_cache=False. For details about the deprecation, please refer to https://github.com/huggingface/optimum/releases/tag/v1.14.0."
             )

         regular_file_names = []
diff --git a/optimum/utils/modeling_utils.py b/optimum/utils/modeling_utils.py
index 336ad31e5a..dae5b5d633 100644
--- a/optimum/utils/modeling_utils.py
+++ b/optimum/utils/modeling_utils.py
@@ -13,9 +13,6 @@
 # limitations under the License.

 import functools
-from typing import Tuple
-
-import torch


 MODEL_TO_PATCH_FOR_PAST = {
@@ -55,164 +52,3 @@ def recurse_setattr(module, name, value):
     else:
         name, rest = name.split(".", 1)
         recurse_setattr(getattr(module, name), rest, value)
-
-
-# Modified from transformers.models.bloom.modeling_bloom._make_causal_mask
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    device: torch.device,
-    past_key_values_length: int,
-    dtype: torch.dtype = torch.bool,
-) -> torch.BoolTensor:
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.zeros((target_length, target_length + past_key_values_length), dtype=dtype, device=device)
-    seq_ids = torch.arange(target_length, device=device)
-
-    mask[:, past_key_values_length:] = (
-        (seq_ids[:, None] < seq_ids[None, :]) * torch.finfo(dtype).min
-        if torch.is_floating_point(mask)
-        else seq_ids[:, None] < seq_ids[None, :]
-    )
-
-    return mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
-
-
-# NOTE: For MODEL_TO_PATCH_FOR_PAST architectures, when exporting the model with an input of sequence length of 1, the attention masks will be generated incorrectly for other sequence length
-# https://github.com/huggingface/transformers/blob/0ee45906845c8d58b9bd2df5acd90e09b00047ff/src/transformers/models/bloom/modeling_bloom.py#L654
-# The method taking care of the decoder mask generation of the models from these architectures must be patched during export for sequence length of 1.
-
-
-# Modified from transformers.models.bloom.modeling_bloom._prepare_attn_mask
-def _prepare_attn_mask(
-    self,
-    attention_mask: torch.Tensor,
-    input_shape: Tuple[int, int],
-    past_key_values_length: int,
-) -> torch.BoolTensor:
-    from transformers.models.bloom.modeling_bloom import _expand_mask
-
-    # create causal mask
-    # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
-    combined_attention_mask = None
-    device = attention_mask.device
-    _, src_length = input_shape
-
-    combined_attention_mask = _make_causal_mask(
-        input_shape, device=device, past_key_values_length=past_key_values_length
-    )
-    # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
-    expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
-    combined_attention_mask = (
-        expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
-    )
-
-    return combined_attention_mask
-
-
-# Modified from transformers.models.llama.modeling_llama._prepare_decoder_attention_mask
-def _prepare_decoder_attention_mask(
-    self,
-    attention_mask: torch.Tensor,
-    input_shape: Tuple[int, int],
-    inputs_embeds: torch.Tensor,
-    past_key_values_length: int,
-):
-    from transformers.models.llama.modeling_llama import _expand_mask
-
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-
-    combined_attention_mask = _make_causal_mask(
-        input_shape,
-        device=inputs_embeds.device,
-        past_key_values_length=past_key_values_length,
-        dtype=inputs_embeds.dtype,
-    )
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-            inputs_embeds.device
-        )
-        combined_attention_mask = (
-            expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask
-def _prepare_decoder_sliding_window_attention_mask(
-    self,
-    attention_mask: torch.Tensor,
-    input_shape: Tuple[int, int],
-    inputs_embeds: torch.Tensor,
-    past_key_values_length: int,
-    sliding_window: int,
-):
-    from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask
-
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-
-    combined_attention_mask = _make_sliding_window_causal_mask(
-        input_shape,
-        device=inputs_embeds.device,
-        dtype=inputs_embeds.dtype,
-        past_key_values_length=past_key_values_length,
-        sliding_window=sliding_window,
-    )
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-            inputs_embeds.device
-        )
-        combined_attention_mask = (
-            expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-def _falcon_prepare_attn_mask(
-    attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int
-) -> torch.BoolTensor:
-    from transformers.models.falcon.modeling_falcon import (
-        _expand_mask,
-    )
-
-    # NOTE: there is no "copied from" for falcon in transformers which makes no sense to me.
-
-    # Create a causal mask
-    # The attention mask we receive as input should cover the whole extended sequence, including any past
-    # cache, so its shape should be [batch_size, seq_length + past_key_values_length]
-    # The output shape will be [batch_size, 1, seq_length, seq_length + past_key_values_length]
-    if input_shape[1] + past_key_values_length != attention_mask.shape[1]:
-        raise ValueError(
-            "Attention mask shape should be (batch_size, seq_length + past_key_values_length)"
-            f" but is {attention_mask.shape} with input_ids shape {input_shape} and past length"
-            f" {past_key_values_length}."
-        )
-    combined_attention_mask = None
-    device = attention_mask.device
-    _, seq_length = input_shape
-
-    # if seq_length > 1:
-    # NOTE: we remove here the `if seq_length > 1` to allow to use a single decoder.
-    combined_attention_mask = _make_causal_mask(
-        input_shape, device=device, past_key_values_length=past_key_values_length
-    )
-
-    # [batch_size, seq_length + past_key_values_length] -> [batch_size, 1, seq_length, seq_length + past_key_values_length]
-    expanded_attn_mask = _expand_mask(attention_mask, past_key_values_length=past_key_values_length)
-    combined_attention_mask = (
-        expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
-    )
-
-    return combined_attention_mask