diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index a330e23bfb..fb4d190a2c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1279,6 +1279,8 @@ class WhisperOnnxConfig(AudioToTextOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", + feature_size="num_mel_bins", + allow_new=True, ) ATOL_FOR_VALIDATION = 1e-3 diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 1028307f8c..aa1f785309 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -651,7 +651,10 @@ def __init__( self.task = task self.normalized_config = normalized_config - self.feature_size = feature_size + if hasattr(self.normalized_config, "feature_size"): + self.feature_size = self.normalized_config.feature_size + else: + self.feature_size = feature_size self.nb_max_frames = nb_max_frames self.batch_size = batch_size self.sequence_length = audio_sequence_length