diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 1fc9a25fd8e..ceafa26a8b1 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -235,9 +235,6 @@ def __call__(
             **kwargs,
         )
 
-        # Temporary fix for "padding_side" in init_kwargs
-        output_kwargs["text_kwargs"].pop("padding_side", None)
-
         image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
 
         n_images_in_text = []
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index f9d550e789d..039e05a7ec1 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -172,8 +172,6 @@ def __call__(
             num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
             text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
 
-        # Padding side can be in TextKwargs but is not accepted by the tokenizer
-        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
         return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py
index 48516e6aa31..6c0e8d98014 100644
--- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py
@@ -150,7 +150,6 @@ def __call__(
                     index += 1
                 text[i] = text[i].replace("<|placeholder|>", "<|video_pad|>")
 
-        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
         return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 062dfe311c1..cb2327e5c46 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -829,7 +829,12 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
             for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
                 # init with tokenizer init kwargs if necessary
                 if modality_key in tokenizer_init_kwargs:
-                    default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key]
+                    value = (
+                        getattr(self.tokenizer, modality_key)
+                        if hasattr(self.tokenizer, modality_key)
+                        else tokenizer_init_kwargs[modality_key]
+                    )
+                    default_kwargs[modality][modality_key] = value
         # now defaults kwargs are updated with the tokenizers defaults.
         # pass defaults to output dictionary
         output_kwargs.update(default_kwargs)
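
For context on the `processing_utils.py` hunk: `tokenizer.init_kwargs` is a construction-time snapshot, so it goes stale if an attribute like `padding_side` is mutated on the tokenizer afterwards; the hunk makes `_merge_kwargs` prefer the live attribute. A minimal, self-contained sketch of the difference (the `Tok` class here is hypothetical, standing in for a real tokenizer):

```python
# Hypothetical stand-in for a tokenizer: it records its init kwargs and
# also exposes the same value as a mutable attribute.
class Tok:
    def __init__(self, padding_side="right"):
        self.init_kwargs = {"padding_side": padding_side}
        self.padding_side = padding_side

tok = Tok()
tok.padding_side = "left"  # caller flips the side after construction

key = "padding_side"
# Old lookup: reads the construction-time snapshot and misses the update.
stale = tok.init_kwargs[key]  # -> "right"
# New lookup (as in the hunk): prefer the live attribute when it exists.
live = getattr(tok, key) if hasattr(tok, key) else tok.init_kwargs[key]  # -> "left"
print(stale, live)
```

The three removed `pop("padding_side", ...)` workarounds follow from the same change: per the removed comments they existed because the tokenizer's `__call__` did not accept `padding_side`, so their deletion assumes it now does and that the forwarded value reflects the tokenizer's current state.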