diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index a8545ad3c0a5b3..4a71f9f723ca41 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -20,7 +20,7 @@
 import tempfile
 import unittest
 
-from pytest import mark
+import pytest
 
 from transformers import (
     BarkCoarseConfig,
@@ -877,7 +877,7 @@ def test_resize_embeddings_untied(self):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference(self):
         for model_class in self.all_model_classes:
@@ -936,7 +936,7 @@ def test_flash_attn_2_inference(self):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_padding_right(self):
         for model_class in self.all_model_classes:
diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py
index b6d3c0f57aad75..8194c4285916de 100644
--- a/tests/models/distilbert/test_modeling_distilbert.py
+++ b/tests/models/distilbert/test_modeling_distilbert.py
@@ -16,7 +16,7 @@
 import tempfile
 import unittest
 
-from pytest import mark
+import pytest
 
 from transformers import DistilBertConfig, is_torch_available
 from transformers.testing_utils import require_flash_attn, require_torch, require_torch_accelerator, slow, torch_device
@@ -290,7 +290,7 @@ def test_torchscript_device_change(self):
     # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test.
     @require_flash_attn
     @require_torch_accelerator
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference(self):
         import torch
@@ -344,7 +344,7 @@ def test_flash_attn_2_inference(self):
     # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test.
     @require_flash_attn
     @require_torch_accelerator
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_padding_right(self):
         import torch
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 4d6b363e4a75d4..a646e5ab7a5ce6 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -17,8 +17,8 @@
 
 import unittest
 
+import pytest
 from parameterized import parameterized
-from pytest import mark
 
 from transformers import LlamaConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
@@ -385,7 +385,7 @@ def test_model_rope_scaling(self, scaling_type):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_padding_right(self):
         """
diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index 4bcb722c144e09..2989f40304657b 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -19,7 +19,7 @@
 import tempfile
 import unittest
 
-from pytest import mark
+import pytest
 
 from transformers import AutoTokenizer, MistralConfig, is_torch_available
 from transformers.testing_utils import (
@@ -369,7 +369,7 @@ def test_past_key_values_format(self):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_padding_right(self):
         import torch
@@ -403,7 +403,7 @@ def test_flash_attn_2_generate_padding_right(self):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_padding_right(self):
         import torch
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 05d48786148e20..22290bab6691f7 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -21,7 +21,7 @@
 import unittest
 
 import numpy as np
-from pytest import mark
+import pytest
 
 import transformers
 from transformers import WhisperConfig
@@ -800,7 +800,7 @@ def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference(self):
         import torch
@@ -845,7 +845,7 @@ def test_flash_attn_2_inference(self):
 
     @require_flash_attn
     @require_torch_gpu
-    @mark.flash_attn_test
+    @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_padding_right(self):
         import torch
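
For reference, a minimal standalone sketch of the marker style this diff converges on: module-qualified `pytest.mark` via `import pytest` instead of a bare `mark` pulled in with `from pytest import mark`. The test name is borrowed from the diff, but the body and the registration note are illustrative assumptions, not taken from these files:

```python
import pytest

# Custom marks are normally registered in the project's pytest configuration
# (e.g. a `markers` entry in setup.cfg or pyproject.toml) so pytest does not
# warn about unknown marks; that registration is assumed here.


@pytest.mark.flash_attn_test
def test_flash_attn_2_inference():
    # Illustrative body only: the real tests load each model class with
    # Flash Attention 2 enabled and compare its outputs to eager attention.
    assert True
```

Marked tests are then selected with pytest's standard marker expressions, e.g. `pytest -m flash_attn_test tests/models/mistral`. Selection behaves identically under either spelling; the module-qualified form simply keeps the source of `mark` unambiguous at every decorator site, which is why all five files are changed the same way.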