diff --git a/forge/test/mlir/llama/test_llama_inference.py b/forge/test/mlir/llama/test_llama_inference.py
index 4b32e662..7def7ce9 100644
--- a/forge/test/mlir/llama/test_llama_inference.py
+++ b/forge/test/mlir/llama/test_llama_inference.py
@@ -10,7 +10,7 @@ from test.mlir.llama.utils.utils import load_model
-@pytest.mark.push
+@pytest.mark.nightly
 @pytest.mark.xfail()
 @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"])
 def test_llama_inference(model_path):
diff --git a/forge/test/mlir/llama/tests/test_llama_prefil.py b/forge/test/mlir/llama/tests/test_llama_prefil.py
index f6e2c127..917a5792 100644
--- a/forge/test/mlir/llama/tests/test_llama_prefil.py
+++ b/forge/test/mlir/llama/tests/test_llama_prefil.py
@@ -50,7 +50,7 @@ def decode_on_cpu(model, tokenizer, input_ids, hidden_states, max_new_tokens):
 @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"])
 @pytest.mark.xfail()
-@pytest.mark.push
+@pytest.mark.nightly
 def test_llama_prefil_on_device_decode_on_cpu(model_path):
     """
     This function tests the inference of the Llama models split into two parts: