diff --git a/forge/test/mlir/llama/test_llama_inference.py b/forge/test/mlir/llama/test_llama_inference.py
index 4b32e662..7def7ce9 100644
--- a/forge/test/mlir/llama/test_llama_inference.py
+++ b/forge/test/mlir/llama/test_llama_inference.py
@@ -10,7 +10,7 @@ from test.mlir.llama.utils.utils import load_model
-@pytest.mark.push
+@pytest.mark.nightly
 @pytest.mark.xfail()
 @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"])
 def test_llama_inference(model_path):
diff --git a/forge/test/mlir/llama/tests/test_llama_prefil.py b/forge/test/mlir/llama/tests/test_llama_prefil.py
index f6e2c127..917a5792 100644
--- a/forge/test/mlir/llama/tests/test_llama_prefil.py
+++ b/forge/test/mlir/llama/tests/test_llama_prefil.py
@@ -50,7 +50,7 @@ def decode_on_cpu(model, tokenizer, input_ids, hidden_states, max_new_tokens):
 @pytest.mark.parametrize("model_path", ["openlm-research/open_llama_3b", "meta-llama/Llama-3.2-1B"])
 @pytest.mark.xfail()
-@pytest.mark.push
+@pytest.mark.nightly
 def test_llama_prefil_on_device_decode_on_cpu(model_path):
     """
     This function tests the inference of the Llama models split into two parts: