From 7e03a520c1ad7c79407f8b4c3d0bcc91dfddd41d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 8 Apr 2024 19:17:50 +0200 Subject: [PATCH] Skip TCP-only DGX tests with UCX 1.16 (#1331) Wireup may fail in UCX 1.16 on nodes with multiple NICs when TCP is used, so those tests are skipped. UCX 1.17 will resolve the issue; alternatively, `UCX_PROTO_ENABLE=n` may also be used with UCX 1.16. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/dask-cuda/pull/1331 --- dask_cuda/tests/test_dgx.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index d57cf1a3..41bfa6cb 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -15,6 +15,10 @@ psutil = pytest.importorskip("psutil") +def _is_ucx_116(ucp): + return ucp.get_ucx_version()[:2] == (1, 16) + + class DGXVersion(Enum): DGX_1 = auto() DGX_2 = auto() @@ -102,9 +106,11 @@ def check_ucx_options(): ) def test_tcp_over_ucx(protocol): if protocol == "ucx": - pytest.importorskip("ucp") + ucp = pytest.importorskip("ucp") elif protocol == "ucxx": - pytest.importorskip("ucxx") + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp): + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,)) p.start() @@ -217,9 +223,11 @@ def check_ucx_options(): ) def test_ucx_infiniband_nvlink(protocol, params): if protocol == "ucx": - pytest.importorskip("ucp") + ucp = pytest.importorskip("ucp") elif protocol == "ucxx": - pytest.importorskip("ucxx") + ucp = pytest.importorskip("ucxx") + if _is_ucx_116(ucp) and params["enable_infiniband"] is False: + pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037") skip_queue = mp.Queue()