Skip to content

Commit

Permalink
Skip TCP-only DGX tests with UCX 1.16
Browse files Browse the repository at this point in the history
Wireup may fail in UCX 1.16 on nodes with multiple NICs when TCP is used,
so skip the affected tests. UCX 1.17 will resolve the issue; alternatively,
setting `UCX_PROTO_ENABLE=n` works around it in UCX 1.16.
  • Loading branch information
pentschev committed Apr 7, 2024
1 parent 4208768 commit 9991143
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions dask_cuda/tests/test_dgx.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
psutil = pytest.importorskip("psutil")


def _is_ucx_116(ucp):
    """Return whether ``ucp.get_ucx_version()`` reports a 1.16 release.

    ``ucp`` is any object exposing ``get_ucx_version()``; only the first
    two entries of the returned version are compared, so any trailing
    components are ignored.
    """
    major_minor = ucp.get_ucx_version()[:2]
    return major_minor == (1, 16)


class DGXVersion(Enum):
DGX_1 = auto()
DGX_2 = auto()
Expand Down Expand Up @@ -102,9 +106,13 @@ def check_ucx_options():
)
def test_tcp_over_ucx(protocol):
if protocol == "ucx":
pytest.importorskip("ucp")
ucp = pytest.importorskip("ucp")
elif protocol == "ucxx":
pytest.importorskip("ucxx")
ucp = pytest.importorskip("ucxx")
if _is_ucx_116(ucp):
pytest.skip(
"Wireup may fail in UCX 1.16 in nodes with multiple NICs if TCP is used"
)

p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,))
p.start()
Expand Down Expand Up @@ -217,9 +225,13 @@ def check_ucx_options():
)
def test_ucx_infiniband_nvlink(protocol, params):
if protocol == "ucx":
pytest.importorskip("ucp")
ucp = pytest.importorskip("ucp")
elif protocol == "ucxx":
pytest.importorskip("ucxx")
ucp = pytest.importorskip("ucxx")
if _is_ucx_116(ucp) and params["enable_infiniband"] is False:
pytest.skip(
"Wireup may fail in UCX 1.16 in nodes with multiple NICs if TCP is used"
)

skip_queue = mp.Queue()

Expand Down

0 comments on commit 9991143

Please sign in to comment.