diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc
index cd168462cc4..0285bdb806e 100644
--- a/test/gtest/common/mem_buffer.cc
+++ b/test/gtest/common/mem_buffer.cc
@@ -221,7 +221,7 @@ void mem_buffer::get_bar1_free_size_nvml()
 #endif
 }
 
-void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
+void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type, bool async)
 {
     void *ptr;
 
@@ -238,7 +238,16 @@ void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
         return ptr;
 #if HAVE_CUDA
     case UCS_MEMORY_TYPE_CUDA:
-        CUDA_CALL(cudaMalloc(&ptr, size), ": size=" << size);
+#if CUDA_VERSION >= 11020
+        if (async) {
+            CUDA_CALL(cudaMallocAsync(&ptr, size, 0), ": size=" << size);
+            cudaStreamSynchronize(0);
+        } else {
+#endif
+            CUDA_CALL(cudaMalloc(&ptr, size), ": size=" << size);
+#if CUDA_VERSION >= 11020
+        }
+#endif
         return ptr;
     case UCS_MEMORY_TYPE_CUDA_MANAGED:
         CUDA_CALL(cudaMallocManaged(&ptr, size), ": size=" << size);
@@ -258,7 +267,7 @@ void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
     }
 }
 
-void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type)
+void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type, bool async)
 {
     try {
         switch (mem_type) {
@@ -267,6 +276,17 @@ void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type)
             break;
 #if HAVE_CUDA
         case UCS_MEMORY_TYPE_CUDA:
+#if CUDA_VERSION >= 11020
+            if (async) {
+                cudaStreamSynchronize(0);
+                CUDA_CALL(cudaFreeAsync(ptr, 0), ": ptr=" << ptr);
+            } else {
+#endif
+                CUDA_CALL(cudaFree(ptr), ": ptr=" << ptr);
+#if CUDA_VERSION >= 11020
+            }
+#endif
+            break;
         case UCS_MEMORY_TYPE_CUDA_MANAGED:
             CUDA_CALL(cudaFree(ptr), ": ptr=" << ptr);
             break;
diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h
index 25e5cb166dd..511de6fbb48 100644
--- a/test/gtest/common/mem_buffer.h
+++ b/test/gtest/common/mem_buffer.h
@@ -28,10 +28,12 @@ class mem_buffer {
     static bool is_mem_type_supported(ucs_memory_type_t mem_type);
 
     /* allocate buffer of a given memory type */
-    static void *allocate(size_t size, ucs_memory_type_t mem_type);
+    static void *allocate(size_t size, ucs_memory_type_t mem_type,
+                          bool async = false);
 
     /* release buffer of a given memory type */
-    static void release(void *ptr, ucs_memory_type_t mem_type);
+    static void release(void *ptr, ucs_memory_type_t mem_type,
+                        bool async = false);
 
     /* fill pattern in a host-accessible buffer */
     static void pattern_fill(void *buffer, size_t length, uint64_t seed);
diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc
index 88f1d547fb3..7e62f41ce04 100644
--- a/test/gtest/ucp/test_ucp_am.cc
+++ b/test/gtest/ucp/test_ucp_am.cc
@@ -366,6 +366,16 @@ class test_ucp_am_nbx : public test_ucp_am_base {
         return UCS_MEMORY_TYPE_HOST;
     }
 
+    virtual bool tx_memtype_async() const
+    {
+        return false;
+    }
+
+    virtual bool rx_memtype_async() const
+    {
+        return false;
+    }
+
     void reset_counters()
     {
         m_send_counter = 0;
@@ -495,8 +505,9 @@ class test_ucp_am_nbx : public test_ucp_am_base {
                            unsigned flags = 0, unsigned data_cb_flags = 0,
                            uint32_t op_attr_mask = 0)
     {
-        mem_buffer sbuf(size, tx_memtype());
-        sbuf.pattern_fill(SEED);
+        auto sbuf = mem_buffer::allocate(size, tx_memtype(),
+                                         tx_memtype_async());
+        mem_buffer::pattern_fill(sbuf, size, SEED, tx_memtype());
         m_hdr.resize(header_size);
         ucs::fill_random(m_hdr);
         reset_counters();
@@ -505,10 +516,10 @@ class test_ucp_am_nbx : public test_ucp_am_base {
         set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_cb, this,
                             data_cb_flags);
 
-        ucp::data_type_desc_t sdt_desc(m_dt, sbuf.ptr(), size);
+        ucp::data_type_desc_t sdt_desc(m_dt, sbuf, size);
 
         if (prereg()) {
-            memh = sender().mem_map(sbuf.ptr(), size);
+            memh = sender().mem_map(sbuf, size);
         }
 
         ucs_status_ptr_t sptr = send_am(sdt_desc, get_send_flag() | flags,
@@ -522,6 +533,7 @@ class test_ucp_am_nbx : public test_ucp_am_base {
             sender().mem_unmap(memh);
         }
 
+        mem_buffer::release(sbuf, tx_memtype(), tx_memtype_async());
         EXPECT_EQ(m_recv_counter, m_send_counter);
     }
 
@@ -562,7 +574,8 @@ class test_ucp_am_nbx : public test_ucp_am_base {
     {
         ucs_status_t status;
 
-        m_rx_buf = mem_buffer::allocate(length, rx_memtype());
+        m_rx_buf = mem_buffer::allocate(length, rx_memtype(),
+                                        rx_memtype_async());
         mem_buffer::pattern_fill(m_rx_buf, length, 0ul, rx_memtype());
 
         m_rx_dt_desc.make(m_rx_dt, m_rx_buf, length);
@@ -638,7 +651,7 @@ class test_ucp_am_nbx : public test_ucp_am_base {
         if (m_rx_memh != NULL) {
            receiver().mem_unmap(m_rx_memh);
         }
-        mem_buffer::release(m_rx_buf, rx_memtype());
+        mem_buffer::release(m_rx_buf, rx_memtype(), rx_memtype_async());
     }
 
     static ucs_status_t am_data_cb(void *arg, const void *header,
@@ -1358,10 +1371,7 @@ class test_ucp_am_nbx_eager_memtype : public test_ucp_am_nbx_prereg {
 private:
     static void base_test_generator(variant_vec_t &variants)
     {
-        // 1. Do not instantiate test case if no GPU memtypes supported.
-        // 2. Do not exclude host memory type, because this generator is used by
-        //    test_ucp_am_nbx_rndv_memtype class to generate combinations like
-        //    host<->cuda, cuda-managed<->host, etc.
+        // Do not instantiate test case if no GPU memtypes supported.
         if (!mem_buffer::is_gpu_supported()) {
             return;
         }
@@ -1890,10 +1900,7 @@ class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv {
 public:
     static void get_test_variants(variant_vec_t &variants)
     {
-        // Test will not be instantiated if no GPU memtypes supported, because
-        // of the check for supported memory types in
-        // test_ucp_am_nbx_eager_memtype::get_test_variants
-        return test_ucp_am_nbx_eager_memtype::get_test_variants(variants);
+        add_variant_memtypes(variants, base_test_generator);
     }
 
     void init() override
@@ -1902,6 +1909,28 @@ class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv {
     }
 
 private:
+    static void base_test_generator(variant_vec_t &variants)
+    {
+        // Do not instantiate test case if no GPU memtypes supported.
+        if (!mem_buffer::is_gpu_supported()) {
+            return;
+        }
+
+        add_variant_memtypes(variants,
+                             test_ucp_am_nbx_prereg::get_test_variants);
+    }
+
+    static void add_variant_memtypes(variant_vec_t &variants,
+                                     get_variants_func_t generator)
+    {
+        ucp_test::add_variant_memtypes(variants, generator);
+
+        if (mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) {
+            add_variant_values(variants, generator, MEMORY_TYPE_CUDA_ASYNC,
+                               "cuda-async");
+        }
+    }
+
     unsigned get_send_flag() const override
     {
         return test_ucp_am_nbx_rndv::get_send_flag() | UCP_AM_SEND_FLAG_RNDV;
@@ -1909,13 +1938,43 @@ class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv {
 
     ucs_memory_type_t tx_memtype() const override
     {
-        return static_cast<ucs_memory_type_t>(get_variant_value(2));
+        return variant_value_to_mem_type(2);
     }
 
     ucs_memory_type_t rx_memtype() const override
     {
-        return static_cast<ucs_memory_type_t>(get_variant_value(3));
+        return variant_value_to_mem_type(3);
     }
+
+    bool tx_memtype_async() const override
+    {
+        return get_variant_value(2) == MEMORY_TYPE_CUDA_ASYNC;
+    }
+
+    bool rx_memtype_async() const override
+    {
+        return get_variant_value(3) == MEMORY_TYPE_CUDA_ASYNC;
+    }
+
+    ucs_memory_type_t variant_value_to_mem_type(unsigned index) const
+    {
+        auto variant_value = get_variant_value(index);
+        switch (variant_value) {
+        case UCS_MEMORY_TYPE_HOST:
+        case UCS_MEMORY_TYPE_CUDA:
+        case UCS_MEMORY_TYPE_CUDA_MANAGED:
+        case UCS_MEMORY_TYPE_ROCM:
+        case UCS_MEMORY_TYPE_ROCM_MANAGED:
+            return static_cast<ucs_memory_type_t>(variant_value);
+        case MEMORY_TYPE_CUDA_ASYNC:
+            return UCS_MEMORY_TYPE_CUDA;
+        default:
+            UCS_TEST_ABORT("invalid memory type");
+            return UCS_MEMORY_TYPE_HOST;
+        }
+    }
+
+    static const int MEMORY_TYPE_CUDA_ASYNC = UCS_MEMORY_TYPE_LAST + 1;
 };
 
 UCS_TEST_P(test_ucp_am_nbx_rndv_memtype, rndv)
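Below is a minimal standalone sketch of the stream-ordered allocation pattern the patch wires into mem_buffer (a sketch only, assuming CUDA 11.2+ for cudaMallocAsync/cudaFreeAsync; the CHECK macro and file name are hypothetical stand-ins for the test suite's CUDA_CALL helper, not part of the patch):

/* sketch.cu -- compile with: nvcc sketch.cu -o sketch (CUDA 11.2+) */
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

/* Hypothetical stand-in for the gtest CUDA_CALL macro: abort on error */
#define CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s: %s\n", #call, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

int main()
{
    void *ptr = NULL;

    /* Stream-ordered allocation on the default stream (0); synchronize so
     * the pointer can be used outside that stream, as allocate() does */
    CHECK(cudaMallocAsync(&ptr, 4096, 0));
    CHECK(cudaStreamSynchronize(0));

    /* ... use ptr ... */

    /* Drain outstanding work, then free in stream order, as release() does */
    CHECK(cudaStreamSynchronize(0));
    CHECK(cudaFreeAsync(ptr, 0));
    CHECK(cudaStreamSynchronize(0));
    return 0;
}

The synchronization mirrors the allocate()/release() paths above: memory obtained from the default stream's pool is only guaranteed usable, and safe to free, once the stream has progressed past the allocation and past any work touching the buffer.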