Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GTEST/COMMON: Added test for cuda async allocated buffers. #10173

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions test/gtest/common/mem_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ void mem_buffer::get_bar1_free_size_nvml()
#endif
}

void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type, bool async)
iyastreb marked this conversation as resolved.
Show resolved Hide resolved
{
void *ptr;

Expand All @@ -238,7 +238,16 @@ void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
return ptr;
#if HAVE_CUDA
case UCS_MEMORY_TYPE_CUDA:
CUDA_CALL(cudaMalloc(&ptr, size), ": size=" << size);
#if CUDA_VERSION >= 11020
if (async) {
CUDA_CALL(cudaMallocAsync(&ptr, size, 0), ": size=" << size);
cudaStreamSynchronize(0);
} else {
#endif
CUDA_CALL(cudaMalloc(&ptr, size), ": size=" << size);
#if CUDA_VERSION >= 11020
}
#endif
return ptr;
case UCS_MEMORY_TYPE_CUDA_MANAGED:
CUDA_CALL(cudaMallocManaged(&ptr, size), ": size=" << size);
Expand All @@ -258,7 +267,7 @@ void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type)
}
}

void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type)
void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type, bool async)
{
try {
switch (mem_type) {
Expand All @@ -267,6 +276,17 @@ void mem_buffer::release(void *ptr, ucs_memory_type_t mem_type)
break;
#if HAVE_CUDA
case UCS_MEMORY_TYPE_CUDA:
#if CUDA_VERSION >= 11020
if (async) {
cudaStreamSynchronize(0);
CUDA_CALL(cudaFreeAsync(ptr, 0), ": ptr=" << ptr);
} else {
#endif
CUDA_CALL(cudaFree(ptr), ": ptr=" << ptr);
#if CUDA_VERSION >= 11020
}
#endif
break;
case UCS_MEMORY_TYPE_CUDA_MANAGED:
CUDA_CALL(cudaFree(ptr), ": ptr=" << ptr);
break;
Expand Down
6 changes: 4 additions & 2 deletions test/gtest/common/mem_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ class mem_buffer {
static bool is_mem_type_supported(ucs_memory_type_t mem_type);

/* allocate buffer of a given memory type */
static void *allocate(size_t size, ucs_memory_type_t mem_type);
static void *allocate(size_t size, ucs_memory_type_t mem_type,
bool async = false);

/* release buffer of a given memory type */
static void release(void *ptr, ucs_memory_type_t mem_type);
static void release(void *ptr, ucs_memory_type_t mem_type,
bool async = false);

/* fill pattern in a host-accessible buffer */
static void pattern_fill(void *buffer, size_t length, uint64_t seed);
Expand Down
91 changes: 75 additions & 16 deletions test/gtest/ucp/test_ucp_am.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,16 @@ class test_ucp_am_nbx : public test_ucp_am_base {
return UCS_MEMORY_TYPE_HOST;
}

/* Whether the send buffer should be allocated with the stream-ordered
 * (async) CUDA allocator; base tests use the regular allocator, and
 * memtype test variants override this. */
virtual bool tx_memtype_async() const
{
    return false;
}

/* Whether the receive buffer should be allocated with the stream-ordered
 * (async) CUDA allocator; base tests use the regular allocator, and
 * memtype test variants override this. */
virtual bool rx_memtype_async() const
{
    return false;
}

void reset_counters()
{
m_send_counter = 0;
Expand Down Expand Up @@ -495,8 +505,9 @@ class test_ucp_am_nbx : public test_ucp_am_base {
unsigned flags = 0, unsigned data_cb_flags = 0,
uint32_t op_attr_mask = 0)
{
mem_buffer sbuf(size, tx_memtype());
sbuf.pattern_fill(SEED);
auto sbuf = mem_buffer::allocate(size, tx_memtype(),
tx_memtype_async());
mem_buffer::pattern_fill(sbuf, size, SEED, tx_memtype());
m_hdr.resize(header_size);
ucs::fill_random(m_hdr);
reset_counters();
Expand All @@ -505,10 +516,10 @@ class test_ucp_am_nbx : public test_ucp_am_base {
set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_cb, this,
data_cb_flags);

ucp::data_type_desc_t sdt_desc(m_dt, sbuf.ptr(), size);
ucp::data_type_desc_t sdt_desc(m_dt, sbuf, size);

if (prereg()) {
memh = sender().mem_map(sbuf.ptr(), size);
memh = sender().mem_map(sbuf, size);
}

ucs_status_ptr_t sptr = send_am(sdt_desc, get_send_flag() | flags,
Expand All @@ -522,6 +533,7 @@ class test_ucp_am_nbx : public test_ucp_am_base {
sender().mem_unmap(memh);
}

mem_buffer::release(sbuf, tx_memtype(), tx_memtype_async());
EXPECT_EQ(m_recv_counter, m_send_counter);
}

Expand Down Expand Up @@ -562,7 +574,8 @@ class test_ucp_am_nbx : public test_ucp_am_base {
{
ucs_status_t status;

m_rx_buf = mem_buffer::allocate(length, rx_memtype());
m_rx_buf = mem_buffer::allocate(length, rx_memtype(),
rx_memtype_async());
mem_buffer::pattern_fill(m_rx_buf, length, 0ul, rx_memtype());

m_rx_dt_desc.make(m_rx_dt, m_rx_buf, length);
Expand Down Expand Up @@ -638,7 +651,7 @@ class test_ucp_am_nbx : public test_ucp_am_base {
if (m_rx_memh != NULL) {
receiver().mem_unmap(m_rx_memh);
}
mem_buffer::release(m_rx_buf, rx_memtype());
mem_buffer::release(m_rx_buf, rx_memtype(), rx_memtype_async());
}

static ucs_status_t am_data_cb(void *arg, const void *header,
Expand Down Expand Up @@ -1358,10 +1371,7 @@ class test_ucp_am_nbx_eager_memtype : public test_ucp_am_nbx_prereg {
private:
static void base_test_generator(variant_vec_t &variants)
{
// 1. Do not instantiate test case if no GPU memtypes supported.
// 2. Do not exclude host memory type, because this generator is used by
// test_ucp_am_nbx_rndv_memtype class to generate combinations like
// host<->cuda, cuda-managed<->host, etc.
// Do not instantiate test case if no GPU memtypes supported.
if (!mem_buffer::is_gpu_supported()) {
return;
}
Expand Down Expand Up @@ -1890,10 +1900,7 @@ class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv {
public:
/* Generate TX/RX memory type combinations for the rendezvous tests,
 * including the cuda-async pseudo memory type.
 * Note: the diff residue left both the old delegating body and the new
 * one in place; only the new generator call is kept, since this class
 * now owns its own base_test_generator. */
static void get_test_variants(variant_vec_t &variants)
{
    add_variant_memtypes(variants, base_test_generator);
}

void init() override
Expand All @@ -1902,20 +1909,72 @@ class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv {
}

private:
/* Produce the first-level memtype variants (one per supported TX
 * memory type). When no GPU memory type is available, no variants are
 * added, so the whole test case is not instantiated. */
static void base_test_generator(variant_vec_t &variants)
{
    if (mem_buffer::is_gpu_supported()) {
        add_variant_memtypes(variants,
                             test_ucp_am_nbx_prereg::get_test_variants);
    }
}

/* Extend the standard memory type variants with the cuda-async pseudo
 * type, which is only offered when plain CUDA memory is supported. */
static void add_variant_memtypes(variant_vec_t &variants,
                                 get_variants_func_t generator)
{
    ucp_test::add_variant_memtypes(variants, generator);

    if (!mem_buffer::is_mem_type_supported(UCS_MEMORY_TYPE_CUDA)) {
        return;
    }

    add_variant_values(variants, generator, MEMORY_TYPE_CUDA_ASYNC,
                       "cuda-async");
}

/* Force the rendezvous protocol on top of the flags selected by the
 * parent rendezvous test class. */
unsigned get_send_flag() const override
{
    const unsigned base_flags = test_ucp_am_nbx_rndv::get_send_flag();
    return base_flags | UCP_AM_SEND_FLAG_RNDV;
}

/* Memory type of the send buffer. Variant slot 2 may hold the
 * MEMORY_TYPE_CUDA_ASYNC pseudo-value, which must be translated to a
 * real UCS memory type; the stale plain static_cast (diff residue that
 * made the translating call unreachable) is removed. */
ucs_memory_type_t tx_memtype() const override
{
    return variant_value_to_mem_type(2);
}

/* Memory type of the receive buffer. Variant slot 3 may hold the
 * MEMORY_TYPE_CUDA_ASYNC pseudo-value, which must be translated to a
 * real UCS memory type; the stale plain static_cast (diff residue that
 * made the translating call unreachable) is removed. */
ucs_memory_type_t rx_memtype() const override
{
    return variant_value_to_mem_type(3);
}

/* True when the TX variant slot (index 2) selected the cuda-async
 * pseudo memory type, i.e. the send buffer must come from the
 * stream-ordered allocator. */
bool tx_memtype_async() const override
{
    const auto tx_value = get_variant_value(2);
    return tx_value == MEMORY_TYPE_CUDA_ASYNC;
}

/* True when the RX variant slot (index 3) selected the cuda-async
 * pseudo memory type, i.e. the receive buffer must come from the
 * stream-ordered allocator. */
bool rx_memtype_async() const override
{
    const auto rx_value = get_variant_value(3);
    return rx_value == MEMORY_TYPE_CUDA_ASYNC;
}

/* Translate a raw variant value into a real UCS memory type.
 * The cuda-async pseudo-value maps back to plain CUDA memory; any
 * unrecognized value aborts the test. */
ucs_memory_type_t variant_value_to_mem_type(unsigned index) const
{
    const auto value = get_variant_value(index);

    if (value == MEMORY_TYPE_CUDA_ASYNC) {
        return UCS_MEMORY_TYPE_CUDA;
    }

    switch (value) {
    case UCS_MEMORY_TYPE_HOST:
    case UCS_MEMORY_TYPE_CUDA:
    case UCS_MEMORY_TYPE_CUDA_MANAGED:
    case UCS_MEMORY_TYPE_ROCM:
    case UCS_MEMORY_TYPE_ROCM_MANAGED:
        return static_cast<ucs_memory_type_t>(value);
    default:
        UCS_TEST_ABORT("invalid memory type");
        return UCS_MEMORY_TYPE_HOST;
    }
}

/* Pseudo memory type outside the ucs_memory_type_t range: CUDA memory
 * allocated through the stream-ordered (async) allocator. */
static const int MEMORY_TYPE_CUDA_ASYNC = UCS_MEMORY_TYPE_LAST + 1;
};

UCS_TEST_P(test_ucp_am_nbx_rndv_memtype, rndv)
Expand Down
Loading