From fa32b7a4dd9a246feae71f1f25d471bfb98b6dea Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 8 Nov 2023 12:30:48 -0800
Subject: [PATCH] implementing C++ part for np.select

---
 cunumeric/eager.py                       |   2 +-
 src/cunumeric/index/select.cc            |  59 +++++++++++
 src/cunumeric/index/select.cu            |  88 +++++++++++++++-
 src/cunumeric/index/select.h             |  24 +++++
 src/cunumeric/index/select_omp.cc        |  61 +++++++++++
 src/cunumeric/index/select_template.inl  |  64 ++++++++++++
 tests/integration/test_index_routines.py | 126 ++++++++++++++++++++++-
 7 files changed, 417 insertions(+), 7 deletions(-)

diff --git a/cunumeric/eager.py b/cunumeric/eager.py
index 56a056b6f..91efd777a 100644
--- a/cunumeric/eager.py
+++ b/cunumeric/eager.py
@@ -644,7 +644,7 @@ def select(
                 default,
             )
         else:
-            self.array[:] = np.select(
+            self.array[...] = np.select(
                 tuple(c.array for c in condlist),
                 tuple(c.array for c in choicelist),
                 default,
diff --git a/src/cunumeric/index/select.cc b/src/cunumeric/index/select.cc
index 87e3983aa..3ebd52746 100644
--- a/src/cunumeric/index/select.cc
+++ b/src/cunumeric/index/select.cc
@@ -16,3 +16,62 @@
 
 #include "cunumeric/index/select.h"
 #include "cunumeric/index/select_template.inl"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::CPU, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t volume = rect.volume();
+    uint32_t narrays    = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+
+    if (dense) {
+      auto outptr = out.ptr(rect);
+      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        auto condptr   = condlist[c].ptr(rect);
+        auto choiceptr = choicelist[c].ptr(rect);
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          if (condptr[idx]) outptr[idx] = choiceptr[idx];
+        }
+      }
+    } else {
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        out[p] = default_val;
+      }
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          auto p = pitches.unflatten(idx, rect.lo);
+          if (condlist[c][p]) out[p] = choicelist[c][p];
+        }
+      }
+    }
+  }
+};
+
+/*static*/ void SelectTask::cpu_variant(TaskContext& context)
+{
+  select_template<VariantKind::CPU>(context);
+}
+
+namespace  // unnamed
+{
+static void __attribute__((constructor)) register_tasks(void) { SelectTask::register_variants(); }
+}  // namespace
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select.cu b/src/cunumeric/index/select.cu
index a03a4dcda..e99fac0e9 100644
--- a/src/cunumeric/index/select.cu
+++ b/src/cunumeric/index/select.cu
@@ -31,13 +31,91 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
-  for (uint32_t c = 0; c < narrays; ++c) {
-    if (condlist[c][idx]) {
-      outptr[idx] = choicelist[c][idx];
-      return;
+  outptr[idx] = default_val;
+  for (int32_t c = (narrays - 1); c >= 0; c--) {
+    if (condlist[c][idx]) { outptr[idx] = choicelist[c][idx]; }
+  }
+}
+
+template <typename VAL, int DIM>
+__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+  select_kernel(const AccessorWO<VAL, DIM> out,
+                uint32_t narrays,
+                const legate::Buffer<AccessorRO<bool, DIM>> condlist,
+                const legate::Buffer<AccessorRO<VAL, DIM>> choicelist,
+                VAL default_val,
+                const Rect<DIM> rect,
+                const Pitches<DIM - 1> pitches,
+                int out_size,
+                int volume)
+{
+  const size_t tid = global_tid_1d();
+  if (tid >= out_size) return;
+  for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
+    auto p = pitches.unflatten(idx, rect.lo);
+    out[p] = default_val;
+  }
+  // no barrier needed: each index is initialized and overwritten by the same thread
+  for (int32_t c = (narrays - 1); c >= 0; c--) {
+    for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
+      auto p = pitches.unflatten(idx, rect.lo);
+      if (condlist[c][p]) { out[p] = choicelist[c][p]; }
     }
   }
-  outptr[idx] = default_val;
+}
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::GPU, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t out_size = rect.hi[0] - rect.lo[0] + 1;
+    uint32_t narrays      = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+    const size_t blocks = (out_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+    auto stream = get_cached_stream();
+    if (dense && (DIM <= 1 || rect.volume() == 0)) {
+      auto cond_arr = create_buffer<const bool*>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx].ptr(rect);
+      auto choice_arr =
+        create_buffer<const VAL*>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < choicelist.size(); ++idx)
+        choice_arr[idx] = choicelist[idx].ptr(rect);
+      VAL* outptr = out.ptr(rect);
+      select_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        outptr, narrays, cond_arr, choice_arr, default_val, out_size);
+    } else {
+      auto cond_arr =
+        create_buffer<AccessorRO<bool, DIM>>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx];
+
+      auto choice_arr =
+        create_buffer<AccessorRO<VAL, DIM>>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < choicelist.size(); ++idx) choice_arr[idx] = choicelist[idx];
+      if (out_size == 0) return;
+      select_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        out, narrays, cond_arr, choice_arr, default_val, rect, pitches, out_size, rect.volume());
+    }
+
+    CHECK_CUDA_STREAM(stream);
+  }
+};
+
+/*static*/ void SelectTask::gpu_variant(TaskContext& context)
+{
+  select_template<VariantKind::GPU>(context);
 }
 
 }  // namespace cunumeric
diff --git a/src/cunumeric/index/select.h b/src/cunumeric/index/select.h
index 9d14df5bb..2179e12b7 100644
--- a/src/cunumeric/index/select.h
+++ b/src/cunumeric/index/select.h
@@ -17,3 +17,27 @@
 #pragma once
 
 #include "cunumeric/cunumeric.h"
+
+namespace cunumeric {
+
+struct SelectArgs {
+  const Array& out;
+  const std::vector<Array>& inputs;
+  const legate::Scalar& default_value;
+};
+
+class SelectTask : public CuNumericTask<SelectTask> {
+ public:
+  static const int TASK_ID = CUNUMERIC_SELECT;
+
+ public:
+  static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context);
+#endif
+#ifdef LEGATE_USE_CUDA
+  static void gpu_variant(legate::TaskContext& context);
+#endif
+};
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select_omp.cc b/src/cunumeric/index/select_omp.cc
index 87e3983aa..f015208a7 100644
--- a/src/cunumeric/index/select_omp.cc
+++ b/src/cunumeric/index/select_omp.cc
@@ -16,3 +16,64 @@
 
 #include "cunumeric/index/select.h"
 #include "cunumeric/index/select_template.inl"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::OMP, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t volume = rect.volume();
+    uint32_t narrays    = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+
+    if (dense && DIM <= 1) {
+      auto outptr = out.ptr(rect);
+#pragma omp parallel for schedule(static)
+      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        auto condptr   = condlist[c].ptr(rect);
+        auto choiceptr = choicelist[c].ptr(rect);
+#pragma omp parallel for schedule(static)
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          if (condptr[idx]) outptr[idx] = choiceptr[idx];
+        }
+      }
+    } else {
+#pragma omp parallel for schedule(static)
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        out[p] = default_val;
+      }
+      const size_t out_size = rect.hi[0] - rect.lo[0] + 1;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+#pragma omp parallel for schedule(static)
+        for (int32_t out_idx = 0; out_idx < out_size; out_idx++) {
+          for (int32_t idx = (volume - out_size + out_idx); idx >= 0; idx -= out_size) {
+            auto p = pitches.unflatten(idx, rect.lo);
+            if (condlist[c][p]) out[p] = choicelist[c][p];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*static*/ void SelectTask::omp_variant(TaskContext& context)
+{
+  select_template<VariantKind::OMP>(context);
+}
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select_template.inl b/src/cunumeric/index/select_template.inl
index a6019594c..c37ff19aa 100644
--- a/src/cunumeric/index/select_template.inl
+++ b/src/cunumeric/index/select_template.inl
@@ -19,3 +19,67 @@
 // Useful for IDEs
 #include "cunumeric/index/select.h"
 #include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <VariantKind KIND, Type::Code CODE, int DIM>
+struct SelectImplBody;
+
+template <VariantKind KIND>
+struct SelectImpl {
+  template <Type::Code CODE, int DIM>
+  void operator()(SelectArgs& args) const
+  {
+    using VAL     = legate_type_of<CODE>;
+    auto out_rect = args.out.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(out_rect);
+    if (volume == 0) return;
+
+    auto out = args.out.write_accessor<VAL, DIM>(out_rect);
+
+#ifndef LEGATE_BOUNDS_CHECKS
+    // Check to see if this is dense or not
+    bool dense = out.accessor.is_dense_row_major(out_rect);
+#else
+    // No dense execution if we're doing bounds checks
+    bool dense = false;
+#endif
+
+    std::vector<AccessorRO<bool, DIM>> condlist;
+    for (size_t i = 0; i < args.inputs.size() / 2; i++) {
+      auto rect_c = args.inputs[i].shape<DIM>();
+#ifdef DEBUG_CUNUMERIC
+      assert(rect_c == out_rect);
+#endif
+      condlist.push_back(args.inputs[i].read_accessor<bool, DIM>(rect_c));
+      dense = dense && condlist[i].accessor.is_dense_row_major(out_rect);
+    }
+
+    std::vector<AccessorRO<VAL, DIM>> choicelist;
+    for (size_t i = args.inputs.size() / 2; i < args.inputs.size(); i++) {
+      auto rect_c = args.inputs[i].shape<DIM>();
+#ifdef DEBUG_CUNUMERIC
+      assert(rect_c == out_rect);
+#endif
+      choicelist.push_back(args.inputs[i].read_accessor<VAL, DIM>(rect_c));
+      dense = dense && choicelist[i - args.inputs.size() / 2].accessor.is_dense_row_major(out_rect);
+    }
+
+    VAL default_value = args.default_value.value<VAL>();
+    SelectImplBody<KIND, CODE, DIM>()(
+      out, condlist, choicelist, default_value, out_rect, pitches, dense);
+  }
+};
+
+template <VariantKind KIND>
+static void select_template(TaskContext& context)
+{
+  SelectArgs args{context.outputs()[0], context.inputs(), context.scalars()[0]};
+  double_dispatch(args.out.dim(), args.out.code(), SelectImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/integration/test_index_routines.py b/tests/integration/test_index_routines.py
index 2068bb1cd..64ee32be6 100644
--- a/tests/integration/test_index_routines.py
+++ b/tests/integration/test_index_routines.py
@@ -268,7 +268,131 @@ def test_out_invalid_shape(self):
             num.choose(self.a, self.choices, out=aout)
 
 
-# TODO: test select
+DIM = 7
+
+SELECT_SHAPES = (
+    (DIM,),
+    (1, 1),
+    (1, DIM),
+    (DIM, 1),
+    (DIM, 0),
+    (DIM, DIM),
+    (1, 1, 1),
+    (1, 0, DIM),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+    (DIM, DIM, DIM),
+)
+
+DEFAULTS = (0, -100, 5)
+
+
+@pytest.mark.parametrize("size", SELECT_SHAPES)
+def test_select(size):
+    # test with 2 conditions/choices + no default passed
+    arr = np.random.randint(-15, 15, size=size)
+    cond_np1 = arr > 1
+    cond_num1 = num.array(cond_np1)
+    cond_np2 = arr < 0
+    cond_num2 = num.array(cond_np2)
+    choice_np1 = arr * 10
+    choice_num1 = num.array(choice_np1)
+    choice_np2 = arr * 2
+    choice_num2 = num.array(choice_np2)
+    res_np = np.select(
+        (
+            cond_np1,
+            cond_np2,
+        ),
+        (
+            choice_np1,
+            choice_np2,
+        ),
+    )
+    res_num = num.select(
+        (
+            cond_num1,
+            cond_num2,
+        ),
+        (
+            choice_num1,
+            choice_num2,
+        ),
+    )
+    assert np.array_equal(res_np, res_num)
+
+    # test with all False
+    cond_np = arr > 100
+    cond_num = num.array(cond_np)
+    choice_np = arr * 100
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np)
+    res_num = num.select(cond_num, choice_num)
+    assert np.array_equal(res_np, res_num)
+
+    # test with all True
+    cond_np = arr < 100
+    cond_num = num.array(cond_np)
+    choice_np = arr * 10
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np)
+    res_num = num.select(cond_num, choice_num)
+    assert np.array_equal(res_np, res_num)
+
+
+def test_select_maxdim():
+    for ndim in range(2, LEGATE_MAX_DIM + 1):
+        a_shape = tuple(np.random.randint(1, 9) for i in range(ndim))
+        arr = mk_seq_array(np, a_shape)
+        condlist_np = list()
+        choicelist_np = list()
+        condlist_num = list()
+        choicelist_num = list()
+        nlist = np.random.randint(1, 5)
+        for nl in range(0, nlist):
+            arr_con = arr > nl * 2
+            arr_ch = arr * nl
+            condlist_np += (arr_con,)
+            choicelist_np += (arr_ch,)
+            condlist_num += (num.array(arr_con),)
+            choicelist_num += (num.array(arr_ch),)
+        res_np = np.select(condlist_np, choicelist_np)
+        res_num = num.select(condlist_num, choicelist_num)
+        assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("size", SELECT_SHAPES)
+@pytest.mark.parametrize("default", DEFAULTS)
+def test_select_default(size, default):
+    arr_np = np.random.randint(-5, 5, size=size)
+    cond_np = arr_np > 1
+    cond_num = num.array(cond_np)
+    choice_np = arr_np**2
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np, default)
+    res_num = num.select(cond_num, choice_num, default)
+    assert np.array_equal(res_np, res_num)
+
+
+SELECT_ZERO_SHAPES = (
+    (0,),
+    (0, 1),
+)
+
+
+@pytest.mark.parametrize("size", SELECT_ZERO_SHAPES)
+def test_select_zero_shape(size):
+    arr_np = np.random.randint(-15, 15, size=size)
+    cond_np = arr_np > 1
+    cond_num = num.array(cond_np)
+    choice_np = arr_np * 10
+    choice_num = num.array(choice_np)
+    msg = "select with an empty condition list is not possible"
+    with pytest.raises(ValueError, match=msg):
+        np.select(cond_np, choice_np)
+    with pytest.raises(ValueError, match=msg):
+        num.select(cond_num, choice_num)
 
 
 def test_diagonal():