From fa32b7a4dd9a246feae71f1f25d471bfb98b6dea Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Wed, 8 Nov 2023 12:30:48 -0800
Subject: [PATCH] implementing C++ part for np.select

---
 cunumeric/eager.py                       |   2 +-
 src/cunumeric/index/select.cc            |  59 +++++++++++
 src/cunumeric/index/select.cu            |  88 +++++++++++++++-
 src/cunumeric/index/select.h             |  24 +++++
 src/cunumeric/index/select_omp.cc        |  61 +++++++++++
 src/cunumeric/index/select_template.inl  |  64 ++++++++++++
 tests/integration/test_index_routines.py | 126 ++++++++++++++++++++++-
 7 files changed, 417 insertions(+), 7 deletions(-)

diff --git a/cunumeric/eager.py b/cunumeric/eager.py
index 56a056b6f..91efd777a 100644
--- a/cunumeric/eager.py
+++ b/cunumeric/eager.py
@@ -644,7 +644,7 @@ def select(
                 default,
             )
         else:
-            self.array[:] = np.select(
+            self.array[...] = np.select(
                 tuple(c.array for c in condlist),
                 tuple(c.array for c in choicelist),
                 default,
diff --git a/src/cunumeric/index/select.cc b/src/cunumeric/index/select.cc
index 87e3983aa..3ebd52746 100644
--- a/src/cunumeric/index/select.cc
+++ b/src/cunumeric/index/select.cc
@@ -16,3 +16,62 @@
 
 #include "cunumeric/index/select.h"
 #include "cunumeric/index/select_template.inl"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::CPU, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t volume = rect.volume();
+    uint32_t narrays    = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+
+    if (dense) {
+      auto outptr = out.ptr(rect);
+      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        auto condptr   = condlist[c].ptr(rect);
+        auto choiceptr = choicelist[c].ptr(rect);
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          if (condptr[idx]) outptr[idx] = choiceptr[idx];
+        }
+      }
+    } else {
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        out[p] = default_val;
+      }
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          auto p = pitches.unflatten(idx, rect.lo);
+          if (condlist[c][p]) out[p] = choicelist[c][p];
+        }
+      }
+    }
+  }
+};
+
+/*static*/ void SelectTask::cpu_variant(TaskContext& context)
+{
+  select_template<VariantKind::CPU>(context);
+}
+
+namespace  // unnamed
+{
+static void __attribute__((constructor)) register_tasks(void) { SelectTask::register_variants(); }
+}  // namespace
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select.cu b/src/cunumeric/index/select.cu
index a03a4dcda..e99fac0e9 100644
--- a/src/cunumeric/index/select.cu
+++ b/src/cunumeric/index/select.cu
@@ -31,13 +31,91 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
 {
   const size_t idx = global_tid_1d();
   if (idx >= volume) return;
-  for (uint32_t c = 0; c < narrays; ++c) {
-    if (condlist[c][idx]) {
-      outptr[idx] = choicelist[c][idx];
-      return;
+  outptr[idx] = default_val;
+  for (int32_t c = (narrays - 1); c >= 0; c--) {
+    if (condlist[c][idx]) { outptr[idx] = choicelist[c][idx]; }
+  }
+}
+
+template <typename VAL, int DIM>
+__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+  select_kernel(const AccessorWO<VAL, DIM> out,
+                uint32_t narrays,
+                const legate::Buffer<AccessorRO<bool, DIM>> condlist,
+                const legate::Buffer<AccessorRO<VAL, DIM>> choicelist,
+                VAL default_val,
+                const Rect<DIM> rect,
+                const Pitches<DIM - 1> pitches,
+                int out_size,
+                int volume)
+{
+  const size_t tid = global_tid_1d();
+  if (tid >= out_size) return;
+  for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
+    auto p = pitches.unflatten(idx, rect.lo);
+    out[p] = default_val;
+  }
+  // no barrier needed: each index is initialized and overwritten by the same thread
+  for (int32_t c = (narrays - 1); c >= 0; c--) {
+    for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
+      auto p = pitches.unflatten(idx, rect.lo);
+      if (condlist[c][p]) { out[p] = choicelist[c][p]; }
     }
   }
-  outptr[idx] = default_val;
+}
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::GPU, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t out_size = rect.hi[0] - rect.lo[0] + 1;
+    uint32_t narrays      = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+    const size_t blocks = (out_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+    auto stream = get_cached_stream();
+    if (dense && (DIM <= 1 || rect.volume() == 0)) {
+      auto cond_arr = create_buffer<const bool*>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx].ptr(rect);
+      auto choice_arr =
+        create_buffer<const VAL*>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < choicelist.size(); ++idx)
+        choice_arr[idx] = choicelist[idx].ptr(rect);
+      VAL* outptr = out.ptr(rect);
+      select_kernel_dense<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        outptr, narrays, cond_arr, choice_arr, default_val, out_size);
+    } else {
+      auto cond_arr =
+        create_buffer<AccessorRO<bool, DIM>>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx];
+
+      auto choice_arr =
+        create_buffer<AccessorRO<VAL, DIM>>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
+      for (uint32_t idx = 0; idx < choicelist.size(); ++idx) choice_arr[idx] = choicelist[idx];
+      if (out_size == 0) return;
+      select_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+        out, narrays, cond_arr, choice_arr, default_val, rect, pitches, out_size, rect.volume());
+    }
+
+    CHECK_CUDA_STREAM(stream);
+  }
+};
+
+/*static*/ void SelectTask::gpu_variant(TaskContext& context)
+{
+  select_template<VariantKind::GPU>(context);
 }
 
 }  // namespace cunumeric
diff --git a/src/cunumeric/index/select.h b/src/cunumeric/index/select.h
index 9d14df5bb..2179e12b7 100644
--- a/src/cunumeric/index/select.h
+++ b/src/cunumeric/index/select.h
@@ -17,3 +17,27 @@
 #pragma once
 
 #include "cunumeric/cunumeric.h"
+
+namespace cunumeric {
+
+struct SelectArgs {
+  const Array& out;
+  const std::vector<Array>& inputs;
+  const legate::Scalar& default_value;
+};
+
+class SelectTask : public CuNumericTask<SelectTask> {
+ public:
+  static const int TASK_ID = CUNUMERIC_SELECT;
+
+ public:
+  static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context);
+#endif
+#ifdef LEGATE_USE_CUDA
+  static void gpu_variant(legate::TaskContext& context);
+#endif
+};
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select_omp.cc b/src/cunumeric/index/select_omp.cc
index 87e3983aa..f015208a7 100644
--- a/src/cunumeric/index/select_omp.cc
+++ b/src/cunumeric/index/select_omp.cc
@@ -16,3 +16,64 @@
 
 #include "cunumeric/index/select.h"
 #include "cunumeric/index/select_template.inl"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE, int DIM>
+struct SelectImplBody<VariantKind::OMP, CODE, DIM> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(const AccessorWO<VAL, DIM>& out,
+                  const std::vector<AccessorRO<bool, DIM>>& condlist,
+                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
+                  VAL default_val,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense) const
+  {
+    const size_t volume = rect.volume();
+    uint32_t narrays    = condlist.size();
+#ifdef DEBUG_CUNUMERIC
+    assert(narrays == choicelist.size());
+#endif
+
+    if (dense && DIM <= 1) {
+      auto outptr = out.ptr(rect);
+#pragma omp parallel for schedule(static)
+      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+        auto condptr   = condlist[c].ptr(rect);
+        auto choiceptr = choicelist[c].ptr(rect);
+#pragma omp parallel for schedule(static)
+        for (int32_t idx = (volume - 1); idx >= 0; idx--) {
+          if (condptr[idx]) outptr[idx] = choiceptr[idx];
+        }
+      }
+    } else {
+#pragma omp parallel for schedule(static)
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        out[p] = default_val;
+      }
+      const size_t out_size = rect.hi[0] - rect.lo[0] + 1;
+      for (int32_t c = (narrays - 1); c >= 0; c--) {
+#pragma omp parallel for schedule(static)
+        for (int32_t out_idx = 0; out_idx < out_size; out_idx++) {
+          for (int32_t idx = (volume - out_size + out_idx); idx >= 0; idx -= out_size) {
+            auto p = pitches.unflatten(idx, rect.lo);
+            if (condlist[c][p]) out[p] = choicelist[c][p];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*static*/ void SelectTask::omp_variant(TaskContext& context)
+{
+  select_template<VariantKind::OMP>(context);
+}
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/index/select_template.inl b/src/cunumeric/index/select_template.inl
index a6019594c..c37ff19aa 100644
--- a/src/cunumeric/index/select_template.inl
+++ b/src/cunumeric/index/select_template.inl
@@ -19,3 +19,67 @@
 // Useful for IDEs
 #include "cunumeric/index/select.h"
 #include "cunumeric/pitches.h"
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <VariantKind KIND, Type::Code CODE, int DIM>
+struct SelectImplBody;
+
+template <VariantKind KIND>
+struct SelectImpl {
+  template <Type::Code CODE, int DIM>
+  void operator()(SelectArgs& args) const
+  {
+    using VAL     = legate_type_of<CODE>;
+    auto out_rect = args.out.shape<DIM>();
+
+    Pitches<DIM - 1> pitches;
+    size_t volume = pitches.flatten(out_rect);
+    if (volume == 0) return;
+
+    auto out = args.out.write_accessor<VAL, DIM>(out_rect);
+
+#ifndef LEGATE_BOUNDS_CHECKS
+    // Check to see if this is dense or not
+    bool dense = out.accessor.is_dense_row_major(out_rect);
+#else
+    // No dense execution if we're doing bounds checks
+    bool dense = false;
+#endif
+
+    std::vector<AccessorRO<bool, DIM>> condlist;
+    for (size_t i = 0; i < args.inputs.size() / 2; i++) {
+      auto rect_c = args.inputs[i].shape<DIM>();
+#ifdef DEBUG_CUNUMERIC
+      assert(rect_c == out_rect);
+#endif
+      condlist.push_back(args.inputs[i].read_accessor<bool, DIM>(rect_c));
+      dense = dense && condlist[i].accessor.is_dense_row_major(out_rect);
+    }
+
+    std::vector<AccessorRO<VAL, DIM>> choicelist;
+    for (size_t i = args.inputs.size() / 2; i < args.inputs.size(); i++) {
+      auto rect_c = args.inputs[i].shape<DIM>();
+#ifdef DEBUG_CUNUMERIC
+      assert(rect_c == out_rect);
+#endif
+      choicelist.push_back(args.inputs[i].read_accessor<VAL, DIM>(rect_c));
+      dense = dense && choicelist[i - args.inputs.size() / 2].accessor.is_dense_row_major(out_rect);
+    }
+
+    VAL default_value = args.default_value.value<VAL>();
+    SelectImplBody<KIND, CODE, DIM>()(
+      out, condlist, choicelist, default_value, out_rect, pitches, dense);
+  }
+};
+
+template <VariantKind KIND>
+static void select_template(TaskContext& context)
+{
+  SelectArgs args{context.outputs()[0], context.inputs(), context.scalars()[0]};
+  double_dispatch(args.out.dim(), args.out.code(), SelectImpl<KIND>{}, args);
+}
+
+}  // namespace cunumeric
diff --git a/tests/integration/test_index_routines.py b/tests/integration/test_index_routines.py
index 2068bb1cd..64ee32be6 100644
--- a/tests/integration/test_index_routines.py
+++ b/tests/integration/test_index_routines.py
@@ -268,7 +268,131 @@ def test_out_invalid_shape(self):
             num.choose(self.a, self.choices, out=aout)
 
 
-# TODO: test select
+DIM = 7
+
+SELECT_SHAPES = (
+    (DIM,),
+    (1, 1),
+    (1, DIM),
+    (DIM, 1),
+    (DIM, 0),
+    (DIM, DIM),
+    (1, 1, 1),
+    (1, 0, DIM),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+    (DIM, DIM, DIM),
+)
+
+DEFAULTS = (0, -100, 5)
+
+
+@pytest.mark.parametrize("size", SELECT_SHAPES)
+def test_select(size):
+    # test with 2 conditions/choices + no default passed
+    arr = np.random.randint(-15, 15, size=size)
+    cond_np1 = arr > 1
+    cond_num1 = num.array(cond_np1)
+    cond_np2 = arr < 0
+    cond_num2 = num.array(cond_np2)
+    choice_np1 = arr * 10
+    choice_num1 = num.array(choice_np1)
+    choice_np2 = arr * 2
+    choice_num2 = num.array(choice_np2)
+    res_np = np.select(
+        (
+            cond_np1,
+            cond_np2,
+        ),
+        (
+            choice_np1,
+            choice_np2,
+        ),
+    )
+    res_num = num.select(
+        (
+            cond_num1,
+            cond_num2,
+        ),
+        (
+            choice_num1,
+            choice_num2,
+        ),
+    )
+    assert np.array_equal(res_np, res_num)
+
+    # test with all False
+    cond_np = arr > 100
+    cond_num = num.array(cond_np)
+    choice_np = arr * 100
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np)
+    res_num = num.select(cond_num, choice_num)
+    assert np.array_equal(res_np, res_num)
+
+    # test with all True
+    cond_np = arr < 100
+    cond_num = num.array(cond_np)
+    choice_np = arr * 10
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np)
+    res_num = num.select(cond_num, choice_num)
+    assert np.array_equal(res_np, res_num)
+
+
+def test_select_maxdim():
+    for ndim in range(2, LEGATE_MAX_DIM + 1):
+        a_shape = tuple(np.random.randint(1, 9) for i in range(ndim))
+        arr = mk_seq_array(np, a_shape)
+        condlist_np = list()
+        choicelist_np = list()
+        condlist_num = list()
+        choicelist_num = list()
+        nlist = np.random.randint(1, 5)
+        for nl in range(0, nlist):
+            arr_con = arr > nl * 2
+            arr_ch = arr * nl
+            condlist_np += (arr_con,)
+            choicelist_np += (arr_ch,)
+            condlist_num += (num.array(arr_con),)
+            choicelist_num += (num.array(arr_ch),)
+        res_np = np.select(condlist_np, choicelist_np)
+        res_num = num.select(condlist_num, choicelist_num)
+        assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("size", SELECT_SHAPES)
+@pytest.mark.parametrize("default", DEFAULTS)
+def test_select_default(size, default):
+    arr_np = np.random.randint(-5, 5, size=size)
+    cond_np = arr_np > 1
+    cond_num = num.array(cond_np)
+    choice_np = arr_np**2
+    choice_num = num.array(choice_np)
+    res_np = np.select(cond_np, choice_np, default)
+    res_num = num.select(cond_num, choice_num, default)
+    assert np.array_equal(res_np, res_num)
+
+
+SELECT_ZERO_SHAPES = (
+    (0,),
+    (0, 1),
+)
+
+
+@pytest.mark.parametrize("size", SELECT_ZERO_SHAPES)
+def test_select_zero_shape(size):
+    arr_np = np.random.randint(-15, 15, size=size)
+    cond_np = arr_np > 1
+    cond_num = num.array(cond_np)
+    choice_np = arr_np * 10
+    choice_num = num.array(choice_np)
+    msg = "select with an empty condition list is not possible"
+    with pytest.raises(ValueError, match=msg):
+        np.select(cond_np, choice_np)
+    with pytest.raises(ValueError, match=msg):
+        num.select(cond_num, choice_num)
 
 
 def test_diagonal():