Skip to content

Commit

Permalink
implementing C++ part for np.select
Browse files Browse the repository at this point in the history
  • Loading branch information
ipdemes committed Nov 8, 2023
1 parent d3a89fa commit fa32b7a
Show file tree
Hide file tree
Showing 7 changed files with 417 additions and 7 deletions.
2 changes: 1 addition & 1 deletion cunumeric/eager.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def select(
default,
)
else:
self.array[:] = np.select(
self.array[...] = np.select(
tuple(c.array for c in condlist),
tuple(c.array for c in choicelist),
default,
Expand Down
59 changes: 59 additions & 0 deletions src/cunumeric/index/select.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,62 @@

#include "cunumeric/index/select.h"
#include "cunumeric/index/select_template.inl"

namespace cunumeric {

using namespace legate;

template <Type::Code CODE, int DIM>
struct SelectImplBody<VariantKind::CPU, CODE, DIM> {
  using VAL = legate_type_of<CODE>;

  // CPU implementation of np.select: every element of `out` starts as
  // `default_val`; condition/choice pairs are then applied from last to
  // first, so for each element the FIRST condition that holds wins.
  //
  // out        - destination accessor
  // condlist   - boolean condition accessors (same shape as `out`)
  // choicelist - value accessors, parallel to `condlist`
  // default_val- value written where no condition holds
  // rect       - the subrectangle this task instance owns
  // pitches    - flattened-index -> point conversion for the sparse path
  // dense      - true when all accessors are dense row-major over `rect`
  void operator()(const AccessorWO<VAL, DIM>& out,
                  const std::vector<AccessorRO<bool, DIM>>& condlist,
                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
                  VAL default_val,
                  const Rect<DIM>& rect,
                  const Pitches<DIM - 1>& pitches,
                  bool dense) const
  {
    const size_t volume  = rect.volume();
    const size_t narrays = condlist.size();
#ifdef DEBUG_CUNUMERIC
    assert(narrays == choicelist.size());
#endif

    if (dense) {
      auto outptr = out.ptr(rect);
      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
      // Reverse condition order makes earlier conditions overwrite later
      // ones; `c-- > 0` also handles narrays == 0 without unsigned wrap.
      for (size_t c = narrays; c-- > 0;) {
        auto condptr   = condlist[c].ptr(rect);
        auto choiceptr = choicelist[c].ptr(rect);
        // size_t ascending loop: elements are independent, and this avoids
        // the int32_t truncation a descending signed counter would hit for
        // volumes larger than INT32_MAX.
        for (size_t idx = 0; idx < volume; ++idx) {
          if (condptr[idx]) outptr[idx] = choiceptr[idx];
        }
      }
    } else {
      for (size_t idx = 0; idx < volume; ++idx) {
        auto p = pitches.unflatten(idx, rect.lo);
        out[p] = default_val;
      }
      for (size_t c = narrays; c-- > 0;) {
        for (size_t idx = 0; idx < volume; ++idx) {
          auto p = pitches.unflatten(idx, rect.lo);
          if (condlist[c][p]) out[p] = choicelist[c][p];
        }
      }
    }
  }
};

// Entry point for the CPU variant of the select task; all real work is done
// by the dim/type-dispatched template shared across variants.
/*static*/ void SelectTask::cpu_variant(TaskContext& context)
{
  select_template<VariantKind::CPU>(context);
}

namespace  // unnamed
{
// Registers all of SelectTask's variants with the runtime when the shared
// library is loaded (GCC/Clang `constructor` attribute runs before main).
static void __attribute__((constructor)) register_tasks(void) { SelectTask::register_variants(); }
}  // namespace

} // namespace cunumeric
88 changes: 83 additions & 5 deletions src/cunumeric/index/select.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,91 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
for (uint32_t c = 0; c < narrays; ++c) {
if (condlist[c][idx]) {
outptr[idx] = choicelist[c][idx];
return;
outptr[idx] = default_val;
for (int32_t c = (narrays - 1); c >= 0; c--) {
if (condlist[c][idx]) { outptr[idx] = choicelist[c][idx]; }
}
}

// Generic (non-dense) np.select kernel.  Launched with `out_size` logical
// threads; thread `tid` owns the disjoint set of flat indices congruent to
// (volume + tid) mod out_size, so no two threads ever write the same element
// and no inter-thread synchronization is needed.  Conditions are applied from
// last to first so that the first matching condition wins per element.
//
// Fixes vs. previous revision: removed a stray trailing `outptr[idx] =
// default_val;` that referenced identifiers not declared in this kernel
// (compile error), and removed a `__syncthreads()` that followed the
// divergent early return (`tid >= out_size`), which is undefined behavior
// and was unnecessary given the disjoint per-thread index sets.
template <typename VAL, int DIM>
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
  select_kernel(const AccessorWO<VAL, DIM> out,
                uint32_t narrays,
                const legate::Buffer<AccessorRO<bool, DIM>> condlist,
                const legate::Buffer<AccessorRO<VAL, DIM>> choicelist,
                VAL default_val,
                const Rect<DIM> rect,
                const Pitches<DIM - 1> pitches,
                int out_size,
                int volume)
{
  const size_t tid = global_tid_1d();
  if (tid >= static_cast<size_t>(out_size)) return;
  // Phase 1: initialize every element this thread owns to the default.
  for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
    auto p = pitches.unflatten(idx, rect.lo);
    out[p] = default_val;
  }
  // Phase 2: overwrite with choices, last condition first.  Safe without a
  // barrier because phase 1 and phase 2 touch the same per-thread index set.
  for (int32_t c = (narrays - 1); c >= 0; c--) {
    for (int32_t idx = (volume - out_size + tid); idx >= 0; idx -= out_size) {
      auto p = pitches.unflatten(idx, rect.lo);
      if (condlist[c][p]) { out[p] = choicelist[c][p]; }
    }
  }
}

using namespace legate;

template <Type::Code CODE, int DIM>
struct SelectImplBody<VariantKind::GPU, CODE, DIM> {
  using VAL = legate_type_of<CODE>;

  // GPU implementation of np.select: initializes `out` to `default_val`,
  // then applies condition/choice pairs so the first matching condition
  // wins.  Accessor/pointer arrays are staged in zero-copy buffers so the
  // host-filled arrays are readable from the device kernels.
  void operator()(const AccessorWO<VAL, DIM>& out,
                  const std::vector<AccessorRO<bool, DIM>>& condlist,
                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
                  VAL default_val,
                  const Rect<DIM>& rect,
                  const Pitches<DIM - 1>& pitches,
                  bool dense) const
  {
    // Extent along dimension 0; for DIM <= 1 this equals rect.volume().
    const size_t out_size = rect.hi[0] - rect.lo[0] + 1;
    uint32_t narrays      = condlist.size();
#ifdef DEBUG_CUNUMERIC
    assert(narrays == choicelist.size());
#endif
    const size_t blocks = (out_size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    auto stream = get_cached_stream();
    // NOTE(review): the `rect.volume() == 0` term looks unreachable here —
    // the dispatcher in select_template.inl returns early when the volume is
    // zero — so this condition is effectively `dense && DIM <= 1`; confirm
    // and consider simplifying.
    if (dense && (DIM <= 1 || rect.volume() == 0)) {
      auto cond_arr = create_buffer<const bool*>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx].ptr(rect);
      auto choice_arr =
        create_buffer<const VAL*>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
      for (uint32_t idx = 0; idx < choicelist.size(); ++idx)
        choice_arr[idx] = choicelist[idx].ptr(rect);
      VAL* outptr = out.ptr(rect);
      // Dense 1-D path: one thread per element (out_size == volume here).
      select_kernel_dense<VAL><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
        outptr, narrays, cond_arr, choice_arr, default_val, out_size);
    } else {
      auto cond_arr =
        create_buffer<AccessorRO<bool, DIM>>(condlist.size(), legate::Memory::Kind::Z_COPY_MEM);
      for (uint32_t idx = 0; idx < condlist.size(); ++idx) cond_arr[idx] = condlist[idx];

      auto choice_arr =
        create_buffer<AccessorRO<VAL, DIM>>(choicelist.size(), legate::Memory::Kind::Z_COPY_MEM);
      for (uint32_t idx = 0; idx < choicelist.size(); ++idx) choice_arr[idx] = choicelist[idx];
      // NOTE(review): this zero-size guard exists only on this branch; the
      // dense branch above would compute blocks == 0 for out_size == 0 —
      // likely unreachable for the same volume-check reason, but confirm.
      if (out_size == 0) return;
      // Generic path: out_size threads, each striding over the flattened
      // volume (see select_kernel).
      select_kernel<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
        out, narrays, cond_arr, choice_arr, default_val, rect, pitches, out_size, rect.volume());
    }

    CHECK_CUDA_STREAM(stream);
  }
};

// Entry point for the GPU variant of the select task; all real work is done
// by the dim/type-dispatched template shared across variants.
/*static*/ void SelectTask::gpu_variant(TaskContext& context)
{
  select_template<VariantKind::GPU>(context);
}

} // namespace cunumeric
24 changes: 24 additions & 0 deletions src/cunumeric/index/select.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,27 @@
#pragma once

#include "cunumeric/cunumeric.h"

namespace cunumeric {

// Argument bundle for the select task.
// `inputs` packs the condition arrays followed by the choice arrays
// (first half conditions, second half choices); `default_value` is the
// scalar written wherever no condition holds.
struct SelectArgs {
  const Array& out;
  const std::vector<Array>& inputs;
  const legate::Scalar& default_value;
};

// Legate task implementing np.select.  Provides CPU, OpenMP, and CUDA
// variants; each forwards to a shared dim/type-dispatched template.
class SelectTask : public CuNumericTask<SelectTask> {
 public:
  static const int TASK_ID = CUNUMERIC_SELECT;

 public:
  static void cpu_variant(legate::TaskContext& context);
#ifdef LEGATE_USE_OPENMP
  static void omp_variant(legate::TaskContext& context);
#endif
#ifdef LEGATE_USE_CUDA
  static void gpu_variant(legate::TaskContext& context);
#endif
};

} // namespace cunumeric
61 changes: 61 additions & 0 deletions src/cunumeric/index/select_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,64 @@

#include "cunumeric/index/select.h"
#include "cunumeric/index/select_template.inl"

namespace cunumeric {

using namespace legate;

template <Type::Code CODE, int DIM>
struct SelectImplBody<VariantKind::OMP, CODE, DIM> {
  using VAL = legate_type_of<CODE>;

  // OpenMP implementation of np.select: initialize `out` to `default_val`,
  // then apply condition/choice pairs from last to first so the FIRST
  // matching condition wins for each element.
  //
  // Fix vs. previous revision: the sparse-path work distribution iterated
  // `out_idx <= out_size`, whose extra iteration started at a flat index
  // equal to `volume` and read out of bounds via pitches.unflatten.
  void operator()(const AccessorWO<VAL, DIM>& out,
                  const std::vector<AccessorRO<bool, DIM>>& condlist,
                  const std::vector<AccessorRO<VAL, DIM>>& choicelist,
                  VAL default_val,
                  const Rect<DIM>& rect,
                  const Pitches<DIM - 1>& pitches,
                  bool dense) const
  {
    const size_t volume  = rect.volume();
    const size_t narrays = condlist.size();
#ifdef DEBUG_CUNUMERIC
    assert(narrays == choicelist.size());
#endif

    if (dense && DIM <= 1) {
      auto outptr = out.ptr(rect);
#pragma omp parallel for schedule(static)
      for (size_t idx = 0; idx < volume; ++idx) outptr[idx] = default_val;
      // Reverse condition order makes earlier conditions overwrite later
      // ones; `c-- > 0` also handles narrays == 0 without unsigned wrap.
      for (size_t c = narrays; c-- > 0;) {
        auto condptr   = condlist[c].ptr(rect);
        auto choiceptr = choicelist[c].ptr(rect);
#pragma omp parallel for schedule(static)
        for (size_t idx = 0; idx < volume; ++idx) {
          if (condptr[idx]) outptr[idx] = choiceptr[idx];
        }
      }
    } else {
#pragma omp parallel for schedule(static)
      for (size_t idx = 0; idx < volume; ++idx) {
        auto p = pitches.unflatten(idx, rect.lo);
        out[p] = default_val;
      }
      // Parallelize over the extent of dimension 0: each out_idx owns the
      // disjoint strided set of flat indices congruent to it mod out_size,
      // so there are no write races within one condition pass.
      const int64_t out_size = static_cast<int64_t>(rect.hi[0] - rect.lo[0] + 1);
      for (size_t c = narrays; c-- > 0;) {
#pragma omp parallel for schedule(static)
        for (int64_t out_idx = 0; out_idx < out_size; out_idx++) {
          for (int64_t idx = static_cast<int64_t>(volume) - out_size + out_idx; idx >= 0;
               idx -= out_size) {
            auto p = pitches.unflatten(idx, rect.lo);
            if (condlist[c][p]) out[p] = choicelist[c][p];
          }
        }
      }
    }
  }
};

// Entry point for the OpenMP variant of the select task; all real work is
// done by the dim/type-dispatched template shared across variants.
/*static*/ void SelectTask::omp_variant(TaskContext& context)
{
  select_template<VariantKind::OMP>(context);
}

} // namespace cunumeric
64 changes: 64 additions & 0 deletions src/cunumeric/index/select_template.inl
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,67 @@
// Useful for IDEs
#include "cunumeric/index/select.h"
#include "cunumeric/pitches.h"

namespace cunumeric {

using namespace legate;

template <VariantKind KIND, Type::Code CODE, int DIM>
struct SelectImplBody;

template <VariantKind KIND>
struct SelectImpl {
template <Type::Code CODE, int DIM>
void operator()(SelectArgs& args) const
{
using VAL = legate_type_of<CODE>;
auto out_rect = args.out.shape<DIM>();

Pitches<DIM - 1> pitches;
size_t volume = pitches.flatten(out_rect);
if (volume == 0) return;

auto out = args.out.write_accessor<VAL, DIM>(out_rect);

#ifndef LEGATE_BOUNDS_CHECKS
// Check to see if this is dense or not
bool dense = out.accessor.is_dense_row_major(out_rect);
#else
// No dense execution if we're doing bounds checks
bool dense = false;
#endif

std::vector<AccessorRO<bool, DIM>> condlist;
for (int i = 0; i < args.inputs.size() / 2; i++) {
auto rect_c = args.inputs[i].shape<DIM>();
#ifdef DEBUG_CUNUMERIC
assert(rect_c == out_rect);
#endif
condlist.push_back(args.inputs[i].read_accessor<bool, DIM>(rect_c));
dense = dense && condlist[i].accessor.is_dense_row_major(out_rect);
}

std::vector<AccessorRO<VAL, DIM>> choicelist;
for (int i = args.inputs.size() / 2; i < args.inputs.size(); i++) {
auto rect_c = args.inputs[i].shape<DIM>();
#ifdef DEBUG_CUNUMERIC
assert(rect_c == out_rect);
#endif
choicelist.push_back(args.inputs[i].read_accessor<VAL, DIM>(rect_c));
dense = dense && choicelist[i - args.inputs.size() / 2].accessor.is_dense_row_major(out_rect);
}

VAL default_value = args.default_value.value<VAL>();
SelectImplBody<KIND, CODE, DIM>()(
out, condlist, choicelist, default_value, out_rect, pitches, dense);
}
};

// Unpacks the task context (output 0 = destination, all inputs = packed
// conditions + choices, scalar 0 = default value) and dispatches on the
// output's dimension and type code.
template <VariantKind KIND>
static void select_template(TaskContext& context)
{
  SelectArgs args{context.outputs()[0], context.inputs(), context.scalars()[0]};
  double_dispatch(args.out.dim(), args.out.code(), SelectImpl<KIND>{}, args);
}

} // namespace cunumeric
Loading

0 comments on commit fa32b7a

Please sign in to comment.