From f0eef077a1ae183ad6c2a08c9780ce419a69d405 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 29 Aug 2023 13:29:36 -0700 Subject: [PATCH 01/66] adding support for adding where to unary reduction operations on the python side --- cunumeric/_ufunc/ufunc.py | 6 +---- cunumeric/array.py | 52 ++++++++++++++++++++++++--------------- cunumeric/deferred.py | 6 +++++ cunumeric/eager.py | 2 ++ cunumeric/module.py | 26 +++++++++++--------- 5 files changed, 56 insertions(+), 36 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 11800e53f..27692c383 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -688,7 +688,7 @@ def reduce( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[Any, None] = None, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ reduce(array, axis=0, dtype=None, out=None, keepdims=False, initial= ndarray: """a.all(axis=None, out=None, keepdims=False, initial=None, where=True) @@ -1796,7 +1796,7 @@ def any( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None] = None, ) -> ndarray: """a.any(axis=None, out=None, keepdims=False, initial=None, where=True) @@ -3051,7 +3051,7 @@ def max( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None] = None, ) -> ndarray: """a.max(axis=None, out=None, keepdims=False, initial=, where=True) @@ -3086,6 +3086,7 @@ def mean( dtype: Union[np.dtype[Any], None] = None, out: Union[ndarray, None] = None, keepdims: bool = False, + where: Union[ndarray, None]=None, ) -> ndarray: """a.mean(axis=None, dtype=None, out=None, keepdims=False) @@ -3120,12 +3121,14 @@ def mean( dtype=dtype, out=out, keepdims=keepdims, + where=where ) else: sum_array = self.sum( axis=axis, dtype=dtype, keepdims=keepdims, + where=where ) if axis is None: divisor = reduce(lambda x, y: x * y, self.shape, 1) @@ -3154,7 +3157,7 @@ def min( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None] = None, ) -> ndarray: """a.min(axis=None, out=None, keepdims=False, initial=, where=True) @@ -3252,7 +3255,7 @@ def prod( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None] = None, ) -> ndarray: """a.prod(axis=None, dtype=None, out=None, keepdims=False, initial=1, where=True) @@ -3622,10 +3625,10 @@ def sum( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None ] = None, ) -> ndarray: """a.sum(axis=None, dtype=None, out=None, keepdims=False, initial=0, - where=True) + where=None) Return the sum of the array elements over the given axis. @@ -4134,7 +4137,7 @@ def _perform_unary_reduction( keepdims: bool = False, args: Union[Any, None] = None, initial: Union[int, float, None] = None, - where: Union[bool, ndarray] = True, + where: Union[ndarray, None] = None, ) -> ndarray: # When 'res_dtype' is not None, the input and output of the reduction # have different types. 
Such reduction operators don't take a dtype of @@ -4156,7 +4159,12 @@ def _perform_unary_reduction( # TODO: Need to require initial to be given when the array is empty # or a where mask is given. - if isinstance(where, ndarray): + + #if not where: + # where = ndarray(shape=(1,), dtype=bool) + # where.fill(True) + + if where and isinstance(where, ndarray): # The where array has to broadcast to the src.shape if np.broadcast_shapes(src.shape, where.shape) != src.shape: raise ValueError( @@ -4191,7 +4199,7 @@ def _perform_unary_reduction( if out is None: out = ndarray( - shape=out_shape, dtype=res_dtype, inputs=(src, where) + shape=out_shape, dtype=res_dtype, inputs=(src,where) ) elif out.shape != out_shape: errmsg = ( @@ -4211,16 +4219,20 @@ def _perform_unary_reduction( ) if where: - result._thunk.unary_reduction( - op, - src._thunk, - cls._get_where_thunk(where, result.shape), - axis, - axes, - keepdims, - args, - initial, - ) + where_thunk = cls._get_where_thunk(where, result.shape) + else: + where_thunk=None + + result._thunk.unary_reduction( + op, + src._thunk, + where_thunk, + axis, + axes, + keepdims, + args, + initial, + ) if result is not out: out._thunk.convert(result._thunk) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 6a56f65db..8dce70e6d 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -3183,6 +3183,9 @@ def unary_reduction( task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) + if where: + task.add_input(where.base) + task.add_alignment(rhs_array.base, where.base) task.add_scalar_arg(op, ty.int32) task.add_scalar_arg(rhs_array.shape, (ty.int64,)) @@ -3222,6 +3225,9 @@ def unary_reduction( task.add_input(rhs_array.base) task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) + if where: + task.add_input(where.base) + task.add_alignment(rhs_array.base, where.base) task.add_scalar_arg(axis, ty.int32) task.add_scalar_arg(op, ty.int32) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 680f1b5a1..c16b6ca4a 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1499,6 +1499,8 @@ def unary_reduction( initial, ) return + if not where: + where=True if op in _UNARY_RED_OPS_WITH_ARG: fn = _UNARY_RED_OPS_WITH_ARG[op] # arg based APIs don't have the following arguments: where, initial diff --git a/cunumeric/module.py b/cunumeric/module.py index e8d933da6..beadb57e8 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4864,7 +4864,7 @@ def all( axis: Optional[Union[int, tuple[int, ...]]] = None, out: Optional[ndarray] = None, keepdims: bool = False, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ Test whether all array elements along a given axis evaluate to True. @@ -4922,7 +4922,7 @@ def any( axis: Optional[Union[int, tuple[int, ...]]] = None, out: Optional[ndarray] = None, keepdims: bool = False, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ Test whether any array element along a given axis evaluates to True. 
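The signature change repeated across these hunks is the core of the
Python-side work: `where` stops being a plain `bool` defaulting to `True` and
becomes an optional boolean ndarray defaulting to `None`, which
`_perform_unary_reduction` then validates (the mask must broadcast to
`src.shape`). A minimal usage sketch of the API this series is building, with
made-up values purely for illustration:

    import cunumeric as num

    a = num.array([[1, 2, 3], [4, 5, 6]])
    mask = num.array([[True, False, True], [False, True, False]])

    num.sum(a, where=mask)           # only 1, 3 and 5 contribute, giving 9
    num.mean(a, axis=0, where=mask)  # per-column mean over elements where mask is True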
@@ -5182,7 +5182,7 @@ def prod( out: Optional[ndarray] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ @@ -5263,7 +5263,7 @@ def sum( out: Optional[ndarray] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ @@ -5723,7 +5723,7 @@ def nanmin( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Any = True, + where: Optional[ndarray]= None, ) -> ndarray: """ Return minimum of an array or minimum along an axis, ignoring any @@ -5816,7 +5816,7 @@ def nanmax( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Any = True, + where: Optional[dnarray]=None, ) -> ndarray: """ Return the maximum of an array or maximum along an axis, ignoring any @@ -5913,7 +5913,7 @@ def nanprod( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Any = True, + where: Optional[ndarray]=None, ) -> ndarray: """ Return the product of array elements over a given axis treating @@ -6009,7 +6009,7 @@ def nansum( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Any = True, + where: Optionla[ndarray]=None, ) -> ndarray: """ Return the sum of array elements over a given axis treating @@ -6173,7 +6173,7 @@ def amax( out: Optional[ndarray] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: bool = True, + where:Optional[ndarray] = None, ) -> ndarray: """ @@ -6250,7 +6250,7 @@ def amin( out: Optional[ndarray] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: bool = True, + where: Optional[ndarray] = None, ) -> ndarray: """ @@ -7002,6 +7002,7 @@ def mean( dtype: Optional[np.dtype[Any]] = None, out: Optional[ndarray] = None, keepdims: bool = False, + where: Optional[Uniton[ndarray, tuple[bool, ...]]] = None, ) -> ndarray: """ @@ -7043,6 +7044,9 @@ def mean( sub-class' method does not implement `keepdims` any exceptions will be raised. + where : array_like of bool, optional + Elements to include in the mean. 
+ Returns ------- m : ndarray @@ -7058,7 +7062,7 @@ def mean( -------- Multiple GPUs, Multiple CPUs """ - return a.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims) + return a.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) # Histograms From a93748db98c88c12e58033cbb0d2e9a15c51541a Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 29 Aug 2023 21:44:56 -0700 Subject: [PATCH 02/66] adding support for adding where to unary reduction operations on the C++ side --- cunumeric/array.py | 4 - src/cunumeric/unary/scalar_unary_red.h | 2 + .../unary/scalar_unary_red_template.inl | 138 +++++++++++++++++- tests/integration/test_mean.py | 19 +++ 4 files changed, 155 insertions(+), 8 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 527767bee..37b9be051 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -162,10 +162,6 @@ def convert_to_predicate_ndarray(obj: Any) -> bool: # Keep all boolean types as they are if obj is True or obj is False: return obj - # GH #135 - raise NotImplementedError( - "the `where` parameter is currently not supported" - ) def maybe_convert_to_np_ndarray(obj: Any) -> Any: diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h index ef96d1bfc..856eeba65 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cunumeric/unary/scalar_unary_red.h @@ -24,6 +24,8 @@ namespace cunumeric { struct ScalarUnaryRedArgs { const Array& out; const Array& in; + const Array& where; + bool has_where; UnaryRedCode op_code; legate::DomainPoint shape; std::vector args; diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 9f073c716..8739119d4 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -28,8 +28,13 @@ namespace cunumeric { using namespace legate; -template +template struct ScalarUnaryRed { + ScalarUnaryRed(ScalarUnaryRedArgs& args){assert(false);} +}; + +template +struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, false> { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; @@ -116,6 +121,114 @@ struct ScalarUnaryRed { } }; +template +struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { + using OP = UnaryRedOp; + using LG_OP = typename OP::OP; + using LHS = typename OP::VAL; + using RHS = legate_type_of; + using OUT = AccessorRD; + using IN = AccessorRO; + using WHERE = AccessorRO; + + IN in; + const RHS* inptr; + OUT out; + size_t volume; + Pitches pitches; + Rect rect; + Point origin; + Point shape; + RHS to_find; + bool dense; + WHERE where; + const bool* whereptr; + + struct DenseReduction {}; + struct SparseReduction {}; + + ScalarUnaryRed(ScalarUnaryRedArgs& args) : dense(false) + { + rect = args.in.shape(); + origin = rect.lo; + in = args.in.read_accessor(rect); + volume = pitches.flatten(rect); + shape = args.shape; + + out = args.out.reduce_accessor(); + if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } + + if ((args.has_where==true) && (OP_CODE != UnaryRedCode::CONTAINS)) { + where = args.where.read_accessor(rect); +#ifndef LEGATE_BOUNDS_CHECKS + if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect) ) { + dense = true; + inptr = in.ptr(rect); + whereptr = where.ptr(rect); + } +#endif + } + else{ +#ifndef LEGATE_BOUNDS_CHECKS + // Check to see if this is dense or not + if (in.accessor.is_dense_row_major(rect)) { + dense = true; + inptr = 
in.ptr(rect); + } +#endif + } + + } + + __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept + { + if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { + if (inptr[idx] == to_find) { lhs = true; } + } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || + OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { + auto p = pitches.unflatten(idx, origin); + if (whereptr[idx]==true) + OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); + } else { + if (whereptr[idx]==true) + OP::template fold(lhs, OP::convert(inptr[idx], identity)); + } + } + + __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, SparseReduction) const noexcept + { + if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { + auto point = pitches.unflatten(idx, origin); + if (in[point] == to_find) { lhs = true; } + } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || + OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { + auto p = pitches.unflatten(idx, origin); + if (where[p]==true) + OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); + } else { + auto p = pitches.unflatten(idx, origin); + if (where[p]==true) + OP::template fold(lhs, OP::convert(in[p], identity)); + } + } + + void execute() const noexcept + { + auto identity = LG_OP::identity; +#ifndef LEGATE_BOUNDS_CHECKS + // The constexpr if here prevents the DenseReduction from being instantiated for GPU kernels + // which limits compile times and binary sizes. + if constexpr (KIND != VariantKind::GPU) { + // Check to see if this is dense or not + if (dense) { + return ScalarReductionPolicy()(volume, out, identity, *this); + } + } +#endif + return ScalarReductionPolicy()(volume, out, identity, *this); + } +}; + template struct ScalarUnaryRedImpl { template @@ -123,8 +236,13 @@ struct ScalarUnaryRedImpl { { // The operation is always valid for contains if constexpr (UnaryRedOp::valid || OP_CODE == UnaryRedCode::CONTAINS) { - ScalarUnaryRed red(args); - red.execute(); + if (args.has_where){ + ScalarUnaryRed red(args); + red.execute();} + else{ + ScalarUnaryRed red(args); + red.execute(); + } } } }; @@ -155,9 +273,21 @@ static void scalar_unary_red_template(TaskContext& context) shape.dim = 1; shape[0] = 1; } + + bool has_where=false; + if (inputs.size()==2){ + has_where=true; + } + if (has_where==true){ ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)}; + context.reductions()[0], inputs[0], inputs[1], has_where, op_code, shape, std::move(extra_args)}; op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + }else{ + Array where; + ScalarUnaryRedArgs args{ + context.reductions()[0], inputs[0], where, has_where, op_code, shape, std::move(extra_args)}; + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + } } } // namespace cunumeric diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index 0f9064280..7ecb3cd16 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -60,6 +60,13 @@ def test_scalar(val): assert np.array_equal(res_np, res_num) + +@pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) +def test_scalar_where(val): + res_np = np.mean(val, where=True) + res_num = num.mean(val,where=True) + assert np.array_equal(res_np, res_num) + @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_basic(size): arr_np = np.random.randint(-5, 5, size=size) @@ -69,6 +76,18 
@@ def test_basic(size): np.array_equal(res_np, res_num) +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_basic_where(size): + arr_np = np.random.randint(-5, 5, size=size) + arr_num = num.array(arr_np) + where_np = arr_np%2 + where_np = arr_np.astype(bool) + where_num = num.array(where_np) + res_np = np.mean(arr_np, where=where_np) + res_num = num.mean(arr_num, where=where_num) + np.array_equal(res_np, res_num) + + @pytest.mark.xfail @pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) def test_axis_tuple(axis): From 79c7aca840b531c61a7c4e9d7d019254f7417172 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 29 Aug 2023 22:10:47 -0700 Subject: [PATCH 03/66] formatting --- cunumeric/_ufunc/ufunc.py | 2 +- cunumeric/array.py | 28 +++---- cunumeric/eager.py | 2 +- cunumeric/module.py | 18 ++-- .../unary/scalar_unary_red_template.inl | 82 +++++++++---------- tests/integration/test_mean.py | 6 +- 6 files changed, 66 insertions(+), 72 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 27692c383..763efa6db 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -14,7 +14,7 @@ # from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Union import numpy as np from legate.core.utils import OrderedSet diff --git a/cunumeric/array.py b/cunumeric/array.py index 37b9be051..70d84f745 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -158,10 +158,11 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: return ndarray(shape=None, thunk=thunk, writeable=writeable) -def convert_to_predicate_ndarray(obj: Any) -> bool: +def convert_to_predicate_ndarray(obj: Any) -> Any: # Keep all boolean types as they are if obj is True or obj is False: return obj + return def maybe_convert_to_np_ndarray(obj: Any) -> Any: @@ -3082,7 +3083,7 @@ def mean( dtype: Union[np.dtype[Any], None] = None, out: Union[ndarray, None] = None, keepdims: bool = False, - where: Union[ndarray, None]=None, + where: Union[ndarray, None] = None, ) -> ndarray: """a.mean(axis=None, dtype=None, out=None, keepdims=False) @@ -3113,18 +3114,11 @@ def mean( # Do the sum if out is not None and out.dtype == dtype: sum_array = self.sum( - axis=axis, - dtype=dtype, - out=out, - keepdims=keepdims, - where=where + axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where ) else: sum_array = self.sum( - axis=axis, - dtype=dtype, - keepdims=keepdims, - where=where + axis=axis, dtype=dtype, keepdims=keepdims, where=where ) if axis is None: divisor = reduce(lambda x, y: x * y, self.shape, 1) @@ -3621,7 +3615,7 @@ def sum( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Union[int, float, None] = None, - where: Union[ndarray, None ] = None, + where: Union[ndarray, None] = None, ) -> ndarray: """a.sum(axis=None, dtype=None, out=None, keepdims=False, initial=0, where=None) @@ -4003,7 +3997,7 @@ def find_common_type(*args: ndarray) -> np.dtype[Any]: scalars.append(array.dtype.type(0)) else: array_types.append(array.dtype) - return np.result_type(*array_types, *scalars) + return np.find_common_type(array_types, scalar_types) def _maybe_convert(self, dtype: np.dtype[Any], hints: Any) -> ndarray: if self.dtype == dtype: @@ -4156,7 +4150,7 @@ def _perform_unary_reduction( # TODO: Need to require initial to be given when the array is empty # or a where mask is given. 
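        # A sketch of the flow being tidied here, assuming a boolean mask that
        # broadcasts to src.shape: callers such as sum() and mean() pass the
        # mask through unchanged, it becomes where_thunk a few lines below,
        # and a where of None means no mask reaches the thunk, so every
        # element participates in the reduction.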
- #if not where: + # if not where: # where = ndarray(shape=(1,), dtype=bool) # where.fill(True) @@ -4195,7 +4189,7 @@ def _perform_unary_reduction( if out is None: out = ndarray( - shape=out_shape, dtype=res_dtype, inputs=(src,where) + shape=out_shape, dtype=res_dtype, inputs=(src, where) ) elif out.shape != out_shape: errmsg = ( @@ -4217,8 +4211,8 @@ def _perform_unary_reduction( if where: where_thunk = cls._get_where_thunk(where, result.shape) else: - where_thunk=None - + where_thunk = None + result._thunk.unary_reduction( op, src._thunk, diff --git a/cunumeric/eager.py b/cunumeric/eager.py index c16b6ca4a..d7934d04e 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1500,7 +1500,7 @@ def unary_reduction( ) return if not where: - where=True + where = True if op in _UNARY_RED_OPS_WITH_ARG: fn = _UNARY_RED_OPS_WITH_ARG[op] # arg based APIs don't have the following arguments: where, initial diff --git a/cunumeric/module.py b/cunumeric/module.py index beadb57e8..f8e60266b 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5723,7 +5723,7 @@ def nanmin( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Optional[ndarray]= None, + where: Optional[ndarray] = None, ) -> ndarray: """ Return minimum of an array or minimum along an axis, ignoring any @@ -5816,7 +5816,7 @@ def nanmax( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Optional[dnarray]=None, + where: Optional[ndarray] = None, ) -> ndarray: """ Return the maximum of an array or maximum along an axis, ignoring any @@ -5913,7 +5913,7 @@ def nanprod( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Optional[ndarray]=None, + where: Optional[ndarray] = None, ) -> ndarray: """ Return the product of array elements over a given axis treating @@ -6009,7 +6009,7 @@ def nansum( out: Union[ndarray, None] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where: Optionla[ndarray]=None, + where: Optional[ndarray] = None, ) -> ndarray: """ Return the sum of array elements over a given axis treating @@ -6173,7 +6173,7 @@ def amax( out: Optional[ndarray] = None, keepdims: bool = False, initial: Optional[Union[int, float]] = None, - where:Optional[ndarray] = None, + where: Optional[ndarray] = None, ) -> ndarray: """ @@ -7002,7 +7002,7 @@ def mean( dtype: Optional[np.dtype[Any]] = None, out: Optional[ndarray] = None, keepdims: bool = False, - where: Optional[Uniton[ndarray, tuple[bool, ...]]] = None, + where: Optional[ndarray] = None, ) -> ndarray: """ @@ -7045,7 +7045,7 @@ def mean( exceptions will be raised. where : array_like of bool, optional - Elements to include in the mean. + Elements to include in the mean. 
Returns ------- @@ -7062,7 +7062,9 @@ def mean( -------- Multiple GPUs, Multiple CPUs """ - return a.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) + return a.mean( + axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where + ) # Histograms diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 8739119d4..710eb334c 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -30,11 +30,11 @@ using namespace legate; template struct ScalarUnaryRed { - ScalarUnaryRed(ScalarUnaryRedArgs& args){assert(false);} + ScalarUnaryRed(ScalarUnaryRedArgs& args) { assert(false); } }; template -struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, false> { +struct ScalarUnaryRed { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; @@ -122,7 +122,7 @@ struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, false> { }; template -struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { +struct ScalarUnaryRed { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; @@ -158,26 +158,24 @@ struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { out = args.out.reduce_accessor(); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } - if ((args.has_where==true) && (OP_CODE != UnaryRedCode::CONTAINS)) { - where = args.where.read_accessor(rect); + if ((args.has_where == true) && (OP_CODE != UnaryRedCode::CONTAINS)) { + where = args.where.read_accessor(rect); #ifndef LEGATE_BOUNDS_CHECKS - if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect) ) { - dense = true; - inptr = in.ptr(rect); - whereptr = where.ptr(rect); - } + if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect)) { + dense = true; + inptr = in.ptr(rect); + whereptr = where.ptr(rect); + } #endif - } - else{ + } else { #ifndef LEGATE_BOUNDS_CHECKS - // Check to see if this is dense or not - if (in.accessor.is_dense_row_major(rect)) { - dense = true; - inptr = in.ptr(rect); - } + // Check to see if this is dense or not + if (in.accessor.is_dense_row_major(rect)) { + dense = true; + inptr = in.ptr(rect); + } #endif } - } __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept @@ -187,11 +185,10 @@ struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); - if (whereptr[idx]==true) - OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); + if (whereptr[idx] == true) + OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); } else { - if (whereptr[idx]==true) - OP::template fold(lhs, OP::convert(inptr[idx], identity)); + if (whereptr[idx] == true) OP::template fold(lhs, OP::convert(inptr[idx], identity)); } } @@ -203,12 +200,10 @@ struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); - if (where[p]==true) - OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); + if (where[p] == true) OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); } else { auto p = 
pitches.unflatten(idx, origin); - if (where[p]==true) - OP::template fold(lhs, OP::convert(in[p], identity)); + if (where[p] == true) OP::template fold(lhs, OP::convert(in[p], identity)); } } @@ -221,7 +216,7 @@ struct ScalarUnaryRed< KIND, OP_CODE, CODE, DIM, true> { if constexpr (KIND != VariantKind::GPU) { // Check to see if this is dense or not if (dense) { - return ScalarReductionPolicy()(volume, out, identity, *this); + return ScalarReductionPolicy()(volume, out, identity, *this); } } #endif @@ -236,10 +231,10 @@ struct ScalarUnaryRedImpl { { // The operation is always valid for contains if constexpr (UnaryRedOp::valid || OP_CODE == UnaryRedCode::CONTAINS) { - if (args.has_where){ - ScalarUnaryRed red(args); - red.execute();} - else{ + if (args.has_where) { + ScalarUnaryRed red(args); + red.execute(); + } else { ScalarUnaryRed red(args); red.execute(); } @@ -274,19 +269,22 @@ static void scalar_unary_red_template(TaskContext& context) shape[0] = 1; } - bool has_where=false; - if (inputs.size()==2){ - has_where=true; - } - if (has_where==true){ - ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], inputs[1], has_where, op_code, shape, std::move(extra_args)}; - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); - }else{ + bool has_where = false; + if (inputs.size() == 2) { has_where = true; } + if (has_where == true) { + ScalarUnaryRedArgs args{context.reductions()[0], + inputs[0], + inputs[1], + has_where, + op_code, + shape, + std::move(extra_args)}; + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + } else { Array where; ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], where, has_where, op_code, shape, std::move(extra_args)}; - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + context.reductions()[0], inputs[0], where, has_where, op_code, shape, std::move(extra_args)}; + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); } } diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index 7ecb3cd16..68816cac6 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -60,13 +60,13 @@ def test_scalar(val): assert np.array_equal(res_np, res_num) - @pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) def test_scalar_where(val): res_np = np.mean(val, where=True) - res_num = num.mean(val,where=True) + res_num = num.mean(val, where=True) assert np.array_equal(res_np, res_num) + @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_basic(size): arr_np = np.random.randint(-5, 5, size=size) @@ -80,7 +80,7 @@ def test_basic(size): def test_basic_where(size): arr_np = np.random.randint(-5, 5, size=size) arr_num = num.array(arr_np) - where_np = arr_np%2 + where_np = arr_np % 2 where_np = arr_np.astype(bool) where_num = num.array(where_np) res_np = np.mean(arr_np, where=where_np) From a64e82978aaf358dba567d3bd0f9b6697f0b3bf1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 30 Aug 2023 14:50:59 -0700 Subject: [PATCH 04/66] fixing error when calculation divisor --- cunumeric/array.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 70d84f745..17ac848fd 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3121,9 +3121,15 @@ def mean( axis=axis, dtype=dtype, keepdims=keepdims, where=where ) if axis is None: - divisor = reduce(lambda x, y: x * y, self.shape, 1) + if where: + divisor = np.sum(where) + else: + divisor = reduce(lambda x, y: x * y, self.shape, 1) else: - divisor = self.shape[axis] + if 
where: + divisor = np.sum(where, axis=axis) + else: + divisor = self.shape[axis] # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype if dtype.kind == "f" or dtype.kind == "c": From 6f7b0592924ebba74a13fd51ecbd1fe699f9b71c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 30 Aug 2023 22:33:04 -0700 Subject: [PATCH 05/66] fixing error with mean(where) --- cunumeric/array.py | 59 ++++++++++++--- cunumeric/deferred.py | 14 ++-- cunumeric/module.py | 74 ++++++++++++++++++- .../unary/scalar_unary_red_template.inl | 12 +-- 4 files changed, 137 insertions(+), 22 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 17ac848fd..855da8ed4 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -161,8 +161,12 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: def convert_to_predicate_ndarray(obj: Any) -> Any: # Keep all boolean types as they are if obj is True or obj is False: - return obj - return + where = ndarray(shape=(), dtype=bool) + where.fill(obj) + return where + if obj is None: + return None + return convert_to_cunumeric_ndarray(obj) def maybe_convert_to_np_ndarray(obj: Any) -> Any: @@ -3121,13 +3125,14 @@ def mean( axis=axis, dtype=dtype, keepdims=keepdims, where=where ) if axis is None: - if where: - divisor = np.sum(where) + if where is not None: + divisor = np.sum(where.astype(int)) + else: divisor = reduce(lambda x, y: x * y, self.shape, 1) else: - if where: - divisor = np.sum(where, axis=axis) + if where is not None: + divisor = np.sum(where.astype(int), axis=axis) else: divisor = self.shape[axis] # Divide by the number of things in the collapsed dimensions @@ -3146,6 +3151,40 @@ def mean( else: return sum_array + @add_boilerplate() + def nanmean( + self, + axis: Any = None, + dtype: Union[np.dtype[Any], None] = None, + out: Union[ndarray, None] = None, + keepdims: bool = False, + where: Union[ndarray, None] = None, + ) -> ndarray: + """a.mean(axis=None, dtype=None, out=None, keepdims=False) + + Returns the average of the array elements along given axis, + ignoring NaN entires + + Refer to :func:`cunumeric.mean` for full documentation. 
+ + See Also + -------- + cunumeric.nanmean : equivalent function + + Availability + -------- + Multiple GPUs, Multiple CPUs + + """ + # nan_where = self != Nan + # if where is not None: + # new_where = where and nan_where + # else: + # new_where = nan_where + new_where = where + + return self.mean(axis, dtype, out, keepdims, new_where) + @add_boilerplate() def min( self, @@ -4160,7 +4199,7 @@ def _perform_unary_reduction( # where = ndarray(shape=(1,), dtype=bool) # where.fill(True) - if where and isinstance(where, ndarray): + if where is not None and isinstance(where, ndarray): # The where array has to broadcast to the src.shape if np.broadcast_shapes(src.shape, where.shape) != src.shape: raise ValueError( @@ -4214,8 +4253,10 @@ def _perform_unary_reduction( shape=out_shape, dtype=res_dtype, inputs=(src, where) ) - if where: - where_thunk = cls._get_where_thunk(where, result.shape) + if where is not None: + where_thunk = ( + where._thunk + ) # cls._get_where_thunk(where, result.shape) else: where_thunk = None diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 8dce70e6d..7f563396b 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -3183,11 +3183,12 @@ def unary_reduction( task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) - if where: - task.add_input(where.base) - task.add_alignment(rhs_array.base, where.base) task.add_scalar_arg(op, ty.int32) task.add_scalar_arg(rhs_array.shape, (ty.int64,)) + task.add_scalar_arg((where is not None), ty.bool_) + if where is not None: + task.add_input(where.base) + task.add_alignment(rhs_array.base, where.base) self.add_arguments(task, args) @@ -3225,11 +3226,12 @@ def unary_reduction( task.add_input(rhs_array.base) task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) - if where: - task.add_input(where.base) - task.add_alignment(rhs_array.base, where.base) task.add_scalar_arg(axis, ty.int32) task.add_scalar_arg(op, ty.int32) + task.add_scalar_arg((where is not None), ty.bool_) + if where is not None: + task.add_input(where.base) + task.add_alignment(rhs_array.base, where.base) self.add_arguments(task, args) diff --git a/cunumeric/module.py b/cunumeric/module.py index f8e60266b..d738498af 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6995,7 +6995,7 @@ def count_nonzero( # Averages and variances -@add_boilerplate("a") +@add_boilerplate("a", "where") def mean( a: ndarray, axis: Optional[Union[int, tuple[int, ...]]] = None, @@ -7067,6 +7067,78 @@ def mean( ) +@add_boilerplate("a") +def nanmean( + a: ndarray, + axis: Optional[Union[int, tuple[int, ...]]] = None, + dtype: Optional[np.dtype[Any]] = None, + out: Optional[ndarray] = None, + keepdims: bool = False, + where: Optional[ndarray] = None, +) -> ndarray: + """ + + Compute the arithmetic mean along the specified axis, ignoring NaNs. + + Returns the average of the array elements. The average is taken over + the flattened array by default, otherwise over the specified axis. + `float64` intermediate and return values are used for integer inputs. + + Parameters + ---------- + a : array_like + Array containing numbers whose mean is desired. If `a` is not an + array, a conversion is attempted. + axis : None or int or tuple[int], optional + Axis or axes along which the means are computed. The default is to + compute the mean of the flattened array. + + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. 
+    dtype : data-type, optional
+        Type to use in computing the mean. For integer inputs, the default
+        is `float64`; for floating point inputs, it is the same as the
+        input dtype.
+    out : ndarray, optional
+        Alternate output array in which to place the result. The default
+        is ``None``; if provided, it must have the same shape as the
+        expected output, but the type will be cast if necessary.
+        See `ufuncs-output-type` for more details.
+
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the input array.
+
+        If the default value is passed, then `keepdims` will not be
+        passed through to the `mean` method of sub-classes of
+        `ndarray`, however any non-default value will be. If the
+        sub-class' method does not implement `keepdims` any
+        exceptions will be raised.
+
+    where : array_like of bool, optional
+        Elements to include in the mean.
+
+    Returns
+    -------
+    m : ndarray
+        If `out=None`, returns a new array of the same dtype as described
+        above, containing the mean values; otherwise a reference to the
+        output array is returned.
+
+    See Also
+    --------
+    numpy.nanmean
+
+    Availability
+    --------
+    Multiple GPUs, Multiple CPUs
+    """
+    return a.nanmean(
+        axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where
+    )
+
+
 # Histograms

diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index 710eb334c..034172ba6 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -258,19 +258,19 @@ static void scalar_unary_red_template(TaskContext& context)
   auto& inputs  = context.inputs();
   auto& scalars = context.scalars();
 
+  auto op_code     = scalars[0].value<UnaryRedCode>();
+  auto shape       = scalars[1].value<DomainPoint>();
+  bool has_where   = scalars[2].value<bool>();
+  size_t start_idx = has_where ? 
2 : 1; std::vector extra_args; - for (size_t idx = 1; idx < inputs.size(); ++idx) extra_args.push_back(std::move(inputs[idx])); - - auto op_code = scalars[0].value(); - auto shape = scalars[1].value(); + for (size_t idx = start_idx; idx < inputs.size(); ++idx) + extra_args.push_back(std::move(inputs[idx])); // If the RHS was a scalar, use (1,) as the shape if (shape.dim == 0) { shape.dim = 1; shape[0] = 1; } - bool has_where = false; - if (inputs.size() == 2) { has_where = true; } if (has_where == true) { ScalarUnaryRedArgs args{context.reductions()[0], inputs[0], From d12fe55f461feb3c19781d5a414fd9bead11dd47 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 1 Sep 2023 09:10:21 -0700 Subject: [PATCH 06/66] updating test --- cunumeric/array.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 855da8ed4..4659cd63e 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -161,9 +161,10 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: def convert_to_predicate_ndarray(obj: Any) -> Any: # Keep all boolean types as they are if obj is True or obj is False: - where = ndarray(shape=(), dtype=bool) - where.fill(obj) - return where + # where = ndarray(shape=(), dtype=bool) + # where.fill(obj) + # return where + return obj if obj is None: return None return convert_to_cunumeric_ndarray(obj) @@ -3104,7 +3105,7 @@ def mean( Multiple GPUs, Multiple CPUs """ - if axis is not None and not isinstance(axis, int): + if axis is not None and type(axis) != int: raise NotImplementedError( "cunumeric.mean only supports int types for " "'axis' currently" @@ -3115,6 +3116,13 @@ def mean( dtype = np.dtype(np.float64) else: dtype = self.dtype + + # if self.size == 1: + # if where is True: + # return self.copy() + # if where is False: + # return self.copy() / 0 + # Do the sum if out is not None and out.dtype == dtype: sum_array = self.sum( @@ -3124,12 +3132,14 @@ def mean( sum_array = self.sum( axis=axis, dtype=dtype, keepdims=keepdims, where=where ) + if axis is None: if where is not None: divisor = np.sum(where.astype(int)) else: divisor = reduce(lambda x, y: x * y, self.shape, 1) + else: if where is not None: divisor = np.sum(where.astype(int), axis=axis) @@ -4012,8 +4022,8 @@ def _get_where_thunk( ) -> Union[Literal[True], NumPyThunk]: if where is True: return True - if where is False: - raise RuntimeError("should have caught this earlier") + # if where is False: + # raise RuntimeError("should have caught this earlier") if not isinstance(where, ndarray) or where.dtype != np.bool_: raise RuntimeError("should have converted this earlier") if where.shape != out_shape: @@ -4254,9 +4264,7 @@ def _perform_unary_reduction( ) if where is not None: - where_thunk = ( - where._thunk - ) # cls._get_where_thunk(where, result.shape) + where_thunk = cls._get_where_thunk(where, src.shape) else: where_thunk = None From ec41b527a6929ded12683d314697378b6e7ce4a0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 1 Sep 2023 11:22:53 -0700 Subject: [PATCH 07/66] adding logic for where in unary_red on C++ --- cunumeric/array.py | 10 +- src/cunumeric/unary/unary_red.cc | 23 ++++ src/cunumeric/unary/unary_red.cu | 124 +++++++++++++++++++++ src/cunumeric/unary/unary_red.h | 1 + src/cunumeric/unary/unary_red_omp.cc | 28 +++++ src/cunumeric/unary/unary_red_template.inl | 41 +++++-- 6 files changed, 213 insertions(+), 14 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 
4659cd63e..d95e2e97e 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3117,11 +3117,11 @@ def mean( else: dtype = self.dtype - # if self.size == 1: - # if where is True: - # return self.copy() - # if where is False: - # return self.copy() / 0 + if self.size == 1: + if where is True: + return self.copy() # type: ignore + if where is False: + return self.copy() / 0 # type: ignore # Do the sum if out is not None and out.dtype == dtype: diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index bec85fc6f..34af69434 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -42,6 +42,29 @@ struct UnaryRedImplBody { } }; +template +struct UnaryRedImplBodyWhere { + using OP = UnaryRedOp; + using LG_OP = typename OP::OP; + using RHS = legate_type_of; + + void operator()(AccessorRD lhs, + AccessorRO rhs, + AccessorRO where, + const Rect& rect, + const Pitches& pitches, + int collapsed_dim, + size_t volume) const + { + for (size_t idx = 0; idx < volume; ++idx) { + auto point = pitches.unflatten(idx, rect.lo); + auto identity = LG_OP::identity; + if (where[point] == true) + lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + } + } +}; + /*static*/ void UnaryRedTask::cpu_variant(TaskContext& context) { unary_red_template(context); diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index b417b3638..708d21a17 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -279,6 +279,86 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, return point; } +template +static __device__ __forceinline__ Point local_reduce_where(LHS& result, + AccessorRO in, + AccessorRO where, + LHS identity, + const ThreadBlocks& blocks, + const Rect& domain, + int32_t collapsed_dim) +{ + const coord_t tid = threadIdx.x; + const coord_t bid = blockIdx.x; + + Point point = blocks.point(bid, tid, domain.lo); + if (!domain.contains(point)) return point; + + while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { + if (where[point] == true) { + LHS value = OP::convert(point, collapsed_dim, identity, in[point]); + REDOP::template fold(result, value); + } + blocks.next_point(point); + } + +#if __CUDA_ARCH__ >= 700 + // If we're collapsing the innermost dimension, we perform some optimization + // with shared memory to reduce memory traffic due to atomic updates + if (collapsed_dim == DIM - 1) { + __shared__ uint8_t shmem[THREADS_PER_BLOCK * sizeof(LHS)]; + LHS* trampoline = reinterpret_cast(shmem); + // Check for the case where all the threads in the same warp have + // the same x value in which case they're all going to conflict + // so instead we do a warp-level reduction so just one thread ends + // up doing the full atomic + coord_t bucket = 0; + for (int32_t dim = DIM - 2; dim >= 0; --dim) + bucket = bucket * (domain.hi[dim] - domain.lo[dim] + 1) + point[dim] - domain.lo[dim]; + + const uint32_t same_mask = __match_any_sync(0xffffffff, bucket); + int32_t laneid; + asm volatile("mov.s32 %0, %laneid;" : "=r"(laneid)); + const uint32_t active_mask = __ballot_sync(0xffffffff, same_mask - (1 << laneid)); + if ((active_mask & (1 << laneid)) != 0) { + // Store our data into shared + trampoline[tid] = result; + // Make sure all the threads in the warp are done writing + __syncwarp(active_mask); + // Have the lowest thread in each mask pull in the values + int32_t lowest_index = -1; + for (int32_t i = 0; i < warpSize; i++) + if (same_mask & (1 << i)) { + if 
(lowest_index == -1) { + if (i != laneid) { + // We're not the lowest thread in the warp for + // this value so we're done, set the value back + // to identity to ensure that we don't try to + // perform the reduction out to memory + result = identity; + break; + } else // Make sure we don't do this test again + lowest_index = i; + // It was already our value, so just keep going + } else { + // Pull in the value from shared memory + const int32_t index = tid + i - laneid; + REDOP::template fold(result, trampoline[index]); + } + } + } + } +#endif + +#ifdef LEGATE_BOUNDS_CHECKS + // Note: this isn't necessary because we know that the affine transformation on the output + // accessor will ignore coordinates of the collapsed dimension. However, Legion's bounds checks + // want the accessor to honor the sub-rectangle passed when it was created, so we need to + // put points back in the bounds to appease the checks. + point[collapsed_dim] = domain.lo[collapsed_dim]; +#endif + return point; +} template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_with_rd_acc(AccessorRD out, @@ -294,6 +374,22 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (result != identity) out.reduce(point, result); } +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + reduce_with_rd_acc_where(AccessorRD out, + AccessorRO in, + AccessorRO where, + LHS identity, + ThreadBlocks blocks, + Rect domain, + int32_t collapsed_dim) +{ + auto result = identity; + auto point = local_reduce_where( + result, in, where, identity, blocks, domain, collapsed_dim); + if (result != identity) out.reduce(point, result); +} + template struct UnaryRedImplBody { using OP = UnaryRedOp; @@ -321,6 +417,34 @@ struct UnaryRedImplBody { } }; +template +struct UnaryRedImplBodyWhere { + using OP = UnaryRedOp; + using LG_OP = typename OP::OP; + using RHS = legate_type_of; + using LHS = typename OP::VAL; + + void operator()(AccessorRD lhs, + AccessorRO rhs, + AccessorRO where, + const Rect& rect, + const Pitches& pitches, + int collapsed_dim, + size_t volume) const + { + auto Kernel = reduce_with_rd_acc_where; + auto stream = get_cached_stream(); + + ThreadBlocks blocks; + blocks.initialize(rect, collapsed_dim); + + blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); + Kernel<<>>( + lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); + CHECK_CUDA_STREAM(stream); + } +}; + /*static*/ void UnaryRedTask::gpu_variant(TaskContext& context) { unary_red_template(context); diff --git a/src/cunumeric/unary/unary_red.h b/src/cunumeric/unary/unary_red.h index bea848468..a7b44584f 100644 --- a/src/cunumeric/unary/unary_red.h +++ b/src/cunumeric/unary/unary_red.h @@ -24,6 +24,7 @@ namespace cunumeric { struct UnaryRedArgs { const Array& lhs; const Array& rhs; + const Array& where; int32_t collapsed_dim; UnaryRedCode op_code; }; diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index 21ba49d4a..ab1e071d8 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -98,6 +98,34 @@ struct UnaryRedImplBody { } }; +template +struct UnaryRedImplBodyWhere { + using OP = UnaryRedOp; + using LG_OP = typename OP::OP; + using RHS = legate_type_of; + + void operator()(AccessorRD lhs, + AccessorRO rhs, + AccessorRO where, + const Rect& rect, + const Pitches& pitches, + int collapsed_dim, + size_t volume) const + { + Splitter splitter; + auto split = splitter.split(rect, 
collapsed_dim); + +#pragma omp parallel for schedule(static) + for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) + for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { + auto point = splitter.combine(o_idx, i_idx, rect.lo); + auto identity = LG_OP::identity; + if (where[point] == true) + lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + } + } +}; + /*static*/ void UnaryRedTask::omp_variant(TaskContext& context) { unary_red_template(context); diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 1e3b298d3..3c77fdd50 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -30,7 +30,10 @@ using namespace legate; template struct UnaryRedImplBody; -template +template +struct UnaryRedImplBodyWhere; + +template struct UnaryRedImpl { template (rect); auto lhs = args.lhs.reduce_accessor(rect); - UnaryRedImplBody()( - lhs, rhs, rect, pitches, args.collapsed_dim, volume); + + if constexpr (HAS_WHERE) { + auto where = args.where.read_accessor(rect); + UnaryRedImplBodyWhere()( + lhs, rhs, where, rect, pitches, args.collapsed_dim, volume); + } else { + UnaryRedImplBody()( + lhs, rhs, rect, pitches, args.collapsed_dim, volume); + } } template +template struct UnaryRedDispatch { template void operator()(UnaryRedArgs& args) const { auto dim = std::max(1, args.rhs.dim()); - return double_dispatch(dim, args.rhs.code(), UnaryRedImpl{}, args); + return double_dispatch(dim, args.rhs.code(), UnaryRedImpl{}, args); } }; @@ -78,10 +88,23 @@ static void unary_red_template(TaskContext& context) auto& inputs = context.inputs(); auto& reductions = context.reductions(); auto& scalars = context.scalars(); - - UnaryRedArgs args{ - reductions[0], inputs[0], scalars[0].value(), scalars[1].value()}; - op_dispatch(args.op_code, UnaryRedDispatch{}, args); + bool has_where = scalars[2].value(); + if (has_where) { + UnaryRedArgs args{reductions[0], + inputs[0], + inputs[1], + scalars[0].value(), + scalars[1].value()}; + op_dispatch(args.op_code, UnaryRedDispatch{}, args); + } else { + Array where; + UnaryRedArgs args{reductions[0], + inputs[0], + where, + scalars[0].value(), + scalars[1].value()}; + op_dispatch(args.op_code, UnaryRedDispatch{}, args); + } } } // namespace cunumeric From 91285302fb370b5a60160a665e7d0598a4bcc354 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Sat, 2 Sep 2023 10:59:08 -0700 Subject: [PATCH 08/66] towards improving data usage by mean --- cunumeric/array.py | 29 +++++++++++++++++++---------- cunumeric/module.py | 9 +-------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index d95e2e97e..570f6f2ab 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3081,6 +3081,16 @@ def max( where=where, ) + def _count_nonzero(self, axis: Any = None) -> Union[int, ndarray]: + if self.size == 0: + return 0 + return ndarray._perform_unary_reduction( + UnaryRedCode.COUNT_NONZERO, + self, + res_dtype=np.dtype(np.uint64), + axis=axis, + ) + @add_boilerplate() def mean( self, @@ -3119,40 +3129,39 @@ def mean( if self.size == 1: if where is True: - return self.copy() # type: ignore + return self.astype(dtype) # type: ignore if where is False: - return self.copy() / 0 # type: ignore + return self.astype(dtype) / 0 # type: ignore # Do the sum if out is not None and out.dtype == dtype: sum_array = self.sum( - axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where + axis=axis, out=out, keepdims=keepdims, 
where=where ) else: - sum_array = self.sum( - axis=axis, dtype=dtype, keepdims=keepdims, where=where - ) + sum_array = self.sum(axis=axis, keepdims=keepdims, where=where) if axis is None: if where is not None: - divisor = np.sum(where.astype(int)) + divisor = where._count_nonzero() else: divisor = reduce(lambda x, y: x * y, self.shape, 1) else: if where is not None: - divisor = np.sum(where.astype(int), axis=axis) + divisor = where._count_nonzero(axis=axis) else: divisor = self.shape[axis] # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype + sum_array = sum_array.astype(dtype) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( - np.array(divisor, dtype=sum_array.dtype), + np.array(divisor, dtype=dtype), ) else: - sum_array.__ifloordiv__(np.array(divisor, dtype=sum_array.dtype)) + sum_array.__ifloordiv__(np.array(divisor, dtype=dtype)) # Convert to the output we didn't already put it there if out is not None and sum_array is not out: assert out.dtype != sum_array.dtype diff --git a/cunumeric/module.py b/cunumeric/module.py index d738498af..f66c1bec4 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6978,14 +6978,7 @@ def count_nonzero( -------- Multiple GPUs, Multiple CPUs """ - if a.size == 0: - return 0 - return ndarray._perform_unary_reduction( - UnaryRedCode.COUNT_NONZERO, - a, - res_dtype=np.dtype(np.uint64), - axis=axis, - ) + return a._count_nonzero(axis) ############ From 76dc871c2cbf476543be9fdc3aa1062b8c64cdb8 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Sun, 3 Sep 2023 09:51:27 -0700 Subject: [PATCH 09/66] debugging --- cunumeric/deferred.py | 15 +++++++++++---- tests/integration/test_mean.py | 28 +++++++++++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 7f563396b..7ae1edc3f 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -3158,6 +3158,9 @@ def unary_reduction( inputs=[self], ) + is_where = bool(where is not None) + if is_where: + print("IRINA DEBUG inside cunumeric", where.shape, is_where) # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or lhs_array.ndim == rhs_array.ndim - ( @@ -3181,12 +3184,15 @@ def unary_reduction( CuNumericOpCode.SCALAR_UNARY_RED ) + print( + "IRINA DEBUG inside scalar unary_red", lhs.shape, lhs.size + ) task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) task.add_scalar_arg(op, ty.int32) task.add_scalar_arg(rhs_array.shape, (ty.int64,)) - task.add_scalar_arg((where is not None), ty.bool_) - if where is not None: + task.add_scalar_arg(is_where, ty.bool_) + if is_where: task.add_input(where.base) task.add_alignment(rhs_array.base, where.base) @@ -3223,13 +3229,14 @@ def unary_reduction( with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}): task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED) + print("IRINA DEBUG inside unary_red", result.shape) task.add_input(rhs_array.base) task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_scalar_arg(axis, ty.int32) task.add_scalar_arg(op, ty.int32) - task.add_scalar_arg((where is not None), ty.bool_) - if where is not None: + task.add_scalar_arg(is_where, ty.bool_) + if is_where: task.add_input(where.base) task.add_alignment(rhs_array.base, where.base) diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index 68816cac6..ec12e86f7 100755 --- 
a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -60,11 +60,11 @@ def test_scalar(val): assert np.array_equal(res_np, res_num) -@pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) -def test_scalar_where(val): - res_np = np.mean(val, where=True) - res_num = num.mean(val, where=True) - assert np.array_equal(res_np, res_num) +# @pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) +# def test_scalar_where(val): +# res_np = np.mean(val, where=True) +# res_num = num.mean(val, where=True) +# assert np.array_equal(res_np, res_num) @pytest.mark.parametrize("size", NO_EMPTY_SIZE) @@ -73,7 +73,7 @@ def test_basic(size): arr_num = num.array(arr_np) res_np = np.mean(arr_np) res_num = num.mean(arr_num) - np.array_equal(res_np, res_num) + assert np.array_equal(res_np, res_num) @pytest.mark.parametrize("size", NO_EMPTY_SIZE) @@ -85,7 +85,7 @@ def test_basic_where(size): where_num = num.array(where_np) res_np = np.mean(arr_np, where=where_np) res_num = num.mean(arr_num, where=where_num) - np.array_equal(res_np, res_num) + assert np.array_equal(res_np, res_num, equal_nan=True) @pytest.mark.xfail @@ -113,6 +113,20 @@ def test_axis_keepdims(size, keepdims): assert np.array_equal(out_np, out_num) +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_axis_where(size): + arr_np = np.random.randint(-5, 5, size=size) + arr_num = num.array(arr_np) + where_np = arr_np % 2 + where_np = arr_np.astype(bool) + where_num = num.array(where_np) + ndim = arr_np.ndim + for axis in range(-ndim, ndim): + out_np = np.mean(arr_np, axis=axis, where=where_np) + out_num = num.mean(arr_num, axis=axis, where=where_num) + assert np.array_equal(out_np, out_num, equal_nan=True) + + @pytest.mark.parametrize("array_dt", (np.int32, np.float32, np.complex64)) @pytest.mark.parametrize("dt", (np.int32, np.float32, np.complex64)) @pytest.mark.parametrize("size", NO_EMPTY_SIZE) From cbe93d91afd5f2dccf013d8976df03b86edc8c29 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Sun, 3 Sep 2023 10:06:48 -0700 Subject: [PATCH 10/66] removing debug output --- cunumeric/deferred.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 7ae1edc3f..bb4744653 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -3159,8 +3159,6 @@ def unary_reduction( ) is_where = bool(where is not None) - if is_where: - print("IRINA DEBUG inside cunumeric", where.shape, is_where) # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or lhs_array.ndim == rhs_array.ndim - ( @@ -3184,9 +3182,6 @@ def unary_reduction( CuNumericOpCode.SCALAR_UNARY_RED ) - print( - "IRINA DEBUG inside scalar unary_red", lhs.shape, lhs.size - ) task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) task.add_scalar_arg(op, ty.int32) @@ -3229,7 +3224,6 @@ def unary_reduction( with Annotation({"OpCode": op.name, "ArgRed?": str(argred)}): task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED) - print("IRINA DEBUG inside unary_red", result.shape) task.add_input(rhs_array.base) task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) From 419a5afc9c6ad2a2a5dc2cf1cea003a94ab3ead3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 10 Oct 2023 17:23:11 -0700 Subject: [PATCH 11/66] adding support for broadcasting where + initial implementation of nanmean --- cunumeric/array.py | 73 +++++++++++++++++++++------------- tests/integration/test_mean.py | 21 +++++++--- 2 files changed, 62 insertions(+), 32 
deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 570f6f2ab..e1442be29 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -191,6 +191,19 @@ def check_writeable(arr: Union[ndarray, tuple[ndarray, ...], None]) -> None: if any(not arr.flags.writeable for arr in check_list): raise ValueError("array is not writeable") +def broadcast_where(where:Union[ndarray, None], shape:NdShape )-> ndarray: + if where is not None: + where_array = convert_to_cunumeric_ndarray(where) + else: + where_array = None + + if where_array is not None and np.ndim(where_array)!=0 and where_array.shape != shape: + where_array = ndarray( + shape=shape, + thunk=where_array._thunk.broadcast_to(shape), + writeable=False, + ) + return where_array class flagsobj: """ @@ -857,7 +870,7 @@ def __bool__(self) -> bool: def __complex__(self) -> complex: """a.__complex__(/)""" return complex(self.__array__()) - + def __contains__(self, item: Any) -> ndarray: """a.__contains__(key, /) @@ -3126,31 +3139,26 @@ def mean( dtype = np.dtype(np.float64) else: dtype = self.dtype - - if self.size == 1: - if where is True: - return self.astype(dtype) # type: ignore - if where is False: - return self.astype(dtype) / 0 # type: ignore - + + where_array = broadcast_where(where, self.shape) # Do the sum if out is not None and out.dtype == dtype: sum_array = self.sum( - axis=axis, out=out, keepdims=keepdims, where=where + axis=axis, out=out, keepdims=keepdims, where=where_array ) else: - sum_array = self.sum(axis=axis, keepdims=keepdims, where=where) + sum_array = self.sum(axis=axis, keepdims=keepdims, where=where_array) if axis is None: - if where is not None: - divisor = where._count_nonzero() + if where_array is not None: + divisor = where_array._count_nonzero() else: divisor = reduce(lambda x, y: x * y, self.shape, 1) else: - if where is not None: - divisor = where._count_nonzero(axis=axis) + if where_array is not None: + divisor = where_array._count_nonzero(axis=axis) else: divisor = self.shape[axis] # Divide by the number of things in the collapsed dimensions @@ -3195,14 +3203,30 @@ def nanmean( Multiple GPUs, Multiple CPUs """ - # nan_where = self != Nan - # if where is not None: - # new_where = where and nan_where - # else: - # new_where = nan_where - new_where = where + if dtype is None: + dtype = a.dtype + + if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_): + return a.mean(axis, dtype, out, keepdims, where=where) + + nan_mask = _ufunc.bit_twiddling.bitwise_not(_ufunc.floating.isnan(a)) + normalizer = nan_mask.sum( axis=axis, dtype=np.int32, keepdims=keepdims, where=where) + #normalizer = normalizer.astype(dtype) + sum_array = a.nansum(axis, dtype=dtype, keepdims=keepdims, where=where) + if dtype.kind == "f" or dtype.kind == "c": + sum_array.__itruediv__( + np.array(normalizer, dtype=dtype), + ) + else: + sum_array.__ifloordiv__(np.array(normalizer, dtype=dtype)) + + if out is not None and sum_array is not out: + assert out.dtype != sum_array.dtype + out._thunk.convert(sum_array._thunk) + return out + else: + return sum_array - return self.mean(axis, dtype, out, keepdims, new_where) @add_boilerplate() def min( @@ -4213,11 +4237,6 @@ def _perform_unary_reduction( # TODO: Need to require initial to be given when the array is empty # or a where mask is given. 
- - # if not where: - # where = ndarray(shape=(1,), dtype=bool) - # where.fill(True) - if where is not None and isinstance(where, ndarray): # The where array has to broadcast to the src.shape if np.broadcast_shapes(src.shape, where.shape) != src.shape: diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index ec12e86f7..4c2fc4f4d 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -60,11 +60,11 @@ def test_scalar(val): assert np.array_equal(res_np, res_num) -# @pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) -# def test_scalar_where(val): -# res_np = np.mean(val, where=True) -# res_num = num.mean(val, where=True) -# assert np.array_equal(res_np, res_num) +@pytest.mark.parametrize("val", (0.0, 10.0, -5, 1 + 1j)) +def test_scalar_where(val): + res_np = np.mean(val, where=True) + res_num = num.mean(val, where=True) + assert np.array_equal(res_np, res_num) @pytest.mark.parametrize("size", NO_EMPTY_SIZE) @@ -87,6 +87,17 @@ def test_basic_where(size): res_num = num.mean(arr_num, where=where_num) assert np.array_equal(res_np, res_num, equal_nan=True) +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_where_broadcast(size): + arr_np = np.random.randint(-5, 5, size=size) + arr_num = num.array(arr_np) + where_np = np.zeros((1,), bool) + where_num = num.array(where_np) + res_np = np.mean(arr_np, where=where_np) + res_num = num.mean(arr_num, where=where_num) + assert np.array_equal(res_np, res_num, equal_nan=True) + + @pytest.mark.xfail @pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) From 0d7d66c6bdd5b2268cba3be8cf71ae9a1e6b5b9e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Oct 2023 09:39:58 -0700 Subject: [PATCH 12/66] adding test for nanmean --- cunumeric/array.py | 63 ++++++++++-- cunumeric/module.py | 20 +--- tests/integration/test_nanmean.py | 154 ++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+), 26 deletions(-) create mode 100755 tests/integration/test_nanmean.py diff --git a/cunumeric/array.py b/cunumeric/array.py index e1442be29..d9165512f 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3203,16 +3203,18 @@ def nanmean( Multiple GPUs, Multiple CPUs """ - if dtype is None: - dtype = a.dtype + from . 
import _ufunc if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_): - return a.mean(axis, dtype, out, keepdims, where=where) + return self.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) - nan_mask = _ufunc.bit_twiddling.bitwise_not(_ufunc.floating.isnan(a)) + if dtype is None: + dtype = self.dtype + + nan_mask = _ufunc.bit_twiddling.bitwise_not(_ufunc.floating.isnan(self)) normalizer = nan_mask.sum( axis=axis, dtype=np.int32, keepdims=keepdims, where=where) #normalizer = normalizer.astype(dtype) - sum_array = a.nansum(axis, dtype=dtype, keepdims=keepdims, where=where) + sum_array = self.nansum(axis, dtype=dtype, keepdims=keepdims, where=where) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( np.array(normalizer, dtype=dtype), @@ -3221,8 +3223,10 @@ def nanmean( sum_array.__ifloordiv__(np.array(normalizer, dtype=dtype)) if out is not None and sum_array is not out: - assert out.dtype != sum_array.dtype - out._thunk.convert(sum_array._thunk) + if out.dtype != sum_array.dtype: + out._thunk.convert(sum_array._thunk) + else: + out.copy(sum_array) return out else: return sum_array @@ -3742,6 +3746,51 @@ def sum( where=where, ) + @add_boilerplate() + def nansum( + self, + axis: Any = None, + dtype: Any = None, + out: Union[ndarray, None] = None, + keepdims: bool = False, + initial: Optional[Union[int, float]] = None, + where: Optional[ndarray] = None, + ) -> ndarray: + """a.nansum(axis=None, dtype=None, out=None, keepdims=False, initial=0, + where=None) + + Return the sum of the array elements over the given axis ignoring nan. + + Refer to :func:`cunumeric.nansum` for full documentation. + + See Also + -------- + cunumeric.nansum : equivalent function + + Availability + -------- + Multiple GPUs, Multiple CPUs + + """ + # Note that np.nansum and np.sum allow complex datatypes + # so there are no "disallowed types" for this API + + if self.dtype.kind in ("f", "c"): + unary_red_code = UnaryRedCode.NANSUM + else: + unary_red_code = UnaryRedCode.SUM + + return self._perform_unary_reduction( + unary_red_code, + self, + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + initial=initial, + where=where, + ) + def swapaxes(self, axis1: Any, axis2: Any) -> ndarray: """a.swapaxes(axis1, axis2) diff --git a/cunumeric/module.py b/cunumeric/module.py index f66c1bec4..7704eb89f 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6076,25 +6076,7 @@ def nansum( Multiple GPUs, Multiple CPUs """ - # Note that np.nansum and np.sum allow complex datatypes - # so there are no "disallowed types" for this API - - if a.dtype.kind in ("f", "c"): - unary_red_code = UnaryRedCode.NANSUM - else: - unary_red_code = UnaryRedCode.SUM - - return a._perform_unary_reduction( - unary_red_code, - a, - axis=axis, - dtype=dtype, - out=out, - keepdims=keepdims, - initial=initial, - where=where, - ) - + return a.nansum(axis, dtype, out, keepdims, initial, where) # Exponents and logarithms diff --git a/tests/integration/test_nanmean.py b/tests/integration/test_nanmean.py new file mode 100755 index 000000000..d1e8dc433 --- /dev/null +++ b/tests/integration/test_nanmean.py @@ -0,0 +1,154 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest + +import cunumeric as num + +DIM = 7 + +NO_EMPTY_SIZE = ( + (1,), + (DIM,), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +) + + +def gen_out_shape(size, axis): + if axis is None: + return () + if axis < 0: + axis += len(size) + if axis >= 0 and axis < len(size): + return size[:axis] + size[axis + 1 :] + else: + return -1 + + +@pytest.mark.parametrize("arr", ([], [[], []])) +def test_empty_arr(arr): + res_np = np.nanmean(arr) + res_num = num.nanmean(arr) + assert np.isnan(res_np) and np.isnan(res_num) + + +@pytest.mark.parametrize("val", (np.nan, 0.0, 10.0, -5, 1 + 1j)) +def test_scalar(val): + res_np = np.nanmean(val) + res_num = num.nanmean(val) + assert np.array_equal(res_np, res_num, equal_nan=True) + + +@pytest.mark.parametrize("val", (np.nan, 0.0, 10.0, -5, 1 + 1j)) +def test_scalar_where(val): + res_np = np.nanmean(val, where=True) + res_num = num.nanmean(val, where=True) + assert np.array_equal(res_np, res_num, equal_nan=True) + + +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_basic(size): + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==0] = np.nan + arr_num = num.array(arr_np) + res_np = np.nanmean(arr_np) + res_num = num.nanmean(arr_num) + assert np.array_equal(res_np, res_num, equal_nan=True) + + +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_basic_where(size): + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==0] = np.nan + arr_num = num.array(arr_np) + where_np = arr_np % 2 + where_np = arr_np.astype(bool) + where_num = num.array(where_np) + res_np = np.nanmean(arr_np, where=where_np) + res_num = num.nanmean(arr_num, where=where_num) + assert np.array_equal(res_np, res_num, equal_nan=True) + + +@pytest.mark.xfail +@pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) +def test_axis_tuple(axis): + # In Numpy, it pass + # In cuNumeric, it raises NotImplementedError + size = (3, 4, 7) + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==1] = np.nan + arr_num = num.array(arr_np) + out_np = np.nanmean(arr_np, axis=axis) + out_num = num.nanmean(arr_num, axis=axis) + assert np.array_equal(out_np, out_num, equal_nan=True) + + +@pytest.mark.parametrize("keepdims", (False, True)) +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_axis_keepdims(size, keepdims): + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==1] = np.nan + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim, ndim): + out_np = np.nanmean(arr_np, axis=axis, keepdims=keepdims) + out_num = num.nanmean(arr_num, axis=axis, keepdims=keepdims) + assert np.array_equal(out_np, out_num, equal_nan=True) + + +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_axis_where(size): + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==0] = np.nan + arr_num = num.array(arr_np) + where_np = arr_np[arr_np%2==1] % 2 + where_np = arr_np.astype(bool) + where_num = num.array(where_np) + ndim = 
arr_np.ndim + for axis in range(-ndim, ndim): + out_np = np.nanmean(arr_np, axis=axis, where=where_np) + out_num = num.nanmean(arr_num, axis=axis, where=where_num) + assert np.array_equal(out_np, out_num, equal_nan=True) + + +@pytest.mark.parametrize("out_dt", (np.float32, np.complex128)) +@pytest.mark.parametrize("size", NO_EMPTY_SIZE) +def test_out(size, out_dt): + arr_np = np.random.randint(-5, 5, size=size).astype(float) + arr_np[arr_np%2==0] = np.nan + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in (-1, ndim - 1, None): + out_shape = gen_out_shape(size, axis) + out_np = np.empty(out_shape, dtype=out_dt) + out_num = num.empty(out_shape, dtype=out_dt) + np.nanmean(arr_np, axis=axis, out=out_np) + num.nanmean(arr_num, axis=axis, out=out_num) + np.array_equal(out_np, out_num, equal_nan=True) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From baf61ed58d94e6a0455163e482ae63c00e3628ee Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Oct 2023 09:56:44 -0700 Subject: [PATCH 13/66] some clean-up + formatting --- cunumeric/array.py | 45 ++++++++++++++++++++----------- cunumeric/module.py | 1 + tests/integration/test_mean.py | 2 +- tests/integration/test_nanmean.py | 14 +++++----- 4 files changed, 38 insertions(+), 24 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index d9165512f..1fea5a905 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -191,13 +191,20 @@ def check_writeable(arr: Union[ndarray, tuple[ndarray, ...], None]) -> None: if any(not arr.flags.writeable for arr in check_list): raise ValueError("array is not writeable") -def broadcast_where(where:Union[ndarray, None], shape:NdShape )-> ndarray: + +def broadcast_where( + where: Union[ndarray, None], shape: NdShape +) -> Union[ndarray, None]: if where is not None: where_array = convert_to_cunumeric_ndarray(where) else: - where_array = None + where_array = None - if where_array is not None and np.ndim(where_array)!=0 and where_array.shape != shape: + if ( + where_array is not None + and np.ndim(where_array) != 0 + and where_array.shape != shape + ): where_array = ndarray( shape=shape, thunk=where_array._thunk.broadcast_to(shape), @@ -205,6 +212,7 @@ def broadcast_where(where:Union[ndarray, None], shape:NdShape )-> ndarray: ) return where_array + class flagsobj: """ Information about the memory layout of the array. @@ -870,7 +878,7 @@ def __bool__(self) -> bool: def __complex__(self) -> complex: """a.__complex__(/)""" return complex(self.__array__()) - + def __contains__(self, item: Any) -> ndarray: """a.__contains__(key, /) @@ -3139,7 +3147,7 @@ def mean( dtype = np.dtype(np.float64) else: dtype = self.dtype - + where_array = broadcast_where(where, self.shape) # Do the sum if out is not None and out.dtype == dtype: @@ -3147,12 +3155,13 @@ def mean( axis=axis, out=out, keepdims=keepdims, where=where_array ) else: - sum_array = self.sum(axis=axis, keepdims=keepdims, where=where_array) + sum_array = self.sum( + axis=axis, keepdims=keepdims, where=where_array + ) if axis is None: if where_array is not None: divisor = where_array._count_nonzero() - else: divisor = reduce(lambda x, y: x * y, self.shape, 1) @@ -3205,16 +3214,21 @@ def nanmean( """ from . 
import _ufunc - if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_): - return self.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where) + if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_): + return self.mean( + axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where + ) if dtype is None: dtype = self.dtype - - nan_mask = _ufunc.bit_twiddling.bitwise_not(_ufunc.floating.isnan(self)) - normalizer = nan_mask.sum( axis=axis, dtype=np.int32, keepdims=keepdims, where=where) - #normalizer = normalizer.astype(dtype) - sum_array = self.nansum(axis, dtype=dtype, keepdims=keepdims, where=where) + + nan_mask = _ufunc.bit_twiddling.bitwise_not( + _ufunc.floating.isnan(self) + ) + normalizer = nan_mask.sum(axis=axis, keepdims=keepdims, where=where) + # normalizer = normalizer.astype(dtype) + sum_array = self.nansum(axis, keepdims=keepdims, where=where) + sum_array = sum_array.astype(dtype) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( np.array(normalizer, dtype=dtype), @@ -3226,12 +3240,11 @@ def nanmean( if out.dtype != sum_array.dtype: out._thunk.convert(sum_array._thunk) else: - out.copy(sum_array) + out = sum_array return out else: return sum_array - @add_boilerplate() def min( self, diff --git a/cunumeric/module.py b/cunumeric/module.py index 7704eb89f..9504636e6 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6078,6 +6078,7 @@ def nansum( return a.nansum(axis, dtype, out, keepdims, initial, where) + # Exponents and logarithms diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index 4c2fc4f4d..f6ffb801a 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -87,6 +87,7 @@ def test_basic_where(size): res_num = num.mean(arr_num, where=where_num) assert np.array_equal(res_np, res_num, equal_nan=True) + @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_where_broadcast(size): arr_np = np.random.randint(-5, 5, size=size) @@ -98,7 +99,6 @@ def test_where_broadcast(size): assert np.array_equal(res_np, res_num, equal_nan=True) - @pytest.mark.xfail @pytest.mark.parametrize("axis", ((-3, -1), (-1, 0), (-2, 2), (0, 2))) def test_axis_tuple(axis): diff --git a/tests/integration/test_nanmean.py b/tests/integration/test_nanmean.py index d1e8dc433..9849e62dd 100755 --- a/tests/integration/test_nanmean.py +++ b/tests/integration/test_nanmean.py @@ -70,7 +70,7 @@ def test_scalar_where(val): @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_basic(size): arr_np = np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==0] = np.nan + arr_np[arr_np % 2 == 0] = np.nan arr_num = num.array(arr_np) res_np = np.nanmean(arr_np) res_num = num.nanmean(arr_num) @@ -80,7 +80,7 @@ def test_basic(size): @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_basic_where(size): arr_np = np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==0] = np.nan + arr_np[arr_np % 2 == 0] = np.nan arr_num = num.array(arr_np) where_np = arr_np % 2 where_np = arr_np.astype(bool) @@ -97,7 +97,7 @@ def test_axis_tuple(axis): # In cuNumeric, it raises NotImplementedError size = (3, 4, 7) arr_np = np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==1] = np.nan + arr_np[arr_np % 2 == 1] = np.nan arr_num = num.array(arr_np) out_np = np.nanmean(arr_np, axis=axis) out_num = num.nanmean(arr_num, axis=axis) @@ -108,7 +108,7 @@ def test_axis_tuple(axis): @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_axis_keepdims(size, keepdims): arr_np 
= np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==1] = np.nan + arr_np[arr_np % 2 == 1] = np.nan arr_num = num.array(arr_np) ndim = arr_np.ndim for axis in range(-ndim, ndim): @@ -120,9 +120,9 @@ def test_axis_keepdims(size, keepdims): @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_axis_where(size): arr_np = np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==0] = np.nan + arr_np[arr_np % 2 == 0] = np.nan arr_num = num.array(arr_np) - where_np = arr_np[arr_np%2==1] % 2 + where_np = arr_np[arr_np % 2 == 1] % 2 where_np = arr_np.astype(bool) where_num = num.array(where_np) ndim = arr_np.ndim @@ -136,7 +136,7 @@ def test_axis_where(size): @pytest.mark.parametrize("size", NO_EMPTY_SIZE) def test_out(size, out_dt): arr_np = np.random.randint(-5, 5, size=size).astype(float) - arr_np[arr_np%2==0] = np.nan + arr_np[arr_np % 2 == 0] = np.nan arr_num = num.array(arr_np) ndim = arr_np.ndim for axis in (-1, ndim - 1, None): From 30b1e1829c9a1eee7d442c46144f83874a1e4bc1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Oct 2023 13:34:58 -0700 Subject: [PATCH 14/66] fixing issues after rebase --- cunumeric/array.py | 4 ++-- cunumeric/module.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 1fea5a905..6d58be54f 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3136,7 +3136,7 @@ def mean( Multiple GPUs, Multiple CPUs """ - if axis is not None and type(axis) != int: + if axis is not None and not isinstance(axis, int): raise NotImplementedError( "cunumeric.mean only supports int types for " "'axis' currently" @@ -4147,7 +4147,7 @@ def find_common_type(*args: ndarray) -> np.dtype[Any]: scalars.append(array.dtype.type(0)) else: array_types.append(array.dtype) - return np.find_common_type(array_types, scalar_types) + return np.result_type(*array_types, *scalars) def _maybe_convert(self, dtype: np.dtype[Any], hints: Any) -> ndarray: if self.dtype == dtype: diff --git a/cunumeric/module.py b/cunumeric/module.py index 9504636e6..0f984f7e5 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6076,7 +6076,14 @@ def nansum( Multiple GPUs, Multiple CPUs """ - return a.nansum(axis, dtype, out, keepdims, initial, where) + return a.nansum( + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + initial=initial, + where=where, + ) # Exponents and logarithms From 2277234775b85fad335221497561db2a21528d63 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Oct 2023 15:02:04 -0700 Subject: [PATCH 15/66] code clean-up --- cunumeric/_ufunc/ufunc.py | 2 +- cunumeric/array.py | 34 ++---- cunumeric/deferred.py | 2 +- cunumeric/eager.py | 2 +- cunumeric/module.py | 2 +- src/cunumeric/unary/scalar_unary_red.cc | 2 +- src/cunumeric/unary/scalar_unary_red.cu | 2 +- src/cunumeric/unary/scalar_unary_red.h | 2 +- src/cunumeric/unary/scalar_unary_red_omp.cc | 2 +- .../unary/scalar_unary_red_template.inl | 2 +- src/cunumeric/unary/unary_red.cc | 2 +- src/cunumeric/unary/unary_red.cu | 107 ++++++------------ src/cunumeric/unary/unary_red_omp.cc | 2 +- tests/integration/test_mean.py | 2 +- tests/integration/test_nanmean.py | 2 +- 15 files changed, 57 insertions(+), 110 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 763efa6db..01d2fc30b 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache 
License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cunumeric/array.py b/cunumeric/array.py index 6d58be54f..043d01e16 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -159,14 +159,9 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: def convert_to_predicate_ndarray(obj: Any) -> Any: - # Keep all boolean types as they are - if obj is True or obj is False: - # where = ndarray(shape=(), dtype=bool) - # where.fill(obj) - # return where + # Keep all boolean types and None as they are + if obj is True or obj is False or obj is None: return obj - if obj is None: - return None return convert_to_cunumeric_ndarray(obj) @@ -3225,9 +3220,11 @@ def nanmean( nan_mask = _ufunc.bit_twiddling.bitwise_not( _ufunc.floating.isnan(self) ) - normalizer = nan_mask.sum(axis=axis, keepdims=keepdims, where=where) - # normalizer = normalizer.astype(dtype) - sum_array = self.nansum(axis, keepdims=keepdims, where=where) + where_array = broadcast_where(where, nan_mask.shape) + normalizer = nan_mask.sum( + axis=axis, keepdims=keepdims, where=where_array + ) + sum_array = self.nansum(axis, keepdims=keepdims, where=where_array) sum_array = sum_array.astype(dtype) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( @@ -4113,12 +4110,10 @@ def unique(self) -> ndarray: @classmethod def _get_where_thunk( - cls, where: Union[bool, ndarray], out_shape: NdShape - ) -> Union[Literal[True], NumPyThunk]: - if where is True: - return True - # if where is False: - # raise RuntimeError("should have caught this earlier") + cls, where: Union[Literal[True], None, ndarray], out_shape: NdShape + ) -> Union[Literal[True], None, NumPyThunk]: + if where is True or where is None: + return where if not isinstance(where, ndarray) or where.dtype != np.bool_: raise RuntimeError("should have converted this earlier") if where.shape != out_shape: @@ -4353,15 +4348,10 @@ def _perform_unary_reduction( shape=out_shape, dtype=res_dtype, inputs=(src, where) ) - if where is not None: - where_thunk = cls._get_where_thunk(where, src.shape) - else: - where_thunk = None - result._thunk.unary_reduction( op, src._thunk, - where_thunk, + cls._get_where_thunk(where, src.shape), axis, axes, keepdims, diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index bb4744653..02dcbb13f 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cunumeric/eager.py b/cunumeric/eager.py index d7934d04e..20d18cd10 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cunumeric/module.py b/cunumeric/module.py index 0f984f7e5..900c2ec30 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index 77f746eb4..c10e065f9 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index 6959d05c9..76dcaeb32 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h index 856eeba65..aa7925cf9 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cunumeric/unary/scalar_unary_red.h @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index c33576355..646f0193a 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 034172ba6..a061105ab 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index 34af69434..ddfdf7a59 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 708d21a17..de82f3583 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -201,26 +201,14 @@ std::ostream& operator<<(std::ostream& os, const ThreadBlocks& blocks) return os; } -template -static __device__ __forceinline__ Point local_reduce(LHS& result, - AccessorRO in, - LHS identity, - const ThreadBlocks& blocks, - const Rect& domain, - int32_t collapsed_dim) +template +static void __device__ collapse_dims(LHS& result, + Point& point, + const Rect& domain, + int32_t collapsed_dim, + LHS identity, + coord_t tid) { - const coord_t tid = threadIdx.x; - const coord_t bid = blockIdx.x; - - Point point = blocks.point(bid, tid, domain.lo); - if (!domain.contains(point)) return point; - - while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { - LHS value = OP::convert(point, collapsed_dim, identity, in[point]); - REDOP::template fold(result, value); - blocks.next_point(point); - } - #if __CUDA_ARCH__ >= 700 // If we're collapsing the innermost dimension, we perform some optimization // with shared memory to reduce memory traffic due to atomic updates @@ -276,6 +264,29 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, // put points back in the bounds to appease the checks. point[collapsed_dim] = domain.lo[collapsed_dim]; #endif +} + +template +static __device__ __forceinline__ Point local_reduce(LHS& result, + AccessorRO in, + LHS identity, + const ThreadBlocks& blocks, + const Rect& domain, + int32_t collapsed_dim) +{ + const coord_t tid = threadIdx.x; + const coord_t bid = blockIdx.x; + + Point point = blocks.point(bid, tid, domain.lo); + if (!domain.contains(point)) return point; + + while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { + LHS value = OP::convert(point, collapsed_dim, identity, in[point]); + REDOP::template fold(result, value); + blocks.next_point(point); + } + + collapse_dims(result, point, domain, collapsed_dim, identity, tid); return point; } @@ -302,61 +313,7 @@ static __device__ __forceinline__ Point local_reduce_where(LHS& result, blocks.next_point(point); } -#if __CUDA_ARCH__ >= 700 - // If we're collapsing the innermost dimension, we perform some optimization - // with shared memory to reduce memory traffic due to atomic updates - if (collapsed_dim == DIM - 1) { - __shared__ uint8_t shmem[THREADS_PER_BLOCK * sizeof(LHS)]; - LHS* trampoline = reinterpret_cast(shmem); - // Check for the case where all the threads in the same warp have - // the same x value in which case they're all going to conflict - // so instead we do a warp-level reduction so just one thread ends - // up doing the full atomic - coord_t bucket = 0; - for (int32_t dim = DIM - 2; dim >= 0; --dim) - bucket = bucket * (domain.hi[dim] - domain.lo[dim] + 1) + point[dim] - domain.lo[dim]; - - const uint32_t same_mask = __match_any_sync(0xffffffff, bucket); - int32_t laneid; - asm volatile("mov.s32 %0, %laneid;" : "=r"(laneid)); - const uint32_t active_mask = __ballot_sync(0xffffffff, same_mask - (1 << laneid)); - if ((active_mask & (1 << laneid)) != 0) { - // Store our data into shared - trampoline[tid] = result; - // Make sure all the threads in the warp are done writing - __syncwarp(active_mask); - // Have the lowest thread in each mask pull in the values - int32_t lowest_index = -1; - for (int32_t i = 0; i < warpSize; i++) - if (same_mask & (1 << i)) { - if (lowest_index == -1) { - if (i != laneid) { - // We're not the lowest thread in the warp for - // this value so we're done, set the value back - // to identity to ensure that we don't try to - // perform the reduction out to memory - result = identity; - break; - } else // Make sure we don't do 
this test again - lowest_index = i; - // It was already our value, so just keep going - } else { - // Pull in the value from shared memory - const int32_t index = tid + i - laneid; - REDOP::template fold(result, trampoline[index]); - } - } - } - } -#endif - -#ifdef LEGATE_BOUNDS_CHECKS - // Note: this isn't necessary because we know that the affine transformation on the output - // accessor will ignore coordinates of the collapsed dimension. However, Legion's bounds checks - // want the accessor to honor the sub-rectangle passed when it was created, so we need to - // put points back in the bounds to appease the checks. - point[collapsed_dim] = domain.lo[collapsed_dim]; -#endif + collapse_dims(result, point, domain, collapsed_dim, identity, tid); return point; } template diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index ab1e071d8..fdded9b68 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -1,4 +1,4 @@ -/* Copyright 2021-2022 NVIDIA Corporation +/* Copyright 2021-2023 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/integration/test_mean.py b/tests/integration/test_mean.py index f6ffb801a..40092455c 100755 --- a/tests/integration/test_mean.py +++ b/tests/integration/test_mean.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/integration/test_nanmean.py b/tests/integration/test_nanmean.py index 9849e62dd..98962842b 100755 --- a/tests/integration/test_nanmean.py +++ b/tests/integration/test_nanmean.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2021-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
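The where-mask tests above lean on NumPy's reduction semantics: the boolean mask is broadcast against the input, and a masked-out element behaves as the reduction's identity, which is exactly what the kernels do when they skip the fold for a false mask entry. A minimal NumPy-only sketch of those semantics (illustrative, not code from this series; `a` and `mask` are made-up inputs):

```python
# Illustrative sketch of the `where` reduction semantics these patches
# implement; plain NumPy, not the cuNumeric code itself.
import numpy as np

a = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])
mask = np.array([True, False, True])  # must broadcast to a.shape

# Skipping a masked-out element is equivalent to folding the reduction's
# identity: 0 for sum, 1 for prod.
assert np.sum(a, where=mask) == np.sum(np.where(mask, a, 0.0))
assert np.prod(a, where=mask) == np.prod(np.where(mask, a, 1.0))

# The mask can be materialized at full shape first, which is what the
# broadcast_where() helper added earlier does before launching a task.
full = np.broadcast_to(mask, a.shape)
assert np.sum(a, where=full) == np.sum(a, where=mask)
```

Broadcasting the mask up front keeps the C++ tasks simple: with the `add_alignment` call on `where.base`, they can assume the mask is aligned element-for-element with the input store.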
From 73e06091bb9d209c414eef55337b398a05fd3d0e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Oct 2023 21:30:22 -0700 Subject: [PATCH 16/66] code clean-up --- cunumeric/array.py | 52 +++++++++++++-------------------------------- cunumeric/module.py | 2 +- 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 043d01e16..6c40767b4 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -188,18 +188,14 @@ def check_writeable(arr: Union[ndarray, tuple[ndarray, ...], None]) -> None: def broadcast_where( - where: Union[ndarray, None], shape: NdShape + where: Union[bool, ndarray, None], shape: NdShape ) -> Union[ndarray, None]: if where is not None: where_array = convert_to_cunumeric_ndarray(where) else: where_array = None - if ( - where_array is not None - and np.ndim(where_array) != 0 - and where_array.shape != shape - ): + if where_array is not None and where_array.shape != shape: where_array = ndarray( shape=shape, thunk=where_array._thunk.broadcast_to(shape), @@ -3220,11 +3216,8 @@ def nanmean( nan_mask = _ufunc.bit_twiddling.bitwise_not( _ufunc.floating.isnan(self) ) - where_array = broadcast_where(where, nan_mask.shape) - normalizer = nan_mask.sum( - axis=axis, keepdims=keepdims, where=where_array - ) - sum_array = self.nansum(axis, keepdims=keepdims, where=where_array) + normalizer = nan_mask.sum(axis=axis, keepdims=keepdims, where=where) + sum_array = self._nansum(axis, keepdims=keepdims, where=where) sum_array = sum_array.astype(dtype) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( @@ -3757,7 +3750,7 @@ def sum( ) @add_boilerplate() - def nansum( + def _nansum( self, axis: Any = None, dtype: Any = None, @@ -3766,22 +3759,6 @@ def nansum( initial: Optional[Union[int, float]] = None, where: Optional[ndarray] = None, ) -> ndarray: - """a.nansum(axis=None, dtype=None, out=None, keepdims=False, initial=0, - where=None) - - Return the sum of the array elements over the given axis ignoring nan. - - Refer to :func:`cunumeric.nansum` for full documentation. 
- - See Also - -------- - cunumeric.nansum : equivalent function - - Availability - -------- - Multiple GPUs, Multiple CPUs - - """ # Note that np.nansum and np.sum allow complex datatypes # so there are no "disallowed types" for this API @@ -4170,7 +4147,7 @@ def _perform_unary_op( out: Union[Any, None] = None, extra_args: Any = None, dtype: Union[np.dtype[Any], None] = None, - where: Union[bool, ndarray] = True, + where: Union[bool, None, ndarray] = True, out_dtype: Union[np.dtype[Any], None] = None, ) -> ndarray: if out is not None: @@ -4212,7 +4189,7 @@ def _perform_unary_op( # Quick exit if where is False: return out - + where_array = broadcast_where(where, out.shape) if out_dtype is None: if out.dtype != src.dtype and not ( op == UnaryOpCode.ABSOLUTE and src.dtype.kind == "c" @@ -4220,12 +4197,12 @@ def _perform_unary_op( temp = ndarray( out.shape, dtype=src.dtype, - inputs=(src, where), + inputs=(src, where_array), ) temp._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where, out.shape), + cls._get_where_thunk(where_array, out.shape), extra_args, ) out._thunk.convert(temp._thunk) @@ -4233,7 +4210,7 @@ def _perform_unary_op( out._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where, out.shape), + cls._get_where_thunk(where_array, out.shape), extra_args, ) else: @@ -4241,12 +4218,12 @@ def _perform_unary_op( temp = ndarray( out.shape, dtype=out_dtype, - inputs=(src, where), + inputs=(src, where_array), ) temp._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where, out.shape), + cls._get_where_thunk(where_array, out.shape), extra_args, ) out._thunk.convert(temp._thunk) @@ -4254,7 +4231,7 @@ def _perform_unary_op( out._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where, out.shape), + cls._get_where_thunk(where_array, out.shape), extra_args, ) return out @@ -4348,10 +4325,11 @@ def _perform_unary_reduction( shape=out_shape, dtype=res_dtype, inputs=(src, where) ) + where_array = broadcast_where(where, src.shape) result._thunk.unary_reduction( op, src._thunk, - cls._get_where_thunk(where, src.shape), + cls._get_where_thunk(where_array, src.shape), axis, axes, keepdims, diff --git a/cunumeric/module.py b/cunumeric/module.py index 900c2ec30..b6363eef5 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6076,7 +6076,7 @@ def nansum( Multiple GPUs, Multiple CPUs """ - return a.nansum( + return a._nansum( axis=axis, dtype=dtype, out=out, From dfafdcc6e01ada995621b12255fb650ff585f3a3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 18 Oct 2023 10:05:30 -0700 Subject: [PATCH 17/66] adding test_where for unary operations --- cunumeric/deferred.py | 2 ++ tests/integration/test_logical.py | 15 ++++++++++----- tests/integration/test_nan_reduction.py | 19 +++++++++++++++++++ tests/integration/test_prod.py | 14 ++++++++++---- tests/integration/test_reduction.py | 12 +++++++++--- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 02dcbb13f..c1a102e73 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -3159,6 +3159,8 @@ def unary_reduction( ) is_where = bool(where is not None) + if is_where: + where = self.runtime.to_deferred_array(where) # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or lhs_array.ndim == rhs_array.ndim - ( diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py index ca9b99220..b0f83aaa6 100644 --- a/tests/integration/test_logical.py +++ b/tests/integration/test_logical.py 
@@ -102,19 +102,24 @@ def test_nd_inputs(ndim, func): assert np.array_equal(out_np, out_num) -@pytest.mark.skip def test_where(): - # "the `where` parameter is currently not supported" - x = np.array([[True, True, False], [True, True, True]]) y = np.array([[True, False], [True, True]]) cy = num.array(y) + # where needs to be broadcasted assert num.array_equal( - num.all(cy, where=[True, False]), np.all(x, where=[True, False]) + num.all(cy, where=[True, False]), np.all(y, where=[True, False]) ) assert num.array_equal( num.any(cy, where=[[True], [False]]), - np.any(x, where=[[True], [False]]), + np.any(y, where=[[True], [False]]), + ) + + # Where is a boolean + assert num.array_equal(num.all(cy, where=True), np.all(y, where=True)) + assert num.array_equal( + num.any(cy, where=False), + np.any(y, where=False), ) diff --git a/tests/integration/test_nan_reduction.py b/tests/integration/test_nan_reduction.py index 34bbd1447..d3b5fd7e9 100644 --- a/tests/integration/test_nan_reduction.py +++ b/tests/integration/test_nan_reduction.py @@ -291,6 +291,25 @@ def test_all_nans_nansum(self, ndim): assert out_num == 0.0 + def test_where(self): + arr = [[1, np.nan, 3], [2, np.nan, 4]] + out_np = np.nansum(arr, where=[False, True, True]) + out_num = num.nansum(arr, where=[False, True, True]) + assert np.allclose(out_np, out_num) + + out_np = np.nanprod(arr, where=[False, True, True]) + out_num = num.nanprod(arr, where=[False, True, True]) + assert np.allclose(out_np, out_num) + + # where is a boolean + out_np = np.nanmax(arr, where=True) + out_num = num.nanmax(arr, where=True) + assert np.allclose(out_np, out_num) + + out_np = np.nanmin(arr, where=True) + out_num = num.nanmin(arr, where=True) + assert np.allclose(out_np, out_num) + class TestCornerCases: """ diff --git a/tests/integration/test_prod.py b/tests/integration/test_prod.py index ab0f4def8..c004c95a3 100644 --- a/tests/integration/test_prod.py +++ b/tests/integration/test_prod.py @@ -147,15 +147,21 @@ def test_initial_empty_array(self): out_np = np.prod(arr_np, initial=initial_value) assert allclose(out_np, out_num) - @pytest.mark.xfail def test_where(self): arr = [[1, 2], [3, 4]] - out_np = np.prod(arr, where=[False, True]) # return 8 - # cuNumeric raises NotImplementedError: - # the `where` parameter is currently not supported + out_np = np.prod(arr, where=[False, True]) out_num = num.prod(arr, where=[False, True]) assert allclose(out_np, out_num) + # where is boolean + out_np = np.prod(arr, where=True) + out_num = num.prod(arr, where=True) + assert allclose(out_np, out_num) + + out_np = np.prod(arr, where=False) + out_num = num.prod(arr, where=False) + assert allclose(out_np, out_num) + class TestProdPositive(object): """ diff --git a/tests/integration/test_reduction.py b/tests/integration/test_reduction.py index a7a89a6af..f3379265b 100644 --- a/tests/integration/test_reduction.py +++ b/tests/integration/test_reduction.py @@ -151,15 +151,21 @@ def test_initial_empty_array(self): out_np = np.sum(arr_np, initial=initial_value) # return initial_value assert allclose(out_np, out_num) - @pytest.mark.xfail def test_where(self): arr = [[1, 2], [3, 4]] out_np = np.sum(arr, where=[False, True]) # return 6 - # cuNumeric raises NotImplementedError: - # "the `where` parameter is currently not supported" out_num = num.sum(arr, where=[False, True]) assert allclose(out_np, out_num) + # where is a boolean + out_np = np.sum(arr, where=True) + out_num = num.sum(arr, where=True) + assert allclose(out_np, out_num) + + out_np = np.sum(arr, where=False) + out_num 
= num.sum(arr, where=False) + assert allclose(out_np, out_num) + class TestSumPositive(object): """ From 3071edde7e518d87333771ed97450197fa0a7255 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 18 Oct 2023 10:51:05 -0700 Subject: [PATCH 18/66] adding nanmean to docs --- docs/cunumeric/source/api/statistics.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cunumeric/source/api/statistics.rst b/docs/cunumeric/source/api/statistics.rst index 7d844d788..ef3056da8 100644 --- a/docs/cunumeric/source/api/statistics.rst +++ b/docs/cunumeric/source/api/statistics.rst @@ -10,6 +10,7 @@ Averages and variances :toctree: generated/ mean + nanmean Histograms From 67aab76159442fdc0b0ed8896c809fa699fe3b68 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 18 Oct 2023 12:53:22 -0700 Subject: [PATCH 19/66] nanmean is not an official method of ndarray --- cunumeric/array.py | 18 +----------------- cunumeric/module.py | 2 +- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 6c40767b4..03810a3b7 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3179,7 +3179,7 @@ def mean( return sum_array @add_boilerplate() - def nanmean( + def _nanmean( self, axis: Any = None, dtype: Union[np.dtype[Any], None] = None, @@ -3187,22 +3187,6 @@ def nanmean( keepdims: bool = False, where: Union[ndarray, None] = None, ) -> ndarray: - """a.mean(axis=None, dtype=None, out=None, keepdims=False) - - Returns the average of the array elements along given axis, - ignoring NaN entires - - Refer to :func:`cunumeric.mean` for full documentation. - - See Also - -------- - cunumeric.nanmean : equivalent function - - Availability - -------- - Multiple GPUs, Multiple CPUs - - """ from . import _ufunc if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.bool_): diff --git a/cunumeric/module.py b/cunumeric/module.py index b6363eef5..146fcd462 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7117,7 +7117,7 @@ def nanmean( -------- Multiple GPUs, Multiple CPUs """ - return a.nanmean( + return a._nanmean( axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where ) From 09a4477b8f1a9bc7f3dea0ce8457146474670f06 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 18 Oct 2023 14:44:10 -0700 Subject: [PATCH 20/66] fixing test --- tests/integration/test_nan_reduction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_nan_reduction.py b/tests/integration/test_nan_reduction.py index d3b5fd7e9..b764dbf93 100644 --- a/tests/integration/test_nan_reduction.py +++ b/tests/integration/test_nan_reduction.py @@ -302,12 +302,12 @@ def test_where(self): assert np.allclose(out_np, out_num) # where is a boolean - out_np = np.nanmax(arr, where=True) - out_num = num.nanmax(arr, where=True) + out_np = np.nanmax(arr, where=True, initial=-1) + out_num = num.nanmax(arr, where=True, initial=-1) assert np.allclose(out_np, out_num) - out_np = np.nanmin(arr, where=True) - out_num = num.nanmin(arr, where=True) + out_np = np.nanmin(arr, where=True, initial=10) + out_num = num.nanmin(arr, where=True, initial=10) assert np.allclose(out_np, out_num) From 851dd42ceb15c6c157f11fe9e382f5e0ba8551aa Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 10:57:20 -0800 Subject: [PATCH 21/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Jacob Faibussowitsch --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index a061105ab..d30c93154 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -185,7 +185,7 @@ struct ScalarUnaryRed { } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); - if (whereptr[idx] == true) + if (whereptr[idx]) OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); } else { if (whereptr[idx] == true) OP::template fold(lhs, OP::convert(inptr[idx], identity)); From 83308a65e31281d230eaac80b8963597587ce86e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 10:57:51 -0800 Subject: [PATCH 22/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Jacob Faibussowitsch --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index d30c93154..a8bc5272f 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -188,7 +188,7 @@ struct ScalarUnaryRed { if (whereptr[idx]) OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); } else { - if (whereptr[idx] == true) OP::template fold(lhs, OP::convert(inptr[idx], identity)); + if (whereptr[idx]) OP::template fold(lhs, OP::convert(inptr[idx], identity)); } } From df70b28de1a48b88db9cd0ae6fc6d61f0453f11d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 10:58:40 -0800 Subject: [PATCH 23/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Jacob Faibussowitsch --- src/cunumeric/unary/scalar_unary_red_template.inl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index a8bc5272f..251d5ae3f 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -216,11 +216,12 @@ struct ScalarUnaryRed { if constexpr (KIND != VariantKind::GPU) { // Check to see if this is dense or not if (dense) { - return ScalarReductionPolicy()(volume, out, identity, *this); + ScalarReductionPolicy()(volume, out, identity, *this); + return; } } #endif - return ScalarReductionPolicy()(volume, out, identity, *this); + ScalarReductionPolicy()(volume, out, identity, *this); } }; From 7070d9f8e8eff697777864edba7912a6bc4474fa Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 10:59:28 -0800 Subject: [PATCH 24/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Jacob Faibussowitsch --- src/cunumeric/unary/scalar_unary_red_template.inl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 251d5ae3f..b1b3a6729 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -264,8 +264,9 @@ static void scalar_unary_red_template(TaskContext& context) bool has_where = scalars[2].value(); size_t start_idx = has_where ? 
2 : 1; std::vector extra_args; + extra_args.reserve(inputs.size()); for (size_t idx = start_idx; idx < inputs.size(); ++idx) - extra_args.push_back(std::move(inputs[idx])); + extra_args.emplace_back(std::move(inputs[idx])); // If the RHS was a scalar, use (1,) as the shape if (shape.dim == 0) { shape.dim = 1; From c00661a434fd1c9af1117657882ef5cb0ed66ba9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:14:41 -0800 Subject: [PATCH 25/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 03810a3b7..9a9c8d10f 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3154,7 +3154,7 @@ def mean( if where_array is not None: divisor = where_array._count_nonzero() else: - divisor = reduce(lambda x, y: x * y, self.shape, 1) + divisor = np.array(reduce(lambda x, y: x * y, self.shape, 1)) else: if where_array is not None: From ab3bb104a98b752444d9927899b30015359a9465 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:14:51 -0800 Subject: [PATCH 26/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index b1b3a6729..b0b8b8f86 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -273,7 +273,7 @@ static void scalar_unary_red_template(TaskContext& context) shape[0] = 1; } - if (has_where == true) { + if (has_where) { ScalarUnaryRedArgs args{context.reductions()[0], inputs[0], inputs[1], From df5248e709841643dadf27b258bb985d87b9c844 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:15:37 -0800 Subject: [PATCH 27/66] Update src/cunumeric/unary/unary_red.cu Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index de82f3583..b89905903 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -202,7 +202,7 @@ std::ostream& operator<<(std::ostream& os, const ThreadBlocks& blocks) } template -static void __device__ collapse_dims(LHS& result, +static void __device__ __forceinline__ collapse_dims(LHS& result, Point& point, const Rect& domain, int32_t collapsed_dim, From 8b3d0b716f67b9b3ca704d8af8820d22b71e6ce4 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:15:55 -0800 Subject: [PATCH 28/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 9a9c8d10f..271dfe92d 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3166,7 +3166,7 @@ def mean( sum_array = sum_array.astype(dtype) if dtype.kind == "f" or dtype.kind == "c": sum_array.__itruediv__( - np.array(divisor, dtype=dtype), + divisor.astype(dtype), ) else: sum_array.__ifloordiv__(np.array(divisor, dtype=dtype)) From 2a2ead5a6181dcc11c38089329430f7f72a44d28 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:16:08 -0800 Subject: [PATCH 29/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 271dfe92d..f0eb480e5 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3169,7 +3169,7 @@ def mean( divisor.astype(dtype), ) else: - sum_array.__ifloordiv__(np.array(divisor, dtype=dtype)) + sum_array.__ifloordiv__(divisor.astype(dtype)) # Convert to the output we didn't already put it there if out is not None and sum_array is not out: assert out.dtype != sum_array.dtype From 329bf5ac13e0546b8f1542f74958d2c7608aabf5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:16:19 -0800 Subject: [PATCH 30/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index f0eb480e5..60e8cf2b2 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3160,7 +3160,7 @@ def mean( if where_array is not None: divisor = where_array._count_nonzero(axis=axis) else: - divisor = self.shape[axis] + divisor = np.array(self.shape[axis]) # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype sum_array = sum_array.astype(dtype) From dcce884102a1d2dc0ec4210ead38e12c365545bf Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:23:01 -0800 Subject: [PATCH 31/66] Update cunumeric/eager.py Co-authored-by: Manolis Papadakis --- cunumeric/eager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 20d18cd10..e9cce7db4 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1499,7 +1499,7 @@ def unary_reduction( initial, ) return - if not where: + if where is None: where = True if op in _UNARY_RED_OPS_WITH_ARG: fn = _UNARY_RED_OPS_WITH_ARG[op] From a50f165c9c4af931488d7e71adea519a230ff34d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:24:07 -0800 Subject: [PATCH 32/66] Update cunumeric/module.py Co-authored-by: Manolis Papadakis --- cunumeric/module.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 146fcd462..7fc7fe28d 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7093,11 +7093,6 @@ def nanmean( in the result as dimensions with size one. With this option, the result will broadcast correctly against the input array. - If the default value is passed, then `keepdims` will not be - passed through to the `mean` method of sub-classes of - `ndarray`, however any non-default value will be. If the - sub-class' method does not implement `keepdims` any - exceptions will be raised. where : array_like of bool, optional Elements to include in the mean. 
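The review fixes above converge on one invariant for `mean`: the divisor is the number of elements that actually participate, and it is kept as an array so that the later `.astype(dtype)` call works uniformly whether it came from a nonzero count or from shape arithmetic. A standalone sketch of that divisor logic with plain NumPy stand-ins (`mean_divisor` is a hypothetical helper, not a function in the patches):

```python
# Hypothetical helper mirroring the divisor logic that the review fixes
# above settle on for ndarray.mean; plain NumPy, not cuNumeric internals.
import numpy as np
from functools import reduce

def mean_divisor(shape, axis=None, where=None):
    # With a mask: count the True entries of the broadcast mask.
    # Without one: the element count of the reduced dimension(s).
    # Wrapping in np.array keeps a uniform type that supports .astype().
    if where is not None:
        mask = np.broadcast_to(where, shape)
        return np.array(np.count_nonzero(mask, axis=axis))
    if axis is None:
        return np.array(reduce(lambda x, y: x * y, shape, 1))
    return np.array(shape[axis])

a = np.arange(6.0).reshape(2, 3)
mask = np.array([[True, False, True],
                 [True, True, False]])
div = mean_divisor(a.shape, axis=0, where=mask)
total = np.sum(a, axis=0, where=mask)
assert np.allclose(total / div.astype(a.dtype),
                   np.mean(a, axis=0, where=mask))
```

Floating and complex dtypes then take the true-division path while integer dtypes use floor division, matching the `__itruediv__`/`__ifloordiv__` split in the diff.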
From c52e0c58f436e6704265d6bddd473684b782ad4b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:26:08 -0800 Subject: [PATCH 33/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 60e8cf2b2..0a362a166 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3181,7 +3181,7 @@ def mean( @add_boilerplate() def _nanmean( self, - axis: Any = None, + axis: Optional[Union[int, tuple[int, ...]]] = None, dtype: Union[np.dtype[Any], None] = None, out: Union[ndarray, None] = None, keepdims: bool = False, From f7235b1ccce83935b870bfcdcd92961d6d6a12fd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:30:34 -0800 Subject: [PATCH 34/66] Update cunumeric/module.py Co-authored-by: Manolis Papadakis --- cunumeric/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 7fc7fe28d..a59a7a0b4 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7100,7 +7100,7 @@ def nanmean( Returns ------- m : ndarray - If `out=None`, returns a new array of the same dtype a above + If `out=None`, returns a new array of the same dtype as a above containing the mean values, otherwise a reference to the output array is returned. From 568dfc529c8c7d23de57f319c98f9525473c58fc Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 11:45:08 -0800 Subject: [PATCH 35/66] updating documentation + fixing mypy issue --- cunumeric/array.py | 4 ++-- cunumeric/module.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0a362a166..d30569135 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3152,13 +3152,13 @@ def mean( if axis is None: if where_array is not None: - divisor = where_array._count_nonzero() + divisor = np.array(where_array._count_nonzero()) else: divisor = np.array(reduce(lambda x, y: x * y, self.shape, 1)) else: if where_array is not None: - divisor = where_array._count_nonzero(axis=axis) + divisor = np.array(where_array._count_nonzero(axis=axis)) else: divisor = np.array(self.shape[axis]) # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype sum_array = sum_array.astype(dtype) diff --git a/cunumeric/module.py b/cunumeric/module.py index a59a7a0b4..e633dcbf3 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7033,7 +7033,7 @@ def mean( Returns ------- m : ndarray - If `out=None`, returns a new array of the same dtype a above + If `out is None`, returns a new array of the same dtype as a above containing the mean values, otherwise a reference to the output array is returned. @@ -7100,7 +7100,7 @@ def nanmean( Returns ------- m : ndarray - If `out=None`, returns a new array of the same dtype as a above + If `out is None`, returns a new array of the same dtype as a above containing the mean values, otherwise a reference to the output array is returned.
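The divisor bookkeeping refined in PATCHes 33 through 35 above follows NumPy's contract for `mean` with a `where` mask: the masked sum is divided by the number of selected elements, not by the full element count, which is what the `_count_nonzero()` calls compute. A small self-contained check of that contract against plain NumPy (this assumes NumPy 1.20 or newer, where reductions accept the `where` keyword; it is not cuNumeric code):

    import numpy as np

    a = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])
    w = np.array([[True, False, True],
                  [True, True, False]])

    # The sum over selected elements, divided by how many were selected.
    expected = a.sum(where=w) / np.count_nonzero(w)
    print(np.isclose(a.mean(where=w), expected))   # True: 13.0 / 4 == 3.25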
From 67abe0ca585daebefc1f5c367fb0300095c840c1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 13:47:12 -0800 Subject: [PATCH 36/66] cleaning up some of the C++ code --- src/cunumeric/unary/scalar_unary_red.h | 1 - .../unary/scalar_unary_red_template.inl | 90 +++++++------------ src/cunumeric/unary/unary_red_template.inl | 23 ++--- 3 files changed, 41 insertions(+), 73 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h index aa7925cf9..570c0d605 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cunumeric/unary/scalar_unary_red.h @@ -25,7 +25,6 @@ struct ScalarUnaryRedArgs { const Array& out; const Array& in; const Array& where; - bool has_where; UnaryRedCode op_code; legate::DomainPoint shape; std::vector args; diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index b0b8b8f86..d339f557a 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -34,7 +34,7 @@ struct ScalarUnaryRed { }; template -struct ScalarUnaryRed { +struct ScalarUnaryRed { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; @@ -122,7 +122,7 @@ struct ScalarUnaryRed { }; template -struct ScalarUnaryRed { +struct ScalarUnaryRed { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; @@ -158,35 +158,24 @@ struct ScalarUnaryRed { out = args.out.reduce_accessor(); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } - if ((args.has_where == true) && (OP_CODE != UnaryRedCode::CONTAINS)) { - where = args.where.read_accessor(rect); + where = args.where.read_accessor(rect); #ifndef LEGATE_BOUNDS_CHECKS - if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect)) { - dense = true; - inptr = in.ptr(rect); - whereptr = where.ptr(rect); - } -#endif - } else { -#ifndef LEGATE_BOUNDS_CHECKS - // Check to see if this is dense or not - if (in.accessor.is_dense_row_major(rect)) { - dense = true; - inptr = in.ptr(rect); - } -#endif + if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect)) { + dense = true; + inptr = in.ptr(rect); + whereptr = where.ptr(rect); } +#endif } __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept { if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - if (inptr[idx] == to_find) { lhs = true; } + if (whereptr[idx] && (inptr[idx] == to_find)) { lhs = true; } } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); - if (whereptr[idx]) - OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); + if (whereptr[idx]) OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); } else { if (whereptr[idx]) OP::template fold(lhs, OP::convert(inptr[idx], identity)); } @@ -194,16 +183,14 @@ struct ScalarUnaryRed { __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, SparseReduction) const noexcept { + auto p = pitches.unflatten(idx, origin); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - auto point = pitches.unflatten(idx, origin); - if (in[point] == to_find) { lhs = true; } + if (where[p] && (in[p] == to_find)) { lhs = true; } } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == 
UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { - auto p = pitches.unflatten(idx, origin); - if (where[p] == true) OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); + if (where[p]) OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); } else { - auto p = pitches.unflatten(idx, origin); - if (where[p] == true) OP::template fold(lhs, OP::convert(in[p], identity)); + if (where[p]) OP::template fold(lhs, OP::convert(in[p], identity)); } } @@ -216,29 +203,24 @@ struct ScalarUnaryRed { if constexpr (KIND != VariantKind::GPU) { // Check to see if this is dense or not if (dense) { - ScalarReductionPolicy()(volume, out, identity, *this); - return; + ScalarReductionPolicy()(volume, out, identity, *this); + return; } } #endif - ScalarReductionPolicy()(volume, out, identity, *this); + ScalarReductionPolicy()(volume, out, identity, *this); } }; -template +template struct ScalarUnaryRedImpl { template void operator()(ScalarUnaryRedArgs& args) const { // The operation is always valid for contains if constexpr (UnaryRedOp::valid || OP_CODE == UnaryRedCode::CONTAINS) { - if (args.has_where) { - ScalarUnaryRed red(args); - red.execute(); - } else { - ScalarUnaryRed red(args); - red.execute(); - } + ScalarUnaryRed red(args); + red.execute(); } } }; @@ -246,10 +228,13 @@ struct ScalarUnaryRedImpl { template struct ScalarUnaryRedDispatch { template - void operator()(ScalarUnaryRedArgs& args) const + void operator()(ScalarUnaryRedArgs& args, bool has_where) const { auto dim = std::max(1, args.in.dim()); - double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl{}, args); + if (has_where) + double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl{}, args); + else + double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl{}, args); } }; @@ -266,28 +251,21 @@ static void scalar_unary_red_template(TaskContext& context) std::vector extra_args; extra_args.reserve(inputs.size()); for (size_t idx = start_idx; idx < inputs.size(); ++idx) - extra_args.emplace_back(std::move(inputs[idx])); + extra_args.emplace_back(std::move(inputs[idx])); // If the RHS was a scalar, use (1,) as the shape if (shape.dim == 0) { shape.dim = 1; shape[0] = 1; } - if (has_where) { - ScalarUnaryRedArgs args{context.reductions()[0], - inputs[0], - inputs[1], - has_where, - op_code, - shape, - std::move(extra_args)}; - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); - } else { - Array where; - ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], where, has_where, op_code, shape, std::move(extra_args)}; - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); - } + Array dummy_where; + ScalarUnaryRedArgs args{context.reductions()[0], + inputs[0], + has_where ? 
inputs[1] : dummy_where, + op_code, + shape, + std::move(extra_args)}; + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args, has_where); } } // namespace cunumeric diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 3c77fdd50..2a0224370 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -89,22 +89,13 @@ static void unary_red_template(TaskContext& context) auto& reductions = context.reductions(); auto& scalars = context.scalars(); bool has_where = scalars[2].value(); - if (has_where) { - UnaryRedArgs args{reductions[0], - inputs[0], - inputs[1], - scalars[0].value(), - scalars[1].value()}; - op_dispatch(args.op_code, UnaryRedDispatch{}, args); - } else { - Array where; - UnaryRedArgs args{reductions[0], - inputs[0], - where, - scalars[0].value(), - scalars[1].value()}; - op_dispatch(args.op_code, UnaryRedDispatch{}, args); - } + Array dummy_where; + UnaryRedArgs args{reductions[0], + inputs[0], + has_where ? inputs[1] : dummy_where, + scalars[0].value(), + scalars[1].value()}; + op_dispatch(args.op_code, UnaryRedDispatch{}, args); } } // namespace cunumeric From bb8a46274544b546e846d39f782c3f84a2dc98e3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Nov 2023 13:54:48 -0800 Subject: [PATCH 37/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index d30569135..1b2a73038 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3194,30 +3194,14 @@ def _nanmean( axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=where ) - if dtype is None: - dtype = self.dtype - nan_mask = _ufunc.bit_twiddling.bitwise_not( _ufunc.floating.isnan(self) ) - normalizer = nan_mask.sum(axis=axis, keepdims=keepdims, where=where) - sum_array = self._nansum(axis, keepdims=keepdims, where=where) - sum_array = sum_array.astype(dtype) - if dtype.kind == "f" or dtype.kind == "c": - sum_array.__itruediv__( - np.array(normalizer, dtype=dtype), - ) - else: - sum_array.__ifloordiv__(np.array(normalizer, dtype=dtype)) - - if out is not None and sum_array is not out: - if out.dtype != sum_array.dtype: - out._thunk.convert(sum_array._thunk) - else: - out = sum_array - return out - else: - return sum_array + if where is not None: + nan_mask &= where + return self.mean( + axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=nan_mask + ) @add_boilerplate() def min( From 186af6f078b8bf046e864e666ab08c2949312fe2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 10:46:16 -0800 Subject: [PATCH 38/66] fixing the logic for nanmean after previous commit --- cunumeric/array.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 1b2a73038..2434360a2 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3111,6 +3111,7 @@ def mean( out: Union[ndarray, None] = None, keepdims: bool = False, where: Union[ndarray, None] = None, + ignore_nan: bool = False, ) -> ndarray: """a.mean(axis=None, dtype=None, out=None, keepdims=False) @@ -3141,14 +3142,19 @@ def mean( where_array = broadcast_where(where, self.shape) # Do the sum - if out is not None and out.dtype == dtype: - sum_array = self.sum( + sum_array = ( + self._nansum( axis=axis, out=out, keepdims=keepdims, where=where_array ) - else: - sum_array = self.sum( - axis=axis, 
keepdims=keepdims, where=where_array + axis=axis, out=out, keepdims=keepdims, where=where_array ) + if out is not None and out.dtype == dtype and ignore_nan + else self.sum( + axis=axis, out=out, keepdims=keepdims, where=where_array ) + if out is not None and out.dtype == dtype + else self._nansum(axis=axis, keepdims=keepdims, where=where_array) + if ignore_nan + else self.sum(axis=axis, keepdims=keepdims, where=where_array) + ) if axis is None: if where_array is not None: @@ -3158,9 +3164,12 @@ def mean( else: if where_array is not None: - divisor = np.array(where_array._count_nonzero(axis=axis)) + divisor = np.array( + where_array.sum(axis=axis, keepdims=keepdims) + ) else: divisor = np.array(self.shape[axis]) + # Divide by the number of things in the collapsed dimensions # Pick the right kinds of division based on the dtype sum_array = sum_array.astype(dtype) @@ -3200,7 +3209,12 @@ def _nanmean( if where is not None: nan_mask &= where return self.mean( - axis=axis, dtype=dtype, out=out, keepdims=keepdims, where=nan_mask + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + where=nan_mask, + ignore_nan=True, ) @add_boilerplate() From 2729b55b32ef8cea02cdd1d2cde2900b838d9240 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 11:38:42 -0800 Subject: [PATCH 39/66] cleaning up scalar_unary_red_template.inl --- .../unary/scalar_unary_red_template.inl | 126 +++--------------- 1 file changed, 21 insertions(+), 105 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index d339f557a..11d0bbe12 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -30,17 +30,13 @@ namespace cunumeric { using namespace legate; template struct ScalarUnaryRed { - ScalarUnaryRed(ScalarUnaryRedArgs& args) { assert(false); } -}; - -template -struct ScalarUnaryRed { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using LHS = typename OP::VAL; using RHS = legate_type_of; using OUT = AccessorRD; using IN = AccessorRO; + using WHERE = AccessorRO; IN in; const RHS* inptr; @@ -52,6 +48,8 @@ struct ScalarUnaryRed { Point shape; RHS to_find; bool dense; + WHERE where; + const bool* whereptr; struct DenseReduction {}; struct SparseReduction {}; @@ -67,40 +65,49 @@ struct ScalarUnaryRed { out = args.out.reduce_accessor(); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } + if constexpr (HAS_WHERE) where = args.where.read_accessor(rect); #ifndef LEGATE_BOUNDS_CHECKS // Check to see if this is dense or not if (in.accessor.is_dense_row_major(rect)) { dense = true; inptr = in.ptr(rect); } + if constexpr (HAS_WHERE) { + dense = dense && where.accessor.is_dense_row_major(rect); + if (dense) whereptr = where.ptr(rect); + } #endif } __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept { + bool mask = true; + if constexpr (HAS_WHERE) mask = (whereptr[idx] == true); + if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - if (inptr[idx] == to_find) { lhs = true; } + if (mask && (inptr[idx] == to_find)) { lhs = true; } } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); - OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); + if (mask) OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); } else { - OP::template fold(lhs, OP::convert(inptr[idx], identity)); + if (mask) OP::template fold(lhs,
OP::convert(inptr[idx], identity)); } } __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, SparseReduction) const noexcept { + auto p = pitches.unflatten(idx, origin); + bool mask = true; + if constexpr (HAS_WHERE) mask = (where[p] == true); + if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - auto point = pitches.unflatten(idx, origin); - if (in[point] == to_find) { lhs = true; } + if (mask && (in[p] == to_find)) { lhs = true; } } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { - auto p = pitches.unflatten(idx, origin); - OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); + if (mask) OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); } else { - auto p = pitches.unflatten(idx, origin); - OP::template fold(lhs, OP::convert(in[p], identity)); + if (mask) OP::template fold(lhs, OP::convert(in[p], identity)); } } @@ -121,97 +128,6 @@ struct ScalarUnaryRed { } }; -template -struct ScalarUnaryRed { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using LHS = typename OP::VAL; - using RHS = legate_type_of; - using OUT = AccessorRD; - using IN = AccessorRO; - using WHERE = AccessorRO; - - IN in; - const RHS* inptr; - OUT out; - size_t volume; - Pitches pitches; - Rect rect; - Point origin; - Point shape; - RHS to_find; - bool dense; - WHERE where; - const bool* whereptr; - - struct DenseReduction {}; - struct SparseReduction {}; - - ScalarUnaryRed(ScalarUnaryRedArgs& args) : dense(false) - { - rect = args.in.shape(); - origin = rect.lo; - in = args.in.read_accessor(rect); - volume = pitches.flatten(rect); - shape = args.shape; - - out = args.out.reduce_accessor(); - if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } - - where = args.where.read_accessor(rect); -#ifndef LEGATE_BOUNDS_CHECKS - if (in.accessor.is_dense_row_major(rect) && where.accessor.is_dense_row_major(rect)) { - dense = true; - inptr = in.ptr(rect); - whereptr = where.ptr(rect); - } -#endif - } - - __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept - { - if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - if (whereptr[idx] && (inptr[idx] == to_find)) { lhs = true; } - } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || - OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { - auto p = pitches.unflatten(idx, origin); - if (whereptr[idx]) OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); - } else { - if (whereptr[idx]) OP::template fold(lhs, OP::convert(inptr[idx], identity)); - } - } - - __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, SparseReduction) const noexcept - { - auto p = pitches.unflatten(idx, origin); - if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - if (where[p] && (in[p] == to_find)) { lhs = true; } - } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || - OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { - if (where[p]) OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); - } else { - if (where[p]) OP::template fold(lhs, OP::convert(in[p], identity)); - } - } - - void execute() const noexcept - { - auto identity = LG_OP::identity; -#ifndef LEGATE_BOUNDS_CHECKS - // The constexpr if here prevents the DenseReduction from being instantiated for GPU kernels - // which limits compile times and binary sizes. 
- if constexpr (KIND != VariantKind::GPU) { - // Check to see if this is dense or not - if (dense) { - ScalarReductionPolicy()(volume, out, identity, *this); - return; - } - } -#endif - ScalarReductionPolicy()(volume, out, identity, *this); - } -}; - template struct ScalarUnaryRedImpl { template void operator()(ScalarUnaryRedArgs& args) const From e67537474bdce7b3ad618b758c0eed6b00c87859 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 11:54:44 -0800 Subject: [PATCH 40/66] cleaning up unary_red_* files --- src/cunumeric/unary/unary_red.cc | 30 +++---------- src/cunumeric/unary/unary_red.cu | 51 ++++++---------------- src/cunumeric/unary/unary_red_omp.cc | 35 +++------------ src/cunumeric/unary/unary_red_template.inl | 11 ++--- 4 files changed, 28 insertions(+), 99 deletions(-) diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index ddfdf7a59..bdd1c7012 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -21,29 +21,8 @@ namespace cunumeric { using namespace legate; -template -struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using RHS = legate_type_of; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - auto identity = LG_OP::identity; - lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); - } - } -}; - -template -struct UnaryRedImplBodyWhere { +template +struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; @@ -59,8 +38,9 @@ struct UnaryRedImplBodyWhere { for (size_t idx = 0; idx < volume; ++idx) { auto point = pitches.unflatten(idx, rect.lo); auto identity = LG_OP::identity; - if (where[point] == true) - lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + bool mask = true; + if constexpr (HAS_WHERE) mask = (where[point] == true); + if (mask) lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); } } }; diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index b89905903..7127046a6 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -347,35 +347,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (result != identity) out.reduce(point, result); } -template -struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using RHS = legate_type_of; - using LHS = typename OP::VAL; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rd_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } -};
-template -struct UnaryRedImplBodyWhere { +template +struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; @@ -389,15 +362,19 @@ struct UnaryRedImplBodyWhere { int collapsed_dim, size_t volume) const { - auto Kernel = reduce_with_rd_acc_where; + auto Kernel = reduce_with_rd_acc; auto stream = get_cached_stream(); ThreadBlocks blocks; blocks.initialize(rect, collapsed_dim); blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); + if constexpr (HAS_WHERE) + Kernel<<>>( + lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); + else + Kernel<<>>( + lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); CHECK_CUDA_STREAM(stream); } }; diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index fdded9b68..5074031df 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -72,34 +72,8 @@ class Splitter { size_t pitches_[DIM]; }; -template -struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using RHS = legate_type_of; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - auto identity = LG_OP::identity; - lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); - } - } -}; - -template -struct UnaryRedImplBodyWhere { +template +struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; @@ -120,8 +94,9 @@ struct UnaryRedImplBodyWhere { for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { auto point = splitter.combine(o_idx, i_idx, rect.lo); auto identity = LG_OP::identity; - if (where[point] == true) - lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + bool mask = true; + if constexpr (HAS_WHERE) mask = (where[point] == true); + if (mask) lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); } } }; diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 2a0224370..eecc45ddf 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -27,12 +27,9 @@ namespace cunumeric { using namespace legate; -template +template struct UnaryRedImplBody; -template -struct UnaryRedImplBodyWhere; - template struct UnaryRedImpl { template (rect); - UnaryRedImplBodyWhere()( + UnaryRedImplBody()( lhs, rhs, where, rect, pitches, args.collapsed_dim, volume); } else { - UnaryRedImplBody()( - lhs, rhs, rect, pitches, args.collapsed_dim, volume); + UnaryRedImplBody()( + lhs, rhs, AccessorRO(), rect, pitches, args.collapsed_dim, volume); } } From c17f3d2718bb51ccbb3a00ce9b45e61cf31c29c9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 13:07:17 -0800 Subject: [PATCH 41/66] removing convert_to_predicate_ndarray --- cunumeric/array.py | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 2434360a2..852f9d34f 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -130,13 
+130,11 @@ def wrapper(*args: Any, **kwargs: Any) -> R: for k, v in kwargs.items(): if v is None: continue - elif k == "where": - kwargs[k] = convert_to_predicate_ndarray(v) elif k == "out": kwargs[k] = convert_to_cunumeric_ndarray(v, share=True) if not kwargs[k].flags.writeable: raise ValueError("out is not writeable") - elif k in keys: + elif (k in keys) or (k == "where"): kwargs[k] = convert_to_cunumeric_ndarray(v) return func(*args, **kwargs) @@ -158,13 +156,6 @@ def convert_to_cunumeric_ndarray(obj: Any, share: bool = False) -> ndarray: return ndarray(shape=None, thunk=thunk, writeable=writeable) -def convert_to_predicate_ndarray(obj: Any) -> Any: - # Keep all boolean types and None as they are - if obj is True or obj is False or obj is None: - return obj - return convert_to_cunumeric_ndarray(obj) - - def maybe_convert_to_np_ndarray(obj: Any) -> Any: """ Converts cuNumeric arrays into NumPy arrays, otherwise has no effect. @@ -188,20 +179,15 @@ def check_writeable(arr: Union[ndarray, tuple[ndarray, ...], None]) -> None: def broadcast_where( - where: Union[bool, ndarray, None], shape: NdShape + where: Union[ndarray, None], shape: NdShape ) -> Union[ndarray, None]: - if where is not None: - where_array = convert_to_cunumeric_ndarray(where) - else: - where_array = None - - if where_array is not None and where_array.shape != shape: - where_array = ndarray( + if where is not None and where.shape != shape: + where = ndarray( shape=shape, - thunk=where_array._thunk.broadcast_to(shape), + thunk=where._thunk.broadcast_to(shape), writeable=False, ) - return where_array + return where class flagsobj: @@ -4129,7 +4115,7 @@ def _perform_unary_op( out: Union[Any, None] = None, extra_args: Any = None, dtype: Union[np.dtype[Any], None] = None, - where: Union[bool, None, ndarray] = True, + where: Union[None, ndarray] = None, out_dtype: Union[np.dtype[Any], None] = None, ) -> ndarray: if out is not None: @@ -4168,9 +4154,6 @@ def _perform_unary_op( inputs=(src, where), ) - # Quick exit - if where is False: - return out where_array = broadcast_where(where, out.shape) if out_dtype is None: if out.dtype != src.dtype and not ( From e7d9096d92222fde2eb48411d799159c7c59497d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 13:17:45 -0800 Subject: [PATCH 42/66] small clean-up --- cunumeric/array.py | 16 ++++++++-------- cunumeric/deferred.py | 4 +--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 852f9d34f..2443ecc85 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -182,11 +182,9 @@ def broadcast_where( where: Union[ndarray, None], shape: NdShape ) -> Union[ndarray, None]: if where is not None and where.shape != shape: - where = ndarray( - shape=shape, - thunk=where._thunk.broadcast_to(shape), - writeable=False, - ) + from .module import broadcast_to + + where = broadcast_to(where, shape) return where @@ -4059,10 +4057,12 @@ def _get_where_thunk( ) -> Union[Literal[True], None, NumPyThunk]: if where is True or where is None: return where - if not isinstance(where, ndarray) or where.dtype != np.bool_: + if ( + not isinstance(where, ndarray) + or where.dtype != np.bool_ + or where.shape != out_shape + ): raise RuntimeError("should have converted this earlier") - if where.shape != out_shape: - raise ValueError("where parameter must have same shape as output") return where._thunk @staticmethod diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index c1a102e73..ae5c2ae57 100644 --- a/cunumeric/deferred.py +++ 
b/cunumeric/deferred.py @@ -3127,7 +3127,7 @@ def unary_op( # Perform a unary reduction operation from one set of dimensions down to # fewer - @auto_convert("src") + @auto_convert("src", "where") def unary_reduction( self, op: UnaryRedCode, @@ -3159,8 +3159,6 @@ def unary_reduction( ) is_where = bool(where is not None) - if is_where: - where = self.runtime.to_deferred_array(where) # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or lhs_array.ndim == rhs_array.ndim - ( From b6a90c9f184396b40a75cf50a396f0d5b7631243 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 13:21:43 -0800 Subject: [PATCH 43/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 2443ecc85..bcc21d11a 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -4053,9 +4053,9 @@ def unique(self) -> ndarray: @classmethod def _get_where_thunk( - cls, where: Union[Literal[True], None, ndarray], out_shape: NdShape - ) -> Union[Literal[True], None, NumPyThunk]: - if where is True or where is None: + cls, where: Union[None, ndarray], out_shape: NdShape + ) -> Union[None, NumPyThunk]: + if where is None: return where if ( not isinstance(where, ndarray) From 7ec317cc9e5d13dbedca33db75759e80d85814c9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 13:50:00 -0800 Subject: [PATCH 44/66] removing where from perform_unary_op --- cunumeric/array.py | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index bcc21d11a..934a37ee5 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -22,7 +22,6 @@ TYPE_CHECKING, Any, Callable, - Literal, Optional, Sequence, TypeVar, @@ -4115,33 +4114,31 @@ def _perform_unary_op( out: Union[Any, None] = None, extra_args: Any = None, dtype: Union[np.dtype[Any], None] = None, - where: Union[None, ndarray] = None, out_dtype: Union[np.dtype[Any], None] = None, ) -> ndarray: if out is not None: # If the shapes don't match see if we can broadcast # This will raise an exception if they can't be broadcast together - if isinstance(where, ndarray): - np.broadcast_shapes(src.shape, out.shape, where.shape) - else: - np.broadcast_shapes(src.shape, out.shape) + if np.broadcast_shapes(src.shape, out.shape) != out.shape: + raise ValueError( + f"non-broadcastable output operand with shape {out.shape} " + f"doesn't match the broadcast shape {src.shape}" + ) else: # No output yet, so make one - if isinstance(where, ndarray): - out_shape = np.broadcast_shapes(src.shape, where.shape) - else: - out_shape = src.shape + out_shape = src.shape + if dtype is not None: out = ndarray( shape=out_shape, dtype=dtype, - inputs=(src, where), + inputs=(src,), ) elif out_dtype is not None: out = ndarray( shape=out_shape, dtype=out_dtype, - inputs=(src, where), + inputs=(src,), ) else: out = ndarray( @@ -4151,10 +4148,9 @@ def _perform_unary_op( else np.dtype(np.float32) if src.dtype == np.dtype(np.complex64) else np.dtype(np.float64), - inputs=(src, where), + inputs=(src,), ) - where_array = broadcast_where(where, out.shape) if out_dtype is None: if out.dtype != src.dtype and not ( op == UnaryOpCode.ABSOLUTE and src.dtype.kind == "c" @@ -4162,12 +4158,12 @@ def _perform_unary_op( temp = ndarray( out.shape, dtype=src.dtype, - inputs=(src, where_array), + inputs=(src,), ) temp._thunk.unary_op( op, 
src._thunk, - cls._get_where_thunk(where_array, out.shape), + True, extra_args, ) out._thunk.convert(temp._thunk) @@ -4175,7 +4171,7 @@ def _perform_unary_op( out._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where_array, out.shape), + True, extra_args, ) else: @@ -4183,12 +4179,12 @@ def _perform_unary_op( temp = ndarray( out.shape, dtype=out_dtype, - inputs=(src, where_array), + inputs=(src,), ) temp._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where_array, out.shape), + True, extra_args, ) out._thunk.convert(temp._thunk) @@ -4196,7 +4192,7 @@ def _perform_unary_op( out._thunk.unary_op( op, src._thunk, - cls._get_where_thunk(where_array, out.shape), + True, extra_args, ) return out @@ -4236,13 +4232,6 @@ def _perform_unary_reduction( # TODO: Need to require initial to be given when the array is empty # or a where mask is given. - if where is not None and isinstance(where, ndarray): - # The where array has to broadcast to the src.shape - if np.broadcast_shapes(src.shape, where.shape) != src.shape: - raise ValueError( - '"where" array must broadcast against source array ' - "for reduction" - ) if ( op in ( From 47096389f6e0658e6ee2631aa841ffaf41ceaff3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 14:46:15 -0800 Subject: [PATCH 45/66] fixing logic for nanmin/nanmax for where --- cunumeric/module.py | 8 ++++---- tests/integration/test_nan_reduction.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index e633dcbf3..798aac5d9 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -5803,8 +5803,8 @@ def nanmin( ) if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": - where = all(isnan(a), axis=axis, keepdims=keepdims) - putmask(out_array, where, np.nan) # type: ignore + all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where) + putmask(out_array, all_nan, np.nan) # type: ignore return out_array @@ -5899,8 +5899,8 @@ def nanmax( ) if cunumeric_settings.numpy_compat() and a.dtype.kind == "f": - where = all(isnan(a), axis=axis, keepdims=keepdims) - putmask(out_array, where, np.nan) # type: ignore + all_nan = all(isnan(a), axis=axis, keepdims=keepdims, where=where) + putmask(out_array, all_nan, np.nan) # type: ignore return out_array diff --git a/tests/integration/test_nan_reduction.py b/tests/integration/test_nan_reduction.py index b764dbf93..e57a46d0b 100644 --- a/tests/integration/test_nan_reduction.py +++ b/tests/integration/test_nan_reduction.py @@ -301,7 +301,31 @@ def test_where(self): out_num = num.nanprod(arr, where=[False, True, True]) assert np.allclose(out_np, out_num) + out_np = np.nanmax( + arr, where=[[False, True, True], [False, False, True]], initial=-1 + ) + out_num = num.nanmax( + arr, where=[[False, True, True], [False, False, True]], initial=-1 + ) + assert np.allclose(out_np, out_num) + + out_np = np.nanmin( + arr, where=[[False, True, True], [False, True, True]], initial=10 + ) + out_num = num.nanmin( + arr, where=[[False, True, True], [False, True, True]], initial=10 + ) + assert np.allclose(out_np, out_num) + # where is a boolean + out_np = np.nansum(arr, where=True) + out_num = num.nansum(arr, where=True) + assert np.allclose(out_np, out_num) + + out_np = np.nanprod(arr, where=False) + out_num = num.nanprod(arr, where=False) + assert np.allclose(out_np, out_num) + out_np = np.nanmax(arr, where=True, initial=-1) out_num = num.nanmax(arr, where=True, initial=-1) assert np.allclose(out_np, out_num) From 
95bfbebbde11ee7b516bb8529bc9cedcbc3d1f96 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 20:47:54 -0800 Subject: [PATCH 46/66] addressing the rest of the comments from Manolis --- cunumeric/_ufunc/ufunc.py | 10 +++++++--- cunumeric/array.py | 1 - cunumeric/module.py | 6 +++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 01d2fc30b..3bba6b8ad 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -19,7 +19,12 @@ import numpy as np from legate.core.utils import OrderedSet -from ..array import check_writeable, convert_to_cunumeric_ndarray, ndarray +from ..array import ( + add_boilerplate, + check_writeable, + convert_to_cunumeric_ndarray, + ndarray, +) from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode from ..types import NdShape @@ -680,6 +685,7 @@ def __call__( return self._maybe_cast_output(out, result) + @add_boilerplate("array") def reduce( self, array: ndarray, @@ -742,8 +748,6 @@ def reduce( -------- numpy.ufunc.reduce """ - array = convert_to_cunumeric_ndarray(array) - if self._red_code is None: raise NotImplementedError( f"reduction for {self} is not yet implemented" diff --git a/cunumeric/array.py b/cunumeric/array.py index 934a37ee5..3eb8939c8 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3170,7 +3170,6 @@ def mean( else: return sum_array - @add_boilerplate() def _nanmean( self, axis: Optional[Union[int, tuple[int, ...]]] = None, diff --git a/cunumeric/module.py b/cunumeric/module.py index 798aac5d9..5d75476ea 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -1487,6 +1487,10 @@ def _broadcast_to( arr = array(arr, copy=False, subok=subok) # 'broadcast_to' returns a read-only view of the original array out_shape = broadcast_shapes(arr.shape, shape) + if out_shape != shape: + raise ValueError( + f"cannot broadcast an array of shape {arr.shape} to {shape}" + ) result = ndarray( shape=out_shape, thunk=arr._thunk.broadcast_to(out_shape), @@ -6978,7 +6982,7 @@ def count_nonzero( # Averages and variances -@add_boilerplate("a", "where") +@add_boilerplate("a") def mean( a: ndarray, axis: Optional[Union[int, tuple[int, ...]]] = None, From 90f27bc67a09f68cd9e9728383c7388371e774c0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 20:57:45 -0800 Subject: [PATCH 47/66] some C++ code clean-up --- src/cunumeric/unary/unary_red.cc | 10 ++++++---- src/cunumeric/unary/unary_red_omp.cc | 13 ++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index bdd1c7012..bc1717a47 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -36,11 +36,13 @@ struct UnaryRedImplBody { size_t volume) const { for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - auto identity = LG_OP::identity; - bool mask = true; + auto point = pitches.unflatten(idx, rect.lo); + bool mask = true; if constexpr (HAS_WHERE) mask = (where[point] == true); - if (mask) lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + if (mask) { + auto identity = LG_OP::identity; + lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + } } } }; diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index 5074031df..a2765ccff 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -90,14 +90,17 @@ struct 
UnaryRedImplBody { auto split = splitter.split(rect, collapsed_dim); #pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) + for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) { for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - auto identity = LG_OP::identity; - bool mask = true; + auto point = splitter.combine(o_idx, i_idx, rect.lo); + bool mask = true; if constexpr (HAS_WHERE) mask = (where[point] == true); - if (mask) lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + if (mask) { + auto identity = LG_OP::identity; + lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); + } } + } } }; From fa19be77aa6bf3e60a930b6e301973de216eec8b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 13 Nov 2023 21:49:49 -0800 Subject: [PATCH 48/66] more C++ code clean-up --- cunumeric/array.py | 1 - .../unary/scalar_unary_red_template.inl | 2 +- src/cunumeric/unary/unary_red.cu | 58 ++++--------------- 3 files changed, 12 insertions(+), 49 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 3eb8939c8..33233d284 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3713,7 +3713,6 @@ def sum( where=where, ) - @add_boilerplate() def _nansum( self, axis: Any = None, diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 11d0bbe12..bc9de8077 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -128,7 +128,7 @@ struct ScalarUnaryRed { } }; -template +template struct ScalarUnaryRedImpl { template void operator()(ScalarUnaryRedArgs& args) const diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 7127046a6..17697e73b 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -266,9 +266,10 @@ static void __device__ __forceinline__ collapse_dims(LHS& result, #endif } -template +template static __device__ __forceinline__ Point local_reduce(LHS& result, AccessorRO in, + AccessorRO where, LHS identity, const ThreadBlocks& blocks, const Rect& domain, @@ -280,33 +281,10 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, Point point = blocks.point(bid, tid, domain.lo); if (!domain.contains(point)) return point; + bool mask = true; + if constexpr (HAS_WHERE) mask = (where[point] == true); while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { - LHS value = OP::convert(point, collapsed_dim, identity, in[point]); - REDOP::template fold(result, value); - blocks.next_point(point); - } - - collapse_dims(result, point, domain, collapsed_dim, identity, tid); - return point; -} - -template -static __device__ __forceinline__ Point local_reduce_where(LHS& result, - AccessorRO in, - AccessorRO where, - LHS identity, - const ThreadBlocks& blocks, - const Rect& domain, - int32_t collapsed_dim) -{ - const coord_t tid = threadIdx.x; - const coord_t bid = blockIdx.x; - - Point point = blocks.point(bid, tid, domain.lo); - if (!domain.contains(point)) return point; - - while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { - if (where[point] == true) { + if (mask) { LHS value = OP::convert(point, collapsed_dim, identity, in[point]); REDOP::template fold(result, value); } @@ -316,33 +294,19 @@ static __device__ __forceinline__ Point local_reduce_where(LHS& result, collapse_dims(result, point, domain, collapsed_dim, identity, tid); 
return point; } -template + +template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_with_rd_acc(AccessorRD out, AccessorRO in, + AccessorRO where, LHS identity, ThreadBlocks blocks, Rect domain, int32_t collapsed_dim) { auto result = identity; - auto point = - local_reduce(result, in, identity, blocks, domain, collapsed_dim); - if (result != identity) out.reduce(point, result); -} - -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - reduce_with_rd_acc_where(AccessorRD out, - AccessorRO in, - AccessorRO where, - LHS identity, - ThreadBlocks blocks, - Rect domain, - int32_t collapsed_dim) -{ - auto result = identity; - auto point = local_reduce_where( + auto point = local_reduce( result, in, where, identity, blocks, domain, collapsed_dim); if (result != identity) out.reduce(point, result); } @@ -362,7 +326,7 @@ struct UnaryRedImplBody { int collapsed_dim, size_t volume) const { - auto Kernel = reduce_with_rd_acc; + auto Kernel = reduce_with_rd_acc; auto stream = get_cached_stream(); ThreadBlocks blocks; @@ -374,7 +338,7 @@ struct UnaryRedImplBody { lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); else Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); + lhs, rhs, AccessorRO(), LG_OP::identity, blocks, rect, collapsed_dim); CHECK_CUDA_STREAM(stream); } }; From a51135df4d728ed9cca382bf0576cba584416de5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:35:44 -0800 Subject: [PATCH 49/66] Update src/cunumeric/unary/unary_red.cu Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 17697e73b..47ae3ecc4 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -282,7 +282,7 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, if (!domain.contains(point)) return point; bool mask = true; - if constexpr (HAS_WHERE) mask = (where[point] == true); + if constexpr (HAS_WHERE) mask = where[point]; while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { if (mask) { LHS value = OP::convert(point, collapsed_dim, identity, in[point]); From e703cc99e6c80fcf51d1db62f852fee8eea07999 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:35:53 -0800 Subject: [PATCH 50/66] Update src/cunumeric/unary/unary_red.cc Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index bc1717a47..b37d1a4b2 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -38,7 +38,7 @@ struct UnaryRedImplBody { for (size_t idx = 0; idx < volume; ++idx) { auto point = pitches.unflatten(idx, rect.lo); bool mask = true; - if constexpr (HAS_WHERE) mask = (where[point] == true); + if constexpr (HAS_WHERE) mask = where[point]; if (mask) { auto identity = LG_OP::identity; lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); From 7ea110f10ac7af8a8bd1ce54c61e225b79a146ed Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:36:02 -0800 Subject: [PATCH 51/66] Update src/cunumeric/unary/unary_red_omp.cc Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red_omp.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red_omp.cc 
b/src/cunumeric/unary/unary_red_omp.cc index a2765ccff..fd9ccce60 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -94,7 +94,7 @@ struct UnaryRedImplBody { for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { auto point = splitter.combine(o_idx, i_idx, rect.lo); bool mask = true; - if constexpr (HAS_WHERE) mask = (where[point] == true); + if constexpr (HAS_WHERE) mask = where[point]; if (mask) { auto identity = LG_OP::identity; lhs.reduce(point, OP::convert(point, collapsed_dim, identity, rhs[point])); From f080a95707b5922426b13adf701cc93bb79624d5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:39:46 -0800 Subject: [PATCH 52/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index bc9de8077..9b52cd6bf 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -99,7 +99,7 @@ struct ScalarUnaryRed { { auto p = pitches.unflatten(idx, origin); bool mask = true; - if constexpr (HAS_WHERE) mask = (where[p] == true); + if constexpr (HAS_WHERE) mask = where[p]; if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { if (mask && (in[p] == to_find)) { lhs = true; } From c039cb2b6c9f7bc2b6108f380eff8f81437599c3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:39:53 -0800 Subject: [PATCH 53/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 9b52cd6bf..2eb2ed21f 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -82,7 +82,7 @@ struct ScalarUnaryRed { __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, DenseReduction) const noexcept { bool mask = true; - if constexpr (HAS_WHERE) mask = (whereptr[idx] == true); + if constexpr (HAS_WHERE) mask = whereptr[idx]; if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { if (mask && (inptr[idx] == to_find)) { lhs = true; } From 6896b4ea61ef9ca75c56059536258bb4b2afc6db Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:40:21 -0800 Subject: [PATCH 54/66] Update src/cunumeric/unary/unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red_template.inl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index eecc45ddf..87750e17c 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -92,7 +92,11 @@ static void unary_red_template(TaskContext& context) has_where ? 
inputs[1] : dummy_where, scalars[0].value(), scalars[1].value()}; - op_dispatch(args.op_code, UnaryRedDispatch{}, args); + if (has_where) { + op_dispatch(args.op_code, UnaryRedDispatch{}, args); + } else { + op_dispatch(args.op_code, UnaryRedDispatch{}, args); + } } } // namespace cunumeric From 40142397687ad4fa017527c98a8ffbf19ab700e9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:40:37 -0800 Subject: [PATCH 55/66] Update src/cunumeric/unary/scalar_unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/scalar_unary_red_template.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 2eb2ed21f..110a73e7a 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -165,7 +165,7 @@ static void scalar_unary_red_template(TaskContext& context) bool has_where = scalars[2].value(); size_t start_idx = has_where ? 2 : 1; std::vector extra_args; - extra_args.reserve(inputs.size()); + extra_args.reserve(inputs.size() - start_idx); for (size_t idx = start_idx; idx < inputs.size(); ++idx) extra_args.emplace_back(std::move(inputs[idx])); // If the RHS was a scalar, use (1,) as the shape From fe84bb15cf5ffb75ae92bf6ca29dc66e63b0fded Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 10:41:02 -0800 Subject: [PATCH 56/66] Update src/cunumeric/unary/unary_red_template.inl Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red_template.inl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 87750e17c..1b92d5a32 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -50,14 +50,12 @@ struct UnaryRedImpl { auto lhs = args.lhs.reduce_accessor(rect); + AccessorRO where; if constexpr (HAS_WHERE) { - auto where = args.where.read_accessor(rect); - UnaryRedImplBody()( - lhs, rhs, where, rect, pitches, args.collapsed_dim, volume); - } else { - UnaryRedImplBody()( - lhs, rhs, AccessorRO(), rect, pitches, args.collapsed_dim, volume); + where = args.where.read_accessor(rect); } + UnaryRedImplBody()( + lhs, rhs, where, rect, pitches, args.collapsed_dim, volume); } template Date: Fri, 17 Nov 2023 13:00:29 -0800 Subject: [PATCH 57/66] Update src/cunumeric/unary/unary_red.cu Co-authored-by: Manolis Papadakis --- src/cunumeric/unary/unary_red.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 47ae3ecc4..d6617df71 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -333,12 +333,8 @@ struct UnaryRedImplBody { blocks.initialize(rect, collapsed_dim); blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - if constexpr (HAS_WHERE) - Kernel<<>>( - lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); - else - Kernel<<>>( - lhs, rhs, AccessorRO(), LG_OP::identity, blocks, rect, collapsed_dim); + Kernel<<>>( + lhs, rhs, where, LG_OP::identity, blocks, rect, collapsed_dim); CHECK_CUDA_STREAM(stream); } }; From 9d5b364698c239428297a309632069feae38668d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 13:21:04 -0800 Subject: [PATCH 58/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 33233d284..411980d82 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3141,7 +3141,7 @@ def mean( if axis is None: if where_array is not None: - divisor = np.array(where_array._count_nonzero()) + divisor = where_array._count_nonzero() else: divisor = np.array(reduce(lambda x, y: x * y, self.shape, 1)) From 2bd62ebc9e5c520440e693236ecf0f0fc2f58353 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 13:21:17 -0800 Subject: [PATCH 59/66] Update cunumeric/array.py Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 411980d82..7a962d22b 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3147,9 +3147,7 @@ def mean( else: if where_array is not None: - divisor = np.array( - where_array.sum(axis=axis, keepdims=keepdims) - ) + divisor = where_array.sum(axis=axis, keepdims=keepdims) else: divisor = np.array(self.shape[axis]) From f504f05a95190de88cac5b5927cbf2d5d37c8ff2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 13:11:07 -0800 Subject: [PATCH 60/66] fixing bug in cuda code --- src/cunumeric/unary/unary_red.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index d6617df71..b5e0e5eb1 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -282,8 +282,8 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, if (!domain.contains(point)) return point; bool mask = true; - if constexpr (HAS_WHERE) mask = where[point]; while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { + if constexpr (HAS_WHERE) mask = where[point]; if (mask) { LHS value = OP::convert(point, collapsed_dim, identity, in[point]); REDOP::template fold(result, value); From b15f65673def63c090682e30e303d62fbbd34612 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Nov 2023 13:56:55 -0800 Subject: [PATCH 61/66] adding dtype back to sum --- cunumeric/array.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 7a962d22b..2dd01e0ab 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3127,39 +3127,52 @@ def mean( # Do the sum sum_array = ( self._nansum( - axis=axis, out=out, keepdims=keepdims, where=where_array + axis=axis, + out=out, + keepdims=keepdims, + dtype=dtype, + where=where_array, ) if out is not None and out.dtype == dtype and ignore_nan else self.sum( - axis=axis, out=out, keepdims=keepdims, where=where_array + axis=axis, + out=out, + keepdims=keepdims, + dtype=dtype, + where=where_array, ) if out is not None and out.dtype == dtype - else self._nansum(axis=axis, keepdims=keepdims, where=where_array) + else self._nansum( + axis=axis, keepdims=keepdims, dtype=dtype, where=where_array + ) if ignore_nan - else self.sum(axis=axis, keepdims=keepdims, where=where_array) + else self.sum( + axis=axis, keepdims=keepdims, dtype=dtype, where=where_array + ) ) if axis is None: if where_array is not None: divisor = where_array._count_nonzero() else: - divisor = np.array(reduce(lambda x, y: x * y, self.shape, 1)) + divisor = reduce(lambda x, y: x * y, self.shape, 1) else: if where_array is not None: - divisor = where_array.sum(axis=axis, keepdims=keepdims) + divisor = where_array.sum( + axis=axis, dtype=dtype, 
From b15f65673def63c090682e30e303d62fbbd34612 Mon Sep 17 00:00:00 2001
From: Irina Demeshko 
Date: Fri, 17 Nov 2023 13:56:55 -0800
Subject: [PATCH 61/66] adding dtype back to sum

---
 cunumeric/array.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 7a962d22b..2dd01e0ab 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3127,39 +3127,52 @@ def mean(
         # Do the sum
         sum_array = (
             self._nansum(
-                axis=axis, out=out, keepdims=keepdims, where=where_array
+                axis=axis,
+                out=out,
+                keepdims=keepdims,
+                dtype=dtype,
+                where=where_array,
             )
             if out is not None and out.dtype == dtype and ignore_nan
             else self.sum(
-                axis=axis, out=out, keepdims=keepdims, where=where_array
+                axis=axis,
+                out=out,
+                keepdims=keepdims,
+                dtype=dtype,
+                where=where_array,
             )
             if out is not None and out.dtype == dtype
-            else self._nansum(axis=axis, keepdims=keepdims, where=where_array)
+            else self._nansum(
+                axis=axis, keepdims=keepdims, dtype=dtype, where=where_array
+            )
             if ignore_nan
-            else self.sum(axis=axis, keepdims=keepdims, where=where_array)
+            else self.sum(
+                axis=axis, keepdims=keepdims, dtype=dtype, where=where_array
+            )
         )
         if axis is None:
             if where_array is not None:
                 divisor = where_array._count_nonzero()
             else:
-                divisor = np.array(reduce(lambda x, y: x * y, self.shape, 1))
+                divisor = reduce(lambda x, y: x * y, self.shape, 1)
         else:
             if where_array is not None:
-                divisor = where_array.sum(axis=axis, keepdims=keepdims)
+                divisor = where_array.sum(
+                    axis=axis, dtype=dtype, keepdims=keepdims
+                )
             else:
-                divisor = np.array(self.shape[axis])
+                divisor = self.shape[axis]

         # Divide by the number of things in the collapsed dimensions
         # Pick the right kinds of division based on the dtype
-        sum_array = sum_array.astype(dtype)
         if dtype.kind == "f" or dtype.kind == "c":
             sum_array.__itruediv__(
-                divisor.astype(dtype),
+                divisor,
             )
         else:
-            sum_array.__ifloordiv__(divisor.astype(dtype))
+            sum_array.__ifloordiv__(divisor)

         # Convert to the output we didn't already put it there
         if out is not None and sum_array is not out:
             assert out.dtype != sum_array.dtype

From 7a775350c715d12763b29f1b40b98e3ce1b01038 Mon Sep 17 00:00:00 2001
From: Irina Demeshko 
Date: Fri, 17 Nov 2023 14:20:16 -0800
Subject: [PATCH 62/66] removing ignore_nan

---
 cunumeric/array.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 2dd01e0ab..0b7ce23c0 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3094,7 +3094,6 @@ def mean(
         out: Union[ndarray, None] = None,
         keepdims: bool = False,
         where: Union[ndarray, None] = None,
-        ignore_nan: bool = False,
     ) -> ndarray:
         """a.mean(axis=None, dtype=None, out=None, keepdims=False)

@@ -3126,15 +3125,7 @@ def mean(
             where_array = broadcast_where(where, self.shape)
         # Do the sum
         sum_array = (
-            self._nansum(
-                axis=axis,
-                out=out,
-                keepdims=keepdims,
-                dtype=dtype,
-                where=where_array,
-            )
-            if out is not None and out.dtype == dtype and ignore_nan
-            else self.sum(
+            self.sum(
                 axis=axis,
                 out=out,
                 keepdims=keepdims,
                 dtype=dtype,
                 where=where_array,
             )
             if out is not None and out.dtype == dtype
-            else self._nansum(
-                axis=axis, keepdims=keepdims, dtype=dtype, where=where_array
-            )
-            if ignore_nan
             else self.sum(
                 axis=axis, keepdims=keepdims, dtype=dtype, where=where_array
             )
         )
@@ -3207,7 +3194,6 @@ def _nanmean(
             out=out,
             keepdims=keepdims,
             where=nan_mask,
-            ignore_nan=True,
         )

     @add_boilerplate()
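Patches 61 and 62 settle the divisor for `mean(where=...)`: the sum is taken with the mask applied and then divided by the number of selected elements (per output element when an axis is given), not by the axis length. A NumPy-level check of that rule (illustrative only, not part of the patch series):

    import numpy as np

    a = np.array([[1.0, 2.0], [3.0, 4.0]])
    mask = np.array([[True, False], [True, True]])

    # The divisor is the per-column count of True entries, mask.sum(axis=0),
    # mirroring where_array.sum(axis=axis, ...) in the patch above.
    expected = np.sum(a, axis=0, where=mask) / np.sum(mask, axis=0)
    assert np.allclose(np.mean(a, axis=0, where=mask), expected)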
From bdb49e986055b8e746092ec749048d6b37aefdcd Mon Sep 17 00:00:00 2001
From: Irina Demeshko 
Date: Wed, 29 Nov 2023 21:06:22 -0800
Subject: [PATCH 63/66] fixing var for the case when where is not None

---
 cunumeric/array.py              | 31 ++++++++++++++++++++++++-------
 tests/integration/test_stats.py | 20 +++++++++++++++++++-
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 2767e1a0b..6c226d747 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3119,16 +3119,25 @@ def _normalize_summation(
             divisor = reduce(lambda x, y: x * y, self.shape, 1) - ddof
         else:
             if where is not None:
-                divisor = (
-                    where.sum(axis=axis, dtype=dtype, keepdims=keepdims) - ddof
-                )
+                divisor = where.sum(
+                    axis=axis, dtype=sum_array.dtype, keepdims=keepdims
+                )
+                if ddof != 0 and not np.isscalar(divisor):
+                    mask = divisor != 0
+                    values = divisor - ddof
+                    divisor._thunk.putmask(mask._thunk, values._thunk)
+                else:
+                    divisor -= ddof
             else:
                 divisor = self.shape[axis] - ddof

         # Divide by the number of things in the collapsed dimensions
         # Pick the right kinds of division based on the dtype
-        if np.ndim(divisor) == 0:
+        if isinstance(divisor, ndarray):
+            divisor = divisor.astype(sum_array.dtype)
+        else:
             divisor = np.array(divisor, dtype=sum_array.dtype)  # type: ignore [assignment] # noqa
+
         if dtype.kind == "f" or dtype.kind == "c":
             sum_array.__itruediv__(divisor)
         else:
@@ -3267,8 +3276,9 @@ def var(
         # mean can be broadcast against the original array
         mu = self.mean(axis=axis, dtype=dtype, keepdims=True, where=where)

+        where_array = broadcast_where(where, self.shape)
+
         # 1D arrays (or equivalent) should benefit from this unary reduction:
-        # if axis is None or calculate_volume(tuple_pop(self.shape, axis)) == 1:
         # this is a scalar reduction and we can optimize this as a single
         # pass through a scalar reduction
             result = self._perform_unary_reduction(
                 UnaryRedCode.VARIANCE,
                 self,
                 axis=axis,
                 dtype=dtype,
                 out=out,
                 keepdims=keepdims,
-                where=where,
+                where=where_array,
                 args=(mu,),
             )
         else:
@@ -3300,10 +3310,17 @@ def var(
                 dtype=dtype,
                 out=out,
                 keepdims=keepdims,
-                where=where,
+                where=where_array,
             )

-        self._normalize_summation(result, axis=axis, dtype=dtype, ddof=ddof)
+        self._normalize_summation(
+            result,
+            axis=axis,
+            dtype=result.dtype,
+            ddof=ddof,
+            keepdims=keepdims,
+            where=where_array,
+        )

         return result

diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py
index dfa1b0fa3..256a6b77f 100644
--- a/tests/integration/test_stats.py
+++ b/tests/integration/test_stats.py
@@ -39,7 +39,7 @@ def check_result(in_np, out_np, out_num, **isclose_kwargs):
     is_negative_test = False

     result = (
-        allclose(out_np, out_num, **isclose_kwargs)
+        allclose(out_np, out_num, equal_nan=True, **isclose_kwargs)
         and out_np.dtype == out_num.dtype
     )
     if not result and not is_negative_test:
@@ -131,6 +131,24 @@ def test_var_default_shape(dtype, ddof, axis, keepdims):
     check_op(op_np, op_num, np_in, dtype)


+@pytest.mark.parametrize("dtype", dtypes)
+@pytest.mark.parametrize("ddof", [0, 1])
+@pytest.mark.parametrize("axis", [None, 0, 1])
+@pytest.mark.parametrize("keepdims", [False, True])
+def test_var_where(dtype, ddof, axis, keepdims):
+    np_in = get_op_input(astype=dtype)
+    where = (np_in.astype(int) % 2).astype(bool)
+
+    op_np = functools.partial(
+        np.var, ddof=ddof, axis=axis, keepdims=keepdims, where=where
+    )
+    op_num = functools.partial(
+        num.var, ddof=ddof, axis=axis, keepdims=keepdims, where=where
+    )
+
+    check_op(op_np, op_num, np_in, dtype)
+
+
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("ddof", [0, 1])
 @pytest.mark.parametrize("axis", [None, 0, 1, 2])

From 4ef0749d6674e08d4f88f3d0e9c3076e7b8a1b45 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 30 Nov 2023 05:14:52 +0000
Subject: [PATCH 64/66] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/cunumeric/unary/unary_red_template.inl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl
index 1b92d5a32..aa038384f 100644
--- a/src/cunumeric/unary/unary_red_template.inl
+++ b/src/cunumeric/unary/unary_red_template.inl
@@ -51,11 +51,9 @@ struct UnaryRedImpl {
     auto lhs = args.lhs.reduce_accessor(rect);
     AccessorRO<bool, DIM> where;
-    if constexpr (HAS_WHERE) {
-      where = args.where.read_accessor<bool, DIM>(rect);
-    }
+    if constexpr (HAS_WHERE) { where = args.where.read_accessor<bool, DIM>(rect); }
     UnaryRedImplBody<KIND, OP_CODE, CODE, DIM, HAS_WHERE>()(
-      lhs, rhs, where, rect, pitches, args.collapsed_dim, volume);
+      lhs, rhs, where, rect, pitches, args.collapsed_dim, volume);
   }

   template 

Date: Thu, 30 Nov 2023 09:38:21 -0800
Subject: [PATCH 65/66] fixing eager logic for var

---
 cunumeric/eager.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cunumeric/eager.py b/cunumeric/eager.py
index 03929395d..7c58023ef 100644
--- a/cunumeric/eager.py
+++ b/cunumeric/eager.py
@@ -1532,7 +1532,9 @@ def unary_reduction(
                 squared,
                 out=self.array,
                 axis=orig_axis,
-                where=where,
+                where=where
+                if not isinstance(where, EagerArray)
+                else where.array,
                 keepdims=keepdims,
             )
         elif op == UnaryRedCode.VARIANCE:
@@ -1542,7 +1544,9 @@ def unary_reduction(
             np.sum(
                 squares,
                 axis=orig_axis,
-                where=where,
+                where=where
+                if not isinstance(where, EagerArray)
+                else where.array,
                 keepdims=keepdims,
                 out=self.array,
             )
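Patch 63 encodes the `ddof` rule for masked variance (divide by the count of selected elements minus `ddof`, guarding zero counts), and patch 65 applies the same mask unwrapping on the eager path. A NumPy-level statement of the rule being matched (illustrative only, not part of the patch series):

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.normal(size=(4, 5))
    mask = a > 0

    n = mask.sum()  # number of selected elements
    mu = a.mean(where=mask)
    # Divisor is n - ddof, computed over the mask, not over a.size.
    expected = np.sum((a - mu) ** 2, where=mask) / (n - 1)
    assert np.allclose(np.var(a, ddof=1, where=mask), expected)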
From 869b1762aa1dea0f9e4e9e70f48c4cdcd5d4d914 Mon Sep 17 00:00:00 2001
From: Irina Demeshko 
Date: Fri, 1 Dec 2023 18:54:50 -0800
Subject: [PATCH 66/66] removing dtype from _normalize_summation

---
 cunumeric/array.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 3da80916c..8bfc5178a 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3094,11 +3094,11 @@ def _normalize_summation(
         self,
         sum_array: Any,
         axis: Any,
-        dtype: np.dtype[Any],
         ddof: int = 0,
         keepdims: bool = False,
         where: Union[ndarray, None] = None,
     ) -> None:
+        dtype = sum_array.dtype
         if axis is None:
             if where is not None:
                 divisor = where._count_nonzero() - ddof
@@ -3106,9 +3106,7 @@ def _normalize_summation(
                 divisor = reduce(lambda x, y: x * y, self.shape, 1) - ddof
         else:
             if where is not None:
-                divisor = where.sum(
-                    axis=axis, dtype=sum_array.dtype, keepdims=keepdims
-                )
+                divisor = where.sum(axis=axis, dtype=dtype, keepdims=keepdims)
                 if ddof != 0 and not np.isscalar(divisor):
                     mask = divisor != 0
                     values = divisor - ddof
@@ -3121,9 +3119,9 @@ def _normalize_summation(
         # Divide by the number of things in the collapsed dimensions
         # Pick the right kinds of division based on the dtype
         if isinstance(divisor, ndarray):
-            divisor = divisor.astype(sum_array.dtype)
+            divisor = divisor.astype(dtype)
         else:
-            divisor = np.array(divisor, dtype=sum_array.dtype)  # type: ignore [assignment] # noqa
+            divisor = np.array(divisor, dtype=dtype)  # type: ignore [assignment] # noqa

         if dtype.kind == "f" or dtype.kind == "c":
             sum_array.__itruediv__(divisor)
         else:
@@ -3179,7 +3177,7 @@ def mean(
         )

         self._normalize_summation(
-            sum_array, axis, dtype, keepdims=keepdims, where=where_array
+            sum_array, axis, keepdims=keepdims, where=where_array
         )

         # Convert to the output we didn't already put it there
@@ -3303,7 +3301,6 @@ def var(
         self._normalize_summation(
             result,
             axis=axis,
-            dtype=result.dtype,
             ddof=ddof,
             keepdims=keepdims,
             where=where_array,
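Taken together, the series gives cunumeric reductions a NumPy-compatible boolean `where=` mask, with the divisor rules above applied in `mean` and `var`. A usage sketch (assumes a cunumeric build with this series applied; semantics mirror the NumPy calls shown earlier):

    import cunumeric as num

    a = num.array([[1.0, 2.0], [3.0, 4.0]])
    mask = num.array([[True, False], [True, True]])

    num.sum(a, where=mask)          # 8.0: the masked-off 2.0 is skipped
    a.mean(axis=0, where=mask)      # divisor comes from the mask counts
    num.var(a, ddof=1, where=mask)  # ddof applied to the mask count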