Merge branch 'branch-24.03' into numba-soa-fix-typing

nv-legate · May 1, 2024 · 966111a · 966111a
2 parents 5bbe4d6 + 8de3a95
commit 966111a
Show file tree

Hide file tree

Showing 31 changed files with 2,043 additions and 252 deletions.
diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml
@@ -27,9 +27,9 @@ jobs:
       options: -u root
       image: "${{ inputs.image }}"
       env:
-        CUDA_VERSION: "12.0"
+        CUDA_VERSION: "12.2"
         CUDA_VERSION_MAJOR: "12"
-        CUDA_VERSION_MINOR: "0"
+        CUDA_VERSION_MINOR: "2"
         SCCACHE_REGION: "us-east-2"
         SCCACHE_BUCKET: "rapids-sccache-devs"
         SCCACHE_S3_KEY_PREFIX: "legate-cunumeric-dev"

diff --git a/README.md b/README.md
@@ -37,33 +37,38 @@ If you have questions, please contact us at legate(at)nvidia.com.
 
 ## Installation
 
-cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric).
-Create a new environment containing cuNumeric:
+cuNumeric is available from [conda](https://docs.conda.io/projects/conda/en/latest/index.html)
+on the [legate channel](https://anaconda.org/legate/cunumeric).
+Please make sure you have at least conda version 24.1 installed, then create
+a new environment containing cuNumeric:
 
 ```
-mamba create -n myenv -c nvidia -c conda-forge -c legate cunumeric
+conda create -n myenv -c nvidia -c conda-forge -c legate cunumeric
 ```
 
 or install it into an existing environment:
 
 ```
-mamba install -c nvidia -c conda-forge -c legate cunumeric
+conda install -c nvidia -c conda-forge -c legate cunumeric
 ```
 
-Only linux-64 packages are available at the moment.
-
-The default package contains GPU support, and is compatible with CUDA >= 11.8
-(CUDA driver version >= r520), and Volta or later GPU architectures. There are
-also CPU-only packages available, and will be automatically selected when
-installing on a machine without GPUs. You can force installation of a CPU-only
-package by requesting it as follows:
+Once installed, you can verify the installation by running one of the examples
+from the cuNumeric repository, for instance:
 
 ```
-mamba ... cunumeric=*=*_cpu
+$ legate examples/black_scholes.py
+Running black scholes on 10K options...
+Elapsed Time: 129.017 ms
 ```
 
-See the build instructions at https://nv-legate.github.io/cunumeric for details
-about building cuNumeric from source.
+Only linux-64 packages are available at the moment.
+
+The default package contains GPU support, and is compatible with CUDA >= 11.8
+(driver >= 520), and Volta or later GPU architectures. There are also CPU-only
+packages available, which will be automatically selected when installing on a
+machine without GPUs. See https://nv-legate.github.io/cunumeric for
+details about manually forcing different install configurations, or building
+cuNumeric from source.
 
 ## Usage and Execution
 

diff --git a/cmake/versions.json b/cmake/versions.json
@@ -5,7 +5,7 @@
       "git_url" : "https://github.com/nv-legate/legate.core.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "08da13fc544f3db26bf1ef7ce9bdb85e72a9d9fb"
+      "git_tag" : "0f509a007f36609d2b0bd536d8e5c54f2391b444"
     }
   }
 }
diff --git a/continuous_integration/scripts/test-cunumeric b/continuous_integration/scripts/test-cunumeric
@@ -5,7 +5,7 @@ setup_env() {
 }
 
 setup_test_env() {
-    mamba install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-lazy-fixture pytest-mock pytest types-docutils pynvml
+    mamba install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-lazy-fixture pytest-mock "pytest<8" types-docutils pynvml
 
     pip install tifffile
 }
@@ -58,4 +58,4 @@ test-cunumeric() {
     esac
 }
 
-(test-cunumeric "$@");
+(test-cunumeric "$@");
diff --git a/cunumeric/_ufunc/comparison.py b/cunumeric/_ufunc/comparison.py
@@ -73,13 +73,15 @@
     "logical_and",
     BinaryOpCode.LOGICAL_AND,
     relation_types_of(all_dtypes),
+    red_code=UnaryRedCode.ALL,
 )
 
 logical_or = create_binary_ufunc(
     "Compute the truth value of x1 OR x2 element-wise.",
     "logical_or",
     BinaryOpCode.LOGICAL_OR,
     relation_types_of(all_dtypes),
+    red_code=UnaryRedCode.ANY,
 )
 
 logical_xor = create_binary_ufunc(

diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py
@@ -753,6 +753,16 @@ def reduce(
                 f"reduction for {self} is not yet implemented"
             )
 
+        if self._op_code in [
+            BinaryOpCode.LOGICAL_AND,
+            BinaryOpCode.LOGICAL_OR,
+        ]:
+            res_dtype = bool
+            if dtype is not None:
+                raise TypeError("Cannot set dtype on a logical reduction")
+        else:
+            res_dtype = None
+
         # NumPy seems to be using None as the default axis value for scalars
         if array.ndim == 0 and axis == 0:
             axis = None
@@ -767,6 +777,7 @@ def reduce(
             keepdims=keepdims,
             initial=initial,
             where=where,
+            res_dtype=res_dtype,
         )
 
 

diff --git a/cunumeric/array.py b/cunumeric/array.py
@@ -2257,6 +2257,20 @@ def clip(
         Multiple GPUs, Multiple CPUs
 
         """
+        min = (
+            min
+            if min is not None
+            else np.iinfo(self.dtype).min
+            if self.dtype.kind == "i"
+            else -np.inf
+        )
+        max = (
+            max
+            if max is not None
+            else np.iinfo(self.dtype).max
+            if self.dtype.kind == "i"
+            else np.inf
+        )
         args = (
             np.array(min, dtype=self.dtype),
             np.array(max, dtype=self.dtype),

diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
@@ -296,6 +296,7 @@ def _copy_if_overlapping(self, other: DeferredArray) -> DeferredArray:
             self.runtime.create_empty_thunk(
                 self.shape,
                 self.base.type,
+                inputs=[self],
             ),
         )
         copy.copy(self, deep=True)
@@ -1098,22 +1099,20 @@ def set_item(self, key: Any, rhs: Any) -> None:
                 # to set the result back. In cuNumeric, the object we
                 # return in step (1) is actually a subview to the array arr
                 # through which we make updates in place, so after step (2) is
-                # done, # the effect of inplace update is already reflected
+                # done, the effect of inplace update is already reflected
                 # to the arr. Therefore, we skip the copy to avoid redundant
                 # copies if we know that we hit such a scenario.
                 # TODO: We should make this work for the advanced indexing case
-                if view.base == rhs.base:
+                # NOTE: Neither Store nor Storage has an __eq__, so we can
+                # only check that the underlying RegionField/Future corresponds
+                # to the same Legion handle.
+                if (
+                    view.base.has_storage
+                    and rhs.base.has_storage
+                    and view.base.storage.same_handle(rhs.base.storage)
+                ):
                     return
 
-                if view.base.overlaps(rhs.base):
-                    rhs_copy = self.runtime.create_empty_thunk(
-                        rhs.shape,
-                        rhs.base.type,
-                        inputs=[rhs],
-                    )
-                    rhs_copy.copy(rhs, deep=False)
-                    rhs = rhs_copy
-
                 view.copy(rhs, deep=False)
 
     def broadcast_to(self, shape: NdShape) -> NumPyThunk:
@@ -1870,6 +1869,9 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
 
         assert indices.size == values.size
 
+        # Handle store overlap
+        values = values._copy_if_overlapping(self_tmp)
+
         # first, we create an indirect array with PointN type that has
         # (indices.size,) shape and is used to copy data from values
         # to the target ND array (self)
@@ -1910,11 +1912,12 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
     @auto_convert("mask", "values")
     def putmask(self, mask: Any, values: Any) -> None:
         assert self.shape == mask.shape
-
+        values = values._copy_if_overlapping(self)
         if values.shape != self.shape:
             values_new = values._broadcast(self.shape)
         else:
             values_new = values.base
+
         task = self.context.create_auto_task(CuNumericOpCode.PUTMASK)
         task.add_input(self.base)
         task.add_input(mask.base)
@@ -3142,6 +3145,7 @@ def unary_op(
         multiout: Optional[Any] = None,
     ) -> None:
         lhs = self.base
+        src = src._copy_if_overlapping(self)
         rhs = src._broadcast(lhs.shape)
 
         with Annotation({"OpCode": op.name}):
@@ -3304,7 +3308,9 @@ def binary_op(
         args: Any,
     ) -> None:
         lhs = self.base
+        src1 = src1._copy_if_overlapping(self)
         rhs1 = src1._broadcast(lhs.shape)
+        src2 = src2._copy_if_overlapping(self)
         rhs2 = src2._broadcast(lhs.shape)
 
         with Annotation({"OpCode": op_code.name}):

diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py
@@ -40,7 +40,6 @@ def transpose_copy_single(
     task.add_input(input)
     # Output has the same shape as input, but is mapped
     # to a column major instance
-    task.add_scalar_arg(False, ty.bool_)
 
     task.add_broadcast(output)
     task.add_broadcast(input)
@@ -62,7 +61,6 @@ def transpose_copy(
     task.add_input(p_input)
     # Output has the same shape as input, but is mapped
     # to a column major instance
-    task.add_scalar_arg(False, ty.bool_)
 
     task.execute()