diff --git a/.github/actions/download-artifacts/action.yml b/.github/actions/download-artifacts/action.yml
index e8019b1b1..640dc143a 100644
--- a/.github/actions/download-artifacts/action.yml
+++ b/.github/actions/download-artifacts/action.yml
@@ -25,7 +25,7 @@ runs:
       repo: nv-legate/legate.core
       commit: ${{ inputs.git_sha }}
       workflow_conclusion: success
-      workflow: "ci-gh-${{ inputs.device }}-build-and-test.yml"
+      workflow: "ci-gh.yml"
       name: "legate.core-${{ inputs.device }}-[0-9a-z]{40}"
       name_is_regexp: true
diff --git a/README.md b/README.md
index 7516331ff..cec00b052 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,14 @@ If you have questions, please contact us at legate(at)nvidia.com.
 
 ## Installation
 
-cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric):
+cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric).
+Create a new environment containing cuNumeric:
+
+```
+mamba create -n myenv -c nvidia -c conda-forge -c legate cunumeric
+```
+
+or install it into an existing environment:
 
 ```
 mamba install -c nvidia -c conda-forge -c legate cunumeric
@@ -48,7 +55,12 @@ Only linux-64 packages are available at the moment.
 The default package contains GPU support, and is compatible with CUDA >= 11.8
 (CUDA driver version >= r520), and Volta or later GPU architectures. There are
 also CPU-only packages available, and will be automatically selected when
-installing on a machine without GPUs.
+installing on a machine without GPUs. You can force installation of a CPU-only
+package by requesting it as follows:
+
+```
+mamba ... cunumeric=*=*_cpu
+```
 
 See the build instructions at https://nv-legate.github.io/cunumeric for details
 about building cuNumeric from source.
diff --git a/cmake/versions.json b/cmake/versions.json
index 43d60fa5e..9441e93d8 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -5,7 +5,7 @@
       "git_url" : "https://github.com/nv-legate/legate.core.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "6fa0acc9dcfa89be2702f1de6c045bc262f752b1"
+      "git_tag" : "c05ebdab30dee6ac2d6e2808fb835fad0302822d"
     }
   }
 }
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index c652d931b..85ea88d74 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -116,11 +116,12 @@ requirements:
     - cuda-nvcc ={{ cuda_version }}
     - cuda-cccl ={{ cuda_version }}
     - cuda-cudart ={{ cuda_version }}
+    - cuda-cudart-static ={{ cuda_version }}
     - cuda-driver-dev ={{ cuda_version }}
     - cuda-cudart-dev ={{ cuda_version }}
     - cuda-nvtx ={{ cuda_version }}
     # - libcutensor-dev >=1.3
-    - cutensor >=1.3 =*_*
+    - cutensor >=1.3,<2.0 =*_*
     - libcublas-dev
     - libcusolver-dev
     - libcufft-dev
@@ -140,6 +141,8 @@ requirements:
     - libcublas
     - libcusolver >=11.4.1.48-0
     - libcufft
+    - libnvjitlink
+    - libcusparse
 {% endif %}
     - opt_einsum >=3.3
     - scipy
diff --git a/cunumeric/array.py b/cunumeric/array.py
index 6c226d747..3da80916c 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -2532,10 +2532,9 @@ def _diag_helper(
 
     def diagonal(
         self,
         offset: int = 0,
-        axis1: Any = None,
-        axis2: Any = None,
+        axis1: int = 0,
+        axis2: int = 1,
         extract: bool = True,
-        axes: Any = None,
     ) -> ndarray:
         """a.diagonal(offset=0, axis1=None, axis2=None)
@@ -2557,19 +2556,7 @@
                 raise ValueError("extract can be true only for Ndim >=2")
             axes = None
         else:
-            if isinstance(axis1, int) and isinstance(axis2, int):
-                if axes is not None:
-                    raise ValueError(
-                        "Either axis1/axis2 or axes must be supplied"
-                    )
-                axes = (axis1, axis2)
-            # default values for axes
-            elif (axis1 is None) and (axis2 is None) and (axes is None):
-                axes = (0, 1)
-            elif (axes is not None) and (
-                (axis1 is not None) or (axis2 is not None)
-            ):
-                raise ValueError("Either axis1/axis2 or axes must be supplied")
+            axes = (axis1, axis2)
         return self._diag_helper(offset=offset, axes=axes, extract=extract)
 
     @add_boilerplate("indices", "values")
diff --git a/cunumeric/module.py b/cunumeric/module.py
index d37ea4183..45d7508b1 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -3806,10 +3806,9 @@ def compress(
 def diagonal(
     a: ndarray,
     offset: int = 0,
-    axis1: Optional[int] = None,
-    axis2: Optional[int] = None,
+    axis1: int = 0,
+    axis2: int = 1,
     extract: bool = True,
-    axes: Optional[tuple[int, int]] = None,
 ) -> ndarray:
     """
     diagonal(a: ndarray, offset=0, axis1=None, axis2=None)
@@ -3868,9 +3867,7 @@
     --------
     Multiple GPUs, Multiple CPUs
     """
-    return a.diagonal(
-        offset=offset, axis1=axis1, axis2=axis2, extract=extract, axes=axes
-    )
+    return a.diagonal(offset=offset, axis1=axis1, axis2=axis2, extract=extract)
 
 
 @add_boilerplate("a", "indices", "values")
diff --git a/src/cunumeric/matrix/batched_cholesky_template.inl b/src/cunumeric/matrix/batched_cholesky_template.inl
index 8d266e3f0..d27f25e7a 100644
--- a/src/cunumeric/matrix/batched_cholesky_template.inl
+++ b/src/cunumeric/matrix/batched_cholesky_template.inl
@@ -57,7 +57,7 @@ struct _cholesky_supported {
 
 template <VariantKind KIND>
 struct BatchedCholeskyImpl {
-  template <Type::Code CODE, int32_t DIM>
+  template <Type::Code CODE, int32_t DIM, std::enable_if_t<(DIM > 2)>* = nullptr>
   void operator()(Array& input_array, Array& output_array) const
   {
     using VAL = legate_type_of<CODE>;
@@ -94,8 +94,8 @@
 
     if (shape.empty()) return;
 
-    int num_blocks = 1;
-    for (int i = 0; i < (DIM - 2); ++i) { num_blocks *= (shape.hi[i] - shape.lo[i] + 1); }
+    int32_t num_blocks = 1;
+    for (int32_t i = 0; i < (DIM - 2); ++i) { num_blocks *= (shape.hi[i] - shape.lo[i] + 1); }
 
     auto m = static_cast<int64_t>(shape.hi[DIM - 2] - shape.lo[DIM - 2] + 1);
     auto n = static_cast<int64_t>(shape.hi[DIM - 1] - shape.lo[DIM - 1] + 1);
@@ -103,7 +103,7 @@
 
     auto block_stride = m * n;
 
-    for (int i = 0; i < num_blocks; ++i) {
+    for (int32_t i = 0; i < num_blocks; ++i) {
       if constexpr (_cholesky_supported<CODE>::value) {
         CopyBlockImpl<KIND>()(output, input, sizeof(VAL) * block_stride);
         PotrfImplBody<KIND, CODE>()(output, m, n);
@@ -119,6 +119,12 @@
       }
     }
   }
+
+  template <Type::Code CODE, int32_t DIM, std::enable_if_t<(DIM <= 2)>* = nullptr>
+  void operator()(Array& input_array, Array& output_array) const
+  {
+    assert(false);
+  }
 };
 
 template <VariantKind KIND>
diff --git a/tests/integration/test_convolve.py b/tests/integration/test_convolve.py
index 7a318668d..997a9d6f7 100644
--- a/tests/integration/test_convolve.py
+++ b/tests/integration/test_convolve.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 #
 
+import os
+
 import numpy as np
 import pytest
 import scipy.signal as sig
@@ -20,8 +22,57 @@
 
 import cunumeric as num
 
-SHAPES = [(100,), (10, 10), (10, 10, 10)]
-FILTER_SHAPES = [(5,), (3, 5), (3, 5, 3)]
+CUDA_TEST = os.environ.get("LEGATE_NEED_CUDA") == "1"
+
+SHAPES = [(100,), (10, 10), (10, 10, 10), (32, 2, 32)]
+FILTER_SHAPES = [(5,), (3, 5), (3, 5, 3), (32, 1, 32)]
+
+LARGE_SHAPES = [
+    pytest.param(
+        (128, 2, 1024),
+        (64, 2, 512),
+        marks=pytest.mark.xfail(
+            not CUDA_TEST, run=False, reason="test hangs on CPU variants"
+        ),
+    ),
+    pytest.param(
+        (1024, 2, 4096),
+        (128, 16, 64),
+        marks=pytest.mark.xfail(
+            not CUDA_TEST, run=False, reason="test hangs on CPU variants"
+        ),
+    ),
+    pytest.param(
+        (1024, 2, 1024),
+        (5, 1, 5),
+        marks=pytest.mark.xfail(
+            CUDA_TEST, run=False, reason="GPU variant hits SIGABRT"
+        ),
+    ),
+    pytest.param(
+        (1024, 2, 1024),
+        (128, 1, 128),
+        marks=pytest.mark.xfail(
+            run=False, reason="GPU variant hits SIGFPE, CPU hangs"
+        ),
+    ),
+]
+
+DTYPES = [
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+    np.float16,
+    np.float32,
+    np.float64,
+    np.complex64,
+    np.complex128,
+]
 
 
 @pytest.mark.xfail
@@ -68,7 +119,9 @@ def check_convolve(a, v):
 
 
 @pytest.mark.parametrize(
-    "shape, filter_shape", zip(SHAPES, FILTER_SHAPES), ids=str
+    "shape, filter_shape",
+    list(zip(SHAPES, FILTER_SHAPES)) + LARGE_SHAPES,
+    ids=str,
 )
 def test_double(shape, filter_shape):
     a = num.random.rand(*shape)
@@ -79,7 +132,9 @@ def test_double(shape, filter_shape):
 
 
 @pytest.mark.parametrize(
-    "shape, filter_shape", zip(SHAPES, FILTER_SHAPES), ids=str
+    "shape, filter_shape",
+    list(zip(SHAPES, FILTER_SHAPES)) + LARGE_SHAPES,
+    ids=str,
 )
 def test_int(shape, filter_shape):
     a = num.random.randint(0, 5, shape)
@@ -88,10 +143,11 @@ def test_int(shape, filter_shape):
     check_convolve(a, v)
 
 
-def test_dtype():
+@pytest.mark.parametrize("dtype", DTYPES, ids=str)
+def test_dtype(dtype):
     shape = (5,) * 2
     arr1 = num.random.randint(0, 5, shape, dtype=np.int64)
-    arr2 = num.random.random(shape)
+    arr2 = num.random.random(shape).astype(dtype)
     out_num = num.convolve(arr1, arr2, mode="same")
     out_np = np.convolve(arr1, arr2, mode="same")
     assert allclose(out_num, out_np)
diff --git a/tests/integration/test_index_routines.py b/tests/integration/test_index_routines.py
index c86f62e97..4488b90b5 100644
--- a/tests/integration/test_index_routines.py
+++ b/tests/integration/test_index_routines.py
@@ -298,8 +298,8 @@ def test_diagonal():
         num_array = mk_seq_array(num, a_shape)
         for num_axes in range(3, ndim + 1):
             for axes in permutations(range(ndim), num_axes):
-                res_num = num.diagonal(
-                    num_array, offset=0, extract=True, axes=axes
+                res_num = num_array._diag_helper(
+                    offset=0, extract=True, axes=axes
                 )
                 res_ref = diagonal_reference(np_array, axes)
                 assert np.array_equal(res_num, res_ref)
@@ -424,33 +424,18 @@ def test_axes_none(self):
         with pytest.raises(TypeError):
             num.diagonal(self.a, 0, None, 0)
 
-    @pytest.mark.diff
-    def test_scalar_axes(self):
-        # NumPy does not have axes arg
-        with pytest.raises(ValueError):
-            num.diagonal(self.a, axes=(0,))
-
-    @pytest.mark.diff
-    def test_duplicate_axes(self):
-        # NumPy does not have axes arg
-        expected_exc = ValueError
-        with pytest.raises(expected_exc):
-            num.diagonal(self.a, axis1=1, axes=(0, 1))
-        with pytest.raises(expected_exc):
-            num.diagonal(self.a, axis1=1, axis2=0, axes=(0, 1))
-
     @pytest.mark.diff
     def test_extra_axes(self):
         # NumPy does not have axes arg
         axes = num.arange(self.a.ndim + 1, dtype=int)
         with pytest.raises(ValueError):
-            num.diagonal(self.a, axes=axes)
+            self.a._diag_helper(axes=axes)
 
     @pytest.mark.diff
     def test_n_axes_offset(self):
         # NumPy does not have axes arg
         with pytest.raises(ValueError):
-            num.diagonal(self.a, offset=1, axes=(2, 1, 0))
+            self.a._diag_helper(offset=1, axes=(2, 1, 0))
 
 
 @pytest.mark.parametrize(
     "k",
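
A usage note on the `diagonal` change above: the public signature now mirrors NumPy (`axis1`/`axis2` are plain ints defaulting to 0 and 1), and the cuNumeric-specific multi-axis form is reachable only through the private `_diag_helper`. A minimal sketch of the new call shapes, assuming a build that includes this branch (the array values here are illustrative, and `_diag_helper` is a private helper whose behavior may change):

```
import numpy as np
import cunumeric as num

a_np = np.arange(24).reshape(2, 3, 4)
a = num.array(a_np)

# Public API, NumPy-compatible: axis1/axis2 are ints defaulting to 0 and 1.
assert np.array_equal(
    num.diagonal(a, offset=1, axis1=0, axis2=2),
    np.diagonal(a_np, offset=1, axis1=0, axis2=2),
)

# The generalized diagonal over more than two axes (no NumPy equivalent)
# now lives only on the private helper; offset must be 0 in this form.
d = a._diag_helper(offset=0, extract=True, axes=(0, 1, 2))
```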