From 9eaca9b9540c37b9892ba64a41faf14767197f82 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 26 Jan 2024 12:01:52 -0800 Subject: [PATCH 1/4] add np.digitize --- cunumeric/module.py | 118 +++++++++++++++++++++++++ tests/integration/test_digitize.py | 134 +++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 tests/integration/test_digitize.py diff --git a/cunumeric/module.py b/cunumeric/module.py index 0a8132a81..24a269a7b 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -8282,3 +8282,121 @@ def histogram( hist /= bins_array[1:] - bins_array[:-1] return hist.astype(result_type), bins_array.astype(bins_orig_type) + + +@add_boilerplate("x", "bins") +def digitize( + x: Union[ndarray, npt.ArrayLike], + bins: Union[ndarray, npt.ArrayLike], + right: bool = False, +) -> Union[int, ndarray]: + """ + Return the indices of the bins to which each value in input array belongs. + + ========= ============= ============================ + `right` order of bins returned index `i` satisfies + ========= ============= ============================ + ``False`` increasing ``bins[i-1] <= x < bins[i]`` + ``True`` increasing ``bins[i-1] < x <= bins[i]`` + ``False`` decreasing ``bins[i-1] > x >= bins[i]`` + ``True`` decreasing ``bins[i-1] >= x > bins[i]`` + ========= ============= ============================ + + If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is + returned as appropriate. + + Parameters + ---------- + x : array_like + Input array to be binned. Prior to NumPy 1.10.0, this array had to + be 1-dimensional, but can now have any shape. + bins : array_like + Array of bins. It has to be 1-dimensional and monotonic. + right : bool, optional + Indicating whether the intervals include the right or the left bin + edge. Default behavior is (right==False) indicating that the interval + does not include the right edge. The left bin end is open in this + case, i.e., bins[i-1] <= x < bins[i] is the default behavior for + monotonically increasing bins. + + Returns + ------- + indices : ndarray of ints + Output array of indices, of same shape as `x`. + + Raises + ------ + ValueError + If `bins` is not monotonic. + TypeError + If the type of the input is complex. + + See Also + -------- + bincount, histogram, unique, searchsorted + + Notes + ----- + If values in `x` are such that they fall outside the bin range, + attempting to index `bins` with the indices that `digitize` returns + will result in an IndexError. + + `numpy.digitize` is implemented in terms of `numpy.searchsorted`. + This means that a binary search is used to bin the values, which scales + much better for larger number of bins than the previous linear search. + It also removes the requirement for the input array to be 1-dimensional. + + For monotonically *increasing* `bins`, the following are equivalent:: + + np.digitize(x, bins, right=True) + np.searchsorted(bins, x, side='left') + + Note that as the order of the arguments are reversed, the side must be too. + The `searchsorted` call is marginally faster, as it does not do any + monotonicity checks. Perhaps more importantly, it supports all dtypes. + + Examples + -------- + >>> x = np.array([0.2, 6.4, 3.0, 1.6]) + >>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0]) + >>> inds = np.digitize(x, bins) + >>> inds + array([1, 4, 3, 2]) + >>> for n in range(x.size): + ... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]]) + ... + 0.0 <= 0.2 < 1.0 + 4.0 <= 6.4 < 10.0 + 2.5 <= 3.0 < 4.0 + 1.0 <= 1.6 < 2.5 + + >>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.]) + >>> bins = np.array([0, 5, 10, 15, 20]) + >>> np.digitize(x,bins,right=True) + array([1, 2, 3, 4, 4]) + >>> np.digitize(x,bins,right=False) + array([1, 3, 3, 4, 5]) + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + x = asarray(x) + bins = asarray(bins) + + # here for compatibility, searchsorted below is happy to take this + if np.issubdtype(x.dtype, np.complexfloating): + raise TypeError("x may not be complex") + + increasing = np.all(bins[1:] >= bins[:-1]) + decreasing = np.all(bins[1:] <= bins[:-1]) + if not increasing and not decreasing: + raise ValueError("bins must be monotonically increasing or decreasing") + + # this is backwards because the arguments below are swapped + side: SortSide = "left" if right else "right" + if decreasing: + # reverse the bins, and invert the results + return len(bins) - searchsorted(bins[::-1], x, side=side) + else: + return searchsorted(bins, x, side=side) diff --git a/tests/integration/test_digitize.py b/tests/integration/test_digitize.py new file mode 100644 index 000000000..5d1808ae2 --- /dev/null +++ b/tests/integration/test_digitize.py @@ -0,0 +1,134 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import math + +import numpy as np +import pytest + +import cunumeric as num + +DTYPES = ( + np.uint32, + np.uint64, + np.float32, + np.float64, +) + +SHAPES = ( + (10,), + (2, 5), + (3, 7, 10), +) + + +class TestDigitizeErrors(object): + def test_complex_array(self): + a = np.array([2, 3, 10, 9], dtype=np.complex64) + bins = [0, 3, 5] + expected_exc = TypeError + with pytest.raises(expected_exc): + num.digitize(a, bins) + with pytest.raises(expected_exc): + np.digitize(a, bins) + + @pytest.mark.xfail + def test_bad_array(self): + bins = [0, 5, 3] + expected_exc = ValueError + with pytest.raises(expected_exc): + # cunumeric raises TypeError + num.digitize(None, bins) + with pytest.raises(expected_exc): + np.digitize(None, bins) + + @pytest.mark.xfail + def test_bad_bins(self): + a = [2, 3, 10, 9] + expected_exc = ValueError + with pytest.raises(expected_exc): + # cunumeric raises TypeError + num.digitize(a, None) + with pytest.raises(expected_exc): + np.digitize(a, None) + + def test_bins_non_monotonic(self): + a = [2, 3, 10, 9] + bins = [0, 5, 3] + expected_exc = ValueError + with pytest.raises(expected_exc): + num.digitize(a, bins) + with pytest.raises(expected_exc): + np.digitize(a, bins) + + +def generate_random(shape, dtype): + a_np = None + size = math.prod(shape) + if np.issubdtype(dtype, np.integer): + a_np = np.array( + np.random.randint( + np.iinfo(dtype).min, + np.iinfo(dtype).max, + size=size, + dtype=dtype, + ), + dtype=dtype, + ) + elif np.issubdtype(dtype, np.floating): + a_np = np.array(np.random.random(size=size), dtype=dtype) + elif np.issubdtype(dtype, np.complexfloating): + a_np = np.array( + np.random.random(size=size) + np.random.random(size=size) * 1j, + dtype=dtype, + ) + else: + assert False + return a_np.reshape(shape) + + +@pytest.mark.parametrize("right", (True, False)) +def test_empty(right): + bins = [0, 3, 5] + assert len(num.digitize([], bins, right=right)) == 0 + + +@pytest.mark.parametrize("shape", SHAPES, ids=str) +@pytest.mark.parametrize("dtype", DTYPES, ids=str) +@pytest.mark.parametrize("right", (True, False)) +def test_ndmin(shape, dtype, right): + a = generate_random(shape, dtype) + bins = [0, 3, 5] + + a_num = num.array(a) + bins_num = num.array(bins) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a, bins, right=right) + assert num.array_equal(res_np, res_num) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a_num, bins, right=right) + assert num.array_equal(res_np, res_num) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a_num, bins_num, right=right) + assert num.array_equal(res_np, res_num) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From eb803c3bdb6b2fa673192c36a48b9ec8d29faed6 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 6 Feb 2024 12:12:42 -0800 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Manolis Papadakis --- cunumeric/module.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 24a269a7b..5cd9459e4 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -8286,8 +8286,8 @@ def histogram( @add_boilerplate("x", "bins") def digitize( - x: Union[ndarray, npt.ArrayLike], - bins: Union[ndarray, npt.ArrayLike], + x: ndarray, + bins: ndarray, right: bool = False, ) -> Union[int, ndarray]: """ @@ -8308,8 +8308,7 @@ def digitize( Parameters ---------- x : array_like - Input array to be binned. Prior to NumPy 1.10.0, this array had to - be 1-dimensional, but can now have any shape. + Input array to be binned. Doesn't need to be 1-dimensional. bins : array_like Array of bins. It has to be 1-dimensional and monotonic. right : bool, optional @@ -8333,7 +8332,7 @@ def digitize( See Also -------- - bincount, histogram, unique, searchsorted + numpy.digitize Notes ----- @@ -8341,11 +8340,6 @@ def digitize( attempting to index `bins` with the indices that `digitize` returns will result in an IndexError. - `numpy.digitize` is implemented in terms of `numpy.searchsorted`. - This means that a binary search is used to bin the values, which scales - much better for larger number of bins than the previous linear search. - It also removes the requirement for the input array to be 1-dimensional. - For monotonically *increasing* `bins`, the following are equivalent:: np.digitize(x, bins, right=True) @@ -8381,15 +8375,12 @@ def digitize( -------- Multiple GPUs, Multiple CPUs """ - x = asarray(x) - bins = asarray(bins) - # here for compatibility, searchsorted below is happy to take this if np.issubdtype(x.dtype, np.complexfloating): raise TypeError("x may not be complex") - increasing = np.all(bins[1:] >= bins[:-1]) - decreasing = np.all(bins[1:] <= bins[:-1]) + increasing = (bins[1:] >= bins[:-1]).all() + decreasing = (bins[1:] <= bins[:-1]).all() if not increasing and not decreasing: raise ValueError("bins must be monotonically increasing or decreasing") @@ -8397,6 +8388,6 @@ def digitize( side: SortSide = "left" if right else "right" if decreasing: # reverse the bins, and invert the results - return len(bins) - searchsorted(bins[::-1], x, side=side) + return len(bins) - searchsorted(bins.flip(), x, side=side) else: return searchsorted(bins, x, side=side) From c8704fd9006010a64bdcad5cb82049b99723c6dd Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 6 Feb 2024 12:42:49 -0800 Subject: [PATCH 3/4] test for decreasing bins --- tests/integration/test_digitize.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_digitize.py b/tests/integration/test_digitize.py index 5d1808ae2..d8f91ffd5 100644 --- a/tests/integration/test_digitize.py +++ b/tests/integration/test_digitize.py @@ -108,7 +108,7 @@ def test_empty(right): @pytest.mark.parametrize("shape", SHAPES, ids=str) @pytest.mark.parametrize("dtype", DTYPES, ids=str) @pytest.mark.parametrize("right", (True, False)) -def test_ndmin(shape, dtype, right): +def test_increasing_bins(shape, dtype, right): a = generate_random(shape, dtype) bins = [0, 3, 5] @@ -128,6 +128,29 @@ def test_ndmin(shape, dtype, right): assert num.array_equal(res_np, res_num) +@pytest.mark.parametrize("shape", SHAPES, ids=str) +@pytest.mark.parametrize("dtype", DTYPES, ids=str) +@pytest.mark.parametrize("right", (True, False)) +def test_decreasing_bins(shape, dtype, right): + a = generate_random(shape, dtype) + bins = [5, 3, 0] + + a_num = num.array(a) + bins_num = num.array(bins) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a, bins, right=right) + assert num.array_equal(res_np, res_num) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a_num, bins, right=right) + assert num.array_equal(res_np, res_num) + + res_np = np.digitize(a, bins, right=right) + res_num = num.digitize(a_num, bins_num, right=right) + assert num.array_equal(res_np, res_num) + + if __name__ == "__main__": import sys From ee48f891d9e4b831c427c83d74e342b7fa065edc Mon Sep 17 00:00:00 2001 From: bryan Date: Mon, 12 Feb 2024 14:52:54 -0800 Subject: [PATCH 4/4] test for bins.ndims --- cunumeric/module.py | 3 +++ tests/integration/test_digitize.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/cunumeric/module.py b/cunumeric/module.py index 5cd9459e4..6a0a586ab 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -8379,6 +8379,9 @@ def digitize( if np.issubdtype(x.dtype, np.complexfloating): raise TypeError("x may not be complex") + if bins.ndim > 1: + raise ValueError("bins must be one-dimensional") + increasing = (bins[1:] >= bins[:-1]).all() decreasing = (bins[1:] <= bins[:-1]).all() if not increasing and not decreasing: diff --git a/tests/integration/test_digitize.py b/tests/integration/test_digitize.py index d8f91ffd5..9c75c3a60 100644 --- a/tests/integration/test_digitize.py +++ b/tests/integration/test_digitize.py @@ -73,6 +73,15 @@ def test_bins_non_monotonic(self): with pytest.raises(expected_exc): np.digitize(a, bins) + def test_bins_ndim(self): + a = [2, 3, 10, 9] + bins = np.array([[0], [5], [3]]) + expected_exc = ValueError + with pytest.raises(expected_exc): + num.digitize(a, bins) + with pytest.raises(expected_exc): + np.digitize(a, bins) + def generate_random(shape, dtype): a_np = None