Skip to content

Commit

Permalink
add np.digitize
Browse files Browse the repository at this point in the history
  • Loading branch information
bryevdv committed Jan 26, 2024
1 parent b4e164b commit 9eaca9b
Show file tree
Hide file tree
Showing 2 changed files with 252 additions and 0 deletions.
118 changes: 118 additions & 0 deletions cunumeric/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -8282,3 +8282,121 @@ def histogram(
hist /= bins_array[1:] - bins_array[:-1]

return hist.astype(result_type), bins_array.astype(bins_orig_type)


@add_boilerplate("x", "bins")
def digitize(
x: Union[ndarray, npt.ArrayLike],
bins: Union[ndarray, npt.ArrayLike],
right: bool = False,
) -> Union[int, ndarray]:
"""
Return the indices of the bins to which each value in input array belongs.
========= ============= ============================
`right` order of bins returned index `i` satisfies
========= ============= ============================
``False`` increasing ``bins[i-1] <= x < bins[i]``
``True`` increasing ``bins[i-1] < x <= bins[i]``
``False`` decreasing ``bins[i-1] > x >= bins[i]``
``True`` decreasing ``bins[i-1] >= x > bins[i]``
========= ============= ============================
If values in `x` are beyond the bounds of `bins`, 0 or ``len(bins)`` is
returned as appropriate.
Parameters
----------
x : array_like
Input array to be binned. Prior to NumPy 1.10.0, this array had to
be 1-dimensional, but can now have any shape.
bins : array_like
Array of bins. It has to be 1-dimensional and monotonic.
right : bool, optional
Indicating whether the intervals include the right or the left bin
edge. Default behavior is (right==False) indicating that the interval
does not include the right edge. The left bin end is open in this
case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
monotonically increasing bins.
Returns
-------
indices : ndarray of ints
Output array of indices, of same shape as `x`.
Raises
------
ValueError
If `bins` is not monotonic.
TypeError
If the type of the input is complex.
See Also
--------
bincount, histogram, unique, searchsorted
Notes
-----
If values in `x` are such that they fall outside the bin range,
attempting to index `bins` with the indices that `digitize` returns
will result in an IndexError.
`numpy.digitize` is implemented in terms of `numpy.searchsorted`.
This means that a binary search is used to bin the values, which scales
much better for larger number of bins than the previous linear search.
It also removes the requirement for the input array to be 1-dimensional.
For monotonically *increasing* `bins`, the following are equivalent::
np.digitize(x, bins, right=True)
np.searchsorted(bins, x, side='left')
Note that as the order of the arguments are reversed, the side must be too.
The `searchsorted` call is marginally faster, as it does not do any
monotonicity checks. Perhaps more importantly, it supports all dtypes.
Examples
--------
>>> x = np.array([0.2, 6.4, 3.0, 1.6])
>>> bins = np.array([0.0, 1.0, 2.5, 4.0, 10.0])
>>> inds = np.digitize(x, bins)
>>> inds
array([1, 4, 3, 2])
>>> for n in range(x.size):
... print(bins[inds[n]-1], "<=", x[n], "<", bins[inds[n]])
...
0.0 <= 0.2 < 1.0
4.0 <= 6.4 < 10.0
2.5 <= 3.0 < 4.0
1.0 <= 1.6 < 2.5
>>> x = np.array([1.2, 10.0, 12.4, 15.5, 20.])
>>> bins = np.array([0, 5, 10, 15, 20])
>>> np.digitize(x,bins,right=True)
array([1, 2, 3, 4, 4])
>>> np.digitize(x,bins,right=False)
array([1, 3, 3, 4, 5])
Availability
--------
Multiple GPUs, Multiple CPUs
"""
x = asarray(x)
bins = asarray(bins)

# here for compatibility, searchsorted below is happy to take this
if np.issubdtype(x.dtype, np.complexfloating):
raise TypeError("x may not be complex")

increasing = np.all(bins[1:] >= bins[:-1])
decreasing = np.all(bins[1:] <= bins[:-1])
if not increasing and not decreasing:
raise ValueError("bins must be monotonically increasing or decreasing")

# this is backwards because the arguments below are swapped
side: SortSide = "left" if right else "right"
if decreasing:
# reverse the bins, and invert the results
return len(bins) - searchsorted(bins[::-1], x, side=side)
else:
return searchsorted(bins, x, side=side)
134 changes: 134 additions & 0 deletions tests/integration/test_digitize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Copyright 2022 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import math

import numpy as np
import pytest

import cunumeric as num

DTYPES = (
np.uint32,
np.uint64,
np.float32,
np.float64,
)

SHAPES = (
(10,),
(2, 5),
(3, 7, 10),
)


class TestDigitizeErrors(object):
def test_complex_array(self):
a = np.array([2, 3, 10, 9], dtype=np.complex64)
bins = [0, 3, 5]
expected_exc = TypeError
with pytest.raises(expected_exc):
num.digitize(a, bins)
with pytest.raises(expected_exc):
np.digitize(a, bins)

@pytest.mark.xfail
def test_bad_array(self):
bins = [0, 5, 3]
expected_exc = ValueError
with pytest.raises(expected_exc):
# cunumeric raises TypeError
num.digitize(None, bins)
with pytest.raises(expected_exc):
np.digitize(None, bins)

@pytest.mark.xfail
def test_bad_bins(self):
a = [2, 3, 10, 9]
expected_exc = ValueError
with pytest.raises(expected_exc):
# cunumeric raises TypeError
num.digitize(a, None)
with pytest.raises(expected_exc):
np.digitize(a, None)

def test_bins_non_monotonic(self):
a = [2, 3, 10, 9]
bins = [0, 5, 3]
expected_exc = ValueError
with pytest.raises(expected_exc):
num.digitize(a, bins)
with pytest.raises(expected_exc):
np.digitize(a, bins)


def generate_random(shape, dtype):
a_np = None
size = math.prod(shape)
if np.issubdtype(dtype, np.integer):
a_np = np.array(
np.random.randint(
np.iinfo(dtype).min,
np.iinfo(dtype).max,
size=size,
dtype=dtype,
),
dtype=dtype,
)
elif np.issubdtype(dtype, np.floating):
a_np = np.array(np.random.random(size=size), dtype=dtype)
elif np.issubdtype(dtype, np.complexfloating):
a_np = np.array(
np.random.random(size=size) + np.random.random(size=size) * 1j,
dtype=dtype,
)
else:
assert False
return a_np.reshape(shape)


@pytest.mark.parametrize("right", (True, False))
def test_empty(right):
bins = [0, 3, 5]
assert len(num.digitize([], bins, right=right)) == 0


@pytest.mark.parametrize("shape", SHAPES, ids=str)
@pytest.mark.parametrize("dtype", DTYPES, ids=str)
@pytest.mark.parametrize("right", (True, False))
def test_ndmin(shape, dtype, right):
a = generate_random(shape, dtype)
bins = [0, 3, 5]

a_num = num.array(a)
bins_num = num.array(bins)

res_np = np.digitize(a, bins, right=right)
res_num = num.digitize(a, bins, right=right)
assert num.array_equal(res_np, res_num)

res_np = np.digitize(a, bins, right=right)
res_num = num.digitize(a_num, bins, right=right)
assert num.array_equal(res_np, res_num)

res_np = np.digitize(a, bins, right=right)
res_num = num.digitize(a_num, bins_num, right=right)
assert num.array_equal(res_np, res_num)


if __name__ == "__main__":
import sys

sys.exit(pytest.main(sys.argv))

0 comments on commit 9eaca9b

Please sign in to comment.