From 02f814c43ee77bafcb930e2969051160401f824b Mon Sep 17 00:00:00 2001 From: Amanda Potts Date: Thu, 23 May 2024 15:39:34 -0400 Subject: [PATCH] Closes #3177 Index.sort_values --- PROTO_tests/tests/coargsort_test.py | 62 ++++++++ PROTO_tests/tests/index_test.py | 227 +++++++++++++++++++++++++++- PROTO_tests/tests/sort_test.py | 47 ++++++ arkouda/categorical.py | 23 ++- arkouda/index.py | 118 +++++++++++++-- arkouda/series.py | 22 ++- arkouda/sorting.py | 33 +++- tests/coargsort_test.py | 64 ++++++++ tests/index_test.py | 174 ++++++++++++++++++++- tests/sort_test.py | 48 ++++++ 10 files changed, 778 insertions(+), 40 deletions(-) diff --git a/PROTO_tests/tests/coargsort_test.py b/PROTO_tests/tests/coargsort_test.py index 2b420bb7e4..05edae3872 100644 --- a/PROTO_tests/tests/coargsort_test.py +++ b/PROTO_tests/tests/coargsort_test.py @@ -98,6 +98,68 @@ def test_coargsort_bool(self): assert args[0][perm].to_list() == [False, False, True, True, True] assert args[1][perm].to_list() == [2, 4, 1, 3, 5] + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_coargsort_wstrings(self, prob_size): + ak_char_array = ak.random_strings_uniform(minlen=1, maxlen=2, seed=1, size=prob_size) + ak_int_array = ak.randint(0, 10 * prob_size, prob_size, dtype=ak.int64, seed=1) + + perm = ak.coargsort([ak_char_array, ak_int_array]) + arrays = [ak_char_array[perm], ak_int_array[perm]] + + # code borrowed from arkouda.alignment.is_cosorted + # initialize the array to track boundary + boundary = arrays[0][:-1] != arrays[0][1:] + for array in arrays[1:]: + left = array[:-1] + right = array[1:] + _ = left <= right + if not (_ | boundary).all(): + raise ValueError + boundary = boundary | (left != right) + + # Now check ascending=False + perm = ak.coargsort([ak_char_array, ak_int_array], ascending=False) + arrays = [ak_char_array[perm], ak_int_array[perm]] + + # code borrowed from arkouda.alignment.is_cosorted + # initialize the array to track boundary + boundary = arrays[0][:-1] != arrays[0][1:] + for array in arrays[1:]: + left = array[:-1] + right = array[1:] + _ = left >= right + if not (_ | boundary).all(): + raise ValueError + boundary = boundary | (left != right) + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_coargsort_numeric(self, prob_size): + from arkouda.alignment import is_cosorted + + ak_int_array = ak.randint(0, 10 * prob_size, prob_size, dtype=ak.int64, seed=1) + ak_float_array = ak.randint(0, 10 * prob_size, prob_size, dtype=ak.float64, seed=2) + + perm = ak.coargsort([ak_int_array, ak_float_array]) + arrays = [ak_int_array[perm], ak_float_array[perm]] + + assert is_cosorted(arrays) + + perm = ak.coargsort([ak_float_array, ak_int_array]) + arrays = [ak_float_array[perm], ak_int_array[perm]] + + assert is_cosorted(arrays) + + # Now check when ascending=False + perm = ak.coargsort([ak_int_array, ak_float_array], ascending=False) + arrays = [-1 * ak_int_array[perm], -1 * ak_float_array[perm]] + + assert is_cosorted(arrays) + + perm = ak.coargsort([ak_float_array, ak_int_array], ascending=False) + arrays = [-1 * ak_float_array[perm], -1 * ak_int_array[perm]] + + assert is_cosorted(arrays) + @pytest.mark.parametrize("algo", SortingAlgorithm) def test_error_handling(self, algo): ones, short_ones = ak.ones(100), ak.ones(10) diff --git a/PROTO_tests/tests/index_test.py b/PROTO_tests/tests/index_test.py index 3207b0fb83..2625f46536 100644 --- a/PROTO_tests/tests/index_test.py +++ b/PROTO_tests/tests/index_test.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest from numpy import dtype as npdtype @@ -27,7 +28,7 @@ def test_index_creation_lists(self): i3 = ak.Index(["a", "b", "c"], allow_list=True) assert isinstance(i3.values, list) - assert i3.dtype == dtype(" str: return "integer" elif _is_dtype_in_union(self.dtype, float_scalars): return "floating" - elif self.dtype == " bool: else: return akall(self == other) + def _reindex(self, perm): + if isinstance(self, MultiIndex): + return MultiIndex(self[perm].levels, name=self.name, names=self.names) + elif isinstance(self.values, list): + if not isinstance(perm, list): + perm = perm.to_list() + return Index([self.values[i] for i in perm], name=self.name, allow_list=True) + else: + return Index(self.values[perm], name=self.name) + + @typechecked + def sort_values( + self, ascending: bool = True, return_indexer: bool = False, na_position="last" + ) -> Union[Index, Tuple]: + """ + Return a sorted copy of the index. + + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. + + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + Returns + ------- + sorted_index : arkouda.Index + Sorted copy of the index. + indexer : arkouda.pdarray or list, optional + The indices that the index itself was sorted by. + + Examples + -------- + >>> idx = ak.Index([10, 100, 1, 1000]) + >>> idx + Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + + """ + + if na_position not in ["first", "last"]: + raise ValueError('na_position must be "first" or "last".') + + if isinstance(self, MultiIndex): + perm = coargsort(self.levels, ascending=ascending) + elif isinstance(self.values, list): + from numpy import argsort as np_argsort + + if ascending is True: + perm = np_argsort(self.values).tolist() + else: + perm = np_argsort(self.values)[::-1].tolist() + else: + perm = argsort(self.values, ascending=ascending) + + from arkouda.util import is_float + + if isinstance(self.values, pdarray) and is_float(self.values): + from arkouda import concatenate + + from arkouda import isnan as ak_isnan + + is_nan = ak_isnan(self.values) + if na_position == "last": + perm = concatenate([perm[~is_nan], perm[is_nan]]) + elif na_position == "first": + perm = concatenate([perm[is_nan], perm[~is_nan]]) + + elif isinstance(self.values, list): + from numpy import isnan as np_isnan + + is_nan = np_isnan(self.values) + nan_vals = [i for (i, b) in zip(perm, is_nan) if b] + not_nan_vals = [i for (i, b) in zip(perm, is_nan) if not b] + + if na_position == "last": + perm = [*not_nan_vals, *nan_vals] # type: ignore + elif na_position == "first": + perm = [*nan_vals, *not_nan_vals] # type: ignore + + if return_indexer: + return self._reindex(perm), perm + else: + return self._reindex(perm) + def memory_usage(self, unit="B"): """ Return the memory usage of the Index values. @@ -1047,13 +1141,15 @@ class MultiIndex(Index): def __init__( self, - levels: Union[list, pdarray, Strings, Categorical], + levels: Union[list, tuple, pdarray, Strings, Categorical], name: Optional[str] = None, names: Optional[list[str]] = None, ): self.registered_name: Optional[str] = None if not (isinstance(levels, list) or isinstance(levels, tuple)): raise TypeError("MultiIndex should be an iterable") + elif isinstance(levels, tuple): + levels = list(levels) self.levels = levels first = True self._names = names diff --git a/arkouda/series.py b/arkouda/series.py index a3592cd562..83e685bdbb 100644 --- a/arkouda/series.py +++ b/arkouda/series.py @@ -12,12 +12,11 @@ from arkouda.accessor import CachedAccessor, DatetimeAccessor, StringAccessor from arkouda.alignment import lookup from arkouda.categorical import Categorical -from arkouda.dtypes import dtype, float64, int64 +from arkouda.dtypes import dtype, int64 from arkouda.groupbyclass import GroupBy, groupable_element_type from arkouda.index import Index, MultiIndex from arkouda.numeric import cast as akcast from arkouda.numeric import isnan, value_counts -from arkouda.segarray import SegArray from arkouda.pdarrayclass import ( RegistrationError, any, @@ -27,6 +26,7 @@ ) from arkouda.pdarraycreation import arange, array, full, zeros from arkouda.pdarraysetops import argsort, concatenate, in1d, indexof1d +from arkouda.segarray import SegArray from arkouda.strings import Strings from arkouda.util import convert_if_categorical, get_callback, is_float @@ -685,8 +685,8 @@ def sort_index(self, ascending: bool = True) -> Series: A new Series sorted. """ - idx = self.index.argsort(ascending=ascending) - return self._reindex(idx) + perm = self.index.argsort(ascending=ascending) + return self._reindex(perm) @typechecked def sort_values(self, ascending: bool = True) -> Series: @@ -701,21 +701,19 @@ def sort_values(self, ascending: bool = True) -> Series: ------- A new Series sorted smallest to largest """ + from arkouda.util import is_numeric if not ascending: - if isinstance(self.values, pdarray) and self.values.dtype in ( - int64, - float64, - ): + if isinstance(self.values, pdarray) and is_numeric(self.values): # For numeric values, negation reverses sort order - idx = argsort(-self.values) + perm = argsort(-self.values) else: # For non-numeric values, need the descending arange because reverse slicing # is not supported - idx = argsort(self.values)[arange(self.values.size - 1, -1, -1)] + perm = argsort(self.values)[arange(self.values.size - 1, -1, -1)] else: - idx = argsort(self.values) - return self._reindex(idx) + perm = argsort(self.values) + return self._reindex(perm) @typechecked def tail(self, n: int = 10) -> Series: diff --git a/arkouda/sorting.py b/arkouda/sorting.py index 3ff5d0493e..af18eaf05d 100644 --- a/arkouda/sorting.py +++ b/arkouda/sorting.py @@ -20,6 +20,7 @@ def argsort( pda: Union[pdarray, Strings, "Categorical"], # type: ignore # noqa + ascending: bool = True, algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, axis: int_scalars = 0, ) -> pdarray: @@ -30,7 +31,10 @@ def argsort( ---------- pda : pdarray or Strings or Categorical The array to sort (int64, uint64, or float64) - + ascending: bool = True + Ignored when the number of dimensions is > 1. + algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, + axis: int_scalars = 0 Returns ------- pdarray, int64 @@ -66,7 +70,7 @@ def argsort( check_type(argname="argsort", value=pda, expected_type=Union[pdarray, Strings, Categorical]) if hasattr(pda, "argsort"): - return cast(Categorical, pda).argsort() + return cast(Categorical, pda).argsort(ascending=ascending) if pda.size == 0 and hasattr(pda, "dtype"): return zeros(0, dtype=pda.dtype) if isinstance(pda, pdarray) and pda.dtype == bigint: @@ -80,12 +84,19 @@ def argsort( "axis": axis, }, ) - return create_pdarray(cast(str, repMsg)) + sorted_array = create_pdarray(cast(str, repMsg)) + if ascending is True or (hasattr(pda, "ndim") and pda.ndim != 1): + return sorted_array + else: + from arkouda import arange + + return sorted_array[arange(sorted_array.size - 1, -1, -1)] def coargsort( arrays: Sequence[Union[Strings, pdarray, "Categorical"]], # type: ignore # noqa algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD, + ascending=True, ) -> pdarray: """ Return the permutation that groups the rows (left-to-right), if the @@ -96,7 +107,9 @@ def coargsort( ---------- arrays : Sequence[Union[Strings, pdarray, Categorical]] The columns (int64, uint64, float64, Strings, or Categorical) to sort by row - + algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD + ascending: bool = True + Ignored when the number of dimensions is > 1. Returns ------- pdarray, int64 @@ -144,7 +157,11 @@ def coargsort( anames = [] atypes = [] expanded_arrays = [] + max_dim = 1 for a in arrays: + if hasattr(a, "ndim"): + from numpy import maximum + max_dim = maximum(a.ndim, max_dim) if not isinstance(a, pdarray) or a.dtype not in [bigint, bool]: expanded_arrays.append(a) elif a.dtype == bigint: @@ -182,7 +199,13 @@ def coargsort( "arr_types": atypes, }, ) - return create_pdarray(cast(str, repMsg)) + sorted_array = create_pdarray(cast(str, repMsg)) + if ascending is True or max_dim > 1: + return sorted_array + else: + from arkouda import arange + + return sorted_array[arange(sorted_array.size - 1, -1, -1)] @typechecked diff --git a/tests/coargsort_test.py b/tests/coargsort_test.py index 40d6a2d701..f9ad7972e8 100755 --- a/tests/coargsort_test.py +++ b/tests/coargsort_test.py @@ -204,6 +204,70 @@ def test_coargsort_bool(self): self.assertListEqual(args[0][perm].to_list(), [False, False, True, True, True]) self.assertListEqual(args[1][perm].to_list(), [2, 4, 1, 3, 5]) + def test_coargsort_wstrings(self): + size = 100 + + ak_char_array = ak.random_strings_uniform(minlen=1, maxlen=2, seed=1, size=size) + ak_int_array = ak.randint(0, 10 * size, size, dtype=ak.int64) + + perm = ak.coargsort([ak_char_array, ak_int_array]) + arrays = [ak_char_array[perm], ak_int_array[perm]] + + # code borrowed from arkouda.alignment.is_cosorted + # initialize the array to track boundary + boundary = arrays[0][:-1] != arrays[0][1:] + for array in arrays[1:]: + left = array[:-1] + right = array[1:] + _ = left <= right + if not (_ | boundary).all(): + raise ValueError + boundary = boundary | (left != right) + + # Now check ascending=False + perm = ak.coargsort([ak_char_array, ak_int_array], ascending=False) + arrays = [ak_char_array[perm], ak_int_array[perm]] + + # code borrowed from arkouda.alignment.is_cosorted + # initialize the array to track boundary + boundary = arrays[0][:-1] != arrays[0][1:] + for array in arrays[1:]: + left = array[:-1] + right = array[1:] + _ = left >= right + if not (_ | boundary).all(): + raise ValueError + boundary = boundary | (left != right) + + def test_coargsort_numeric(self): + from arkouda.alignment import is_cosorted + + size = 100 + + ak_int_array = ak.randint(0, 10 * size, size, dtype=ak.int64) + ak_float_array = ak.randint(0, 10 * size, size, dtype=ak.float64) + + perm = ak.coargsort([ak_int_array, ak_float_array]) + arrays = [ak_int_array[perm], ak_float_array[perm]] + + assert is_cosorted(arrays) + + perm = ak.coargsort([ak_float_array, ak_int_array]) + arrays = [ak_float_array[perm], ak_int_array[perm]] + + assert is_cosorted(arrays) + + # Now check when ascending=False + perm = ak.coargsort([ak_int_array, ak_float_array], ascending=False) + arrays = [-1 * ak_int_array[perm], -1 * ak_float_array[perm]] + + assert is_cosorted(arrays) + + perm = ak.coargsort([ak_float_array, ak_int_array], ascending=False) + arrays = [-1 * ak_float_array[perm], -1 * ak_int_array[perm]] + + assert is_cosorted(arrays) + def create_parser(): parser = argparse.ArgumentParser(description="Check coargsort correctness.") diff --git a/tests/index_test.py b/tests/index_test.py index 75fa978e78..d7d9189e03 100644 --- a/tests/index_test.py +++ b/tests/index_test.py @@ -2,7 +2,7 @@ import os import tempfile - +import numpy as np import pandas as pd from base_test import ArkoudaTest from context import arkouda as ak @@ -12,7 +12,6 @@ from arkouda.dtypes import dtype from arkouda.index import Index from arkouda.pdarrayclass import pdarray -from arkouda.index import Index class IndexTest(ArkoudaTest): @@ -39,7 +38,7 @@ def test_index_creation_lists(self): i3 = ak.Index(["a", "b", "c"], allow_list=True) self.assertIsInstance(i3.values, list) - self.assertEqual(i3.dtype, dtype("