diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8091f3f7dd2..2f6e864b51c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -608,8 +608,14 @@ def intersection(self, other, sort=False): (1, 'Blue')], ) """ + if not can_convert_to_column(other): + raise TypeError("Input must be Index or array-like") + if not isinstance(other, BaseIndex): - other = cudf.Index(other, name=self.name) + other = cudf.Index( + other, + name=getattr(other, "name", self.name), + ) if sort not in {None, False}: raise ValueError( @@ -617,10 +623,17 @@ def intersection(self, other, sort=False): f"None or False; {sort} was passed." ) - if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) - return self._get_reconciled_name_object(other) + if not len(self) or not len(other) or self.equals(other): + common_dtype = cudf.utils.dtypes._dtype_pandas_compatible( + cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) + ) + + lhs = self.unique() if self.has_duplicates else self + rhs = other + if not len(other): + lhs, rhs = rhs, lhs + + return lhs._get_reconciled_name_object(rhs).astype(common_dtype) res_name = _get_result_name(self.name, other.name) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 57c481db0d8..56ec9ce0359 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,7 +682,9 @@ def _union(self, other, sort=None): @_cudf_nvtx_annotate def _intersection(self, other, sort=False): if not isinstance(other, RangeIndex): - return super()._intersection(other, sort=sort) + return self._try_reconstruct_range_index( + super()._intersection(other, sort=sort) + ) if not len(self) or not len(other): return RangeIndex(0) @@ -723,7 +725,7 @@ def _intersection(self, other, sort=False): if sort is None: new_index = new_index.sort_values() - return new_index + return self._try_reconstruct_range_index(new_index) @_cudf_nvtx_annotate def difference(self, other, sort=None): diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 7d799fa1573..1071261044f 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -74,6 +74,7 @@ def _match_join_keys( common_type = ltype.categories.dtype else: common_type = rtype.categories.dtype + common_type = cudf.utils.dtypes._dtype_pandas_compatible(common_type) return lcol.astype(common_type), rcol.astype(common_type) if is_dtype_equal(ltype, rtype): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f7f6e1f9114..6fb615c22e0 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( CategoricalIndex, @@ -2104,25 +2105,48 @@ def test_union_index(idx1, idx2, sort): (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])), (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.Index(["a", "b", "c", "d", "c"]), + pd.Index(["a", "c", "z"], name="abc"), + ), ( pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "b", "c", "d", "c"]), ), (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), (pd.Index([True, False, True, True]), pd.Index([True, True])), + (pd.RangeIndex(0, 10, name="a"), pd.Index([5, 6, 7], name="b")), + (pd.Index(["a", "b", "c"], dtype="category"), pd.Index(["a", "b"])), + (pd.Index(["a", "b", "c"], dtype="category"), pd.Index([1, 2, 3])), + (pd.Index([0, 1, 2], dtype="category"), pd.RangeIndex(0, 10)), + (pd.Index(["a", "b", "c"], name="abc"), []), + (pd.Index([], name="abc"), pd.RangeIndex(0, 4)), + (pd.Index([1, 2, 3]), pd.Index([1, 2], dtype="category")), + (pd.Index([]), pd.Index([1, 2], dtype="category")), ], ) @pytest.mark.parametrize("sort", [None, False]) -def test_intersection_index(idx1, idx2, sort): +@pytest.mark.parametrize("pandas_compatible", [True, False]) +def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected = idx1.intersection(idx2, sort=sort) - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.intersection(idx2, sort=sort) - - assert_eq(expected, actual, exact=False) + with cudf.option_context("mode.pandas_compatible", pandas_compatible): + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.intersection(idx2, sort=sort) + + # TODO: Resolve the bool vs ints mixed issue + # once pandas has a direction on this issue + # https://github.com/pandas-dev/pandas/issues/44000 + assert_eq( + expected, + actual, + exact=False + if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) + or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + else True, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e50457b8e7b..1b94db75340 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -634,6 +634,16 @@ def find_common_type(dtypes): return cudf.dtype(common_dtype) +def _dtype_pandas_compatible(dtype): + """ + A utility function, that returns `str` instead of `object` + dtype when pandas comptibility mode is enabled. + """ + if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): + return "str" + return dtype + + def _can_cast(from_dtype, to_dtype): """ Utility function to determine if we can cast