diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 34898e640697..49ef91de9be4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -114,7 +114,7 @@ jobs: const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown'; const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown'; - let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`; + let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`; // Calculate percentage increase if both sizes are available if (previousSize !== 'Unknown' && currentSize !== '') { diff --git a/crates/polars-core/src/chunked_array/comparison/categorical.rs b/crates/polars-core/src/chunked_array/comparison/categorical.rs index bbcd6b6047c9..09573c5fbd32 100644 --- a/crates/polars-core/src/chunked_array/comparison/categorical.rs +++ b/crates/polars-core/src/chunked_array/comparison/categorical.rs @@ -374,13 +374,29 @@ where // Apply comparison on categories map and then do a lookup let bitmap = str_single_compare_function(lhs.get_rev_map().get_categories(), rhs); - Ok( - BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(|opt_idx| { - // SAFETY: indexing into bitmap with same length as original array - opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) }) - })) - .with_name(lhs.name().clone()), - ) + let mask = match lhs.get_rev_map().as_ref() { + RevMapping::Local(_, _) => { + BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map( + |opt_idx| { + // SAFETY: indexing into bitmap with same length as original array + opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) }) + }, + )) + }, + RevMapping::Global(idx_map, _, _) => { + BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map( + |opt_idx| { + // SAFETY: indexing into bitmap with same length as original array + opt_idx.map(|idx| unsafe { + let idx = *idx_map.get(&idx).unwrap(); + bitmap.get_bit_unchecked(idx as usize) + }) + }, + )) + }, + }; + + Ok(mask.with_name(lhs.name().clone())) } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index c46291e4382a..17752f828d8d 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,5 +1,3 @@ -use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel}; - use super::*; impl CategoricalChunked { @@ -41,18 +39,7 @@ impl CategoricalChunked { Ok(out) } } else { - let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); - for chunk in self.physical().downcast_iter() { - state.key_state().append(chunk); - } - let (_, unique, _) = state.finalize_unique().take(); - let ca = unsafe { - UInt32Chunked::from_chunks_and_dtype_unchecked( - self.physical().name().clone(), - vec![unique.to_boxed()], - DataType::UInt32, - ) - }; + let ca = self.physical().unique()?; // SAFETY: // we only removed some indexes so we are still in bounds unsafe { @@ -70,12 +57,7 @@ impl CategoricalChunked { if self._can_fast_unique() { Ok(self.get_rev_map().len()) } else { - let cat_map = self.get_rev_map(); - let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); - for chunk in self.physical().downcast_iter() { - state.key_state().append(chunk); - } - Ok(state.finalize_n_unique()) + self.physical().n_unique() } } diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 66aa7b2ba898..d5416bc47a93 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -904,7 +904,7 @@ def test_perfect_group_by_19950() -> None: def test_categorical_unique() -> None: s = pl.Series(["a", "b", None], dtype=pl.Categorical) assert s.n_unique() == 3 - assert s.unique().to_list() == ["a", "b", None] + assert s.unique().sort().to_list() == [None, "a", "b"] @StringCache() @@ -925,3 +925,22 @@ def test_categorical_unique_20539() -> None: "unique": [["a", "b"], ["b", "c"], ["c"]], "unique_with_order": [["a", "b"], ["b", "c"], ["c"]], } + + +@StringCache() +@pytest.mark.may_fail_auto_streaming +def test_categorical_prefill() -> None: + # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443 + # prefill cache + pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical) # pre-fill cache + + # test_compare_categorical_single + assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False] + + # test_unique_categorical + a = pl.Series(["a"], dtype=pl.Categorical) + assert a.unique().to_list() == ["a"] + + s = pl.Series(["1", "2", "3"], dtype=pl.Categorical) + s = s.filter([True, False, True]) + assert s.n_unique() == 2