From 5c08a86b5cee5bafaf50bcb513693b2a023a0643 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 24 Jun 2024 14:25:08 +0100 Subject: [PATCH 1/3] Run ETE benchmarks with MiMalloc and leave a note encouraging its usage (#399) fixes #395 --- Cargo.lock | 20 ++++++++++++++++++++ Cargo.toml | 1 + README.md | 11 +++++++++++ bench-vortex/Cargo.toml | 7 ++++--- bench-vortex/benches/random_access.rs | 7 +++++-- 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 881441f94..40315168f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -433,6 +433,7 @@ dependencies = [ "itertools 0.13.0", "lazy_static", "log", + "mimalloc", "parquet", "reqwest", "serde", @@ -1961,6 +1962,16 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libmimalloc-sys" +version = "0.1.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7bb23d733dfcc8af652a78b7bf232f0e967710d044732185e561e47c0336b6" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -2043,6 +2054,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mimalloc" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9186d86b79b52f4a77af65604b51225e8db1d6ee7e3f41aec1e40829c71a176" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" diff --git a/Cargo.toml b/Cargo.toml index ac85667a7..67e19e05e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,7 @@ uuid = "1.8.0" walkdir = "2.5.0" worker = "0.3.0" zigzag = "0.1.0" +mimalloc = "0.1.42" [workspace.lints.rust] warnings = "deny" diff --git a/README.md b/README.md index 6157c4ac8..338719eb1 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,16 @@ Vortex to model more complex arrays while still exposing a logical interface. Fo `ChunkedArray` where the first chunk is run-length encoded and the second chunk is dictionary encoded. In Arrow, `RunLengthArray` and `DictionaryArray` are separate incompatible types, and so cannot be combined in this way. +### Usage + +For best performance we recommend using [MiMalloc](https://github.com/microsoft/mimalloc) as the application's +allocator. + +```rust +#[global_allocator] +static GLOBAL_ALLOC: MiMalloc = MiMalloc; +``` + ## Contributing Please see [CONTRIBUTING.md](CONTRIBUTING.md). @@ -159,6 +169,7 @@ Please see [CONTRIBUTING.md](CONTRIBUTING.md). 
In order to build vortex, you may also need to install the flatbuffer compiler (flatc): ### Mac + ```bash brew install flatbuffers ``` diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index fa2f178ca..cb222976d 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -17,16 +17,20 @@ workspace = true [dependencies] arrow-array = { workspace = true } arrow-select = { workspace = true } +bytes = { workspace = true } bzip2 = { workspace = true } csv = { workspace = true } enum-iterator = { workspace = true } +flexbuffers = { workspace = true } futures = { workspace = true } humansize = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } log = { workspace = true } +mimalloc = { workspace = true } parquet = { workspace = true, features = [] } reqwest = { workspace = true } +serde = { workspace = true } simplelog = { workspace = true } tokio = { workspace = true, features = ["full"] } uuid = { workspace = true, features = ["v4"] } @@ -41,9 +45,6 @@ vortex-fastlanes = { path = "../encodings/fastlanes" } vortex-ipc = { path = "../vortex-ipc" } vortex-ree = { path = "../encodings/runend" } vortex-roaring = { path = "../encodings/roaring" } -serde = { workspace = true } -bytes = { workspace = true } -flexbuffers = { workspace = true } [dev-dependencies] criterion = { workspace = true, features = ["html_reports", "async_tokio"] } diff --git a/bench-vortex/benches/random_access.rs b/bench-vortex/benches/random_access.rs index 4c5e5b4d8..ef6281f80 100644 --- a/bench-vortex/benches/random_access.rs +++ b/bench-vortex/benches/random_access.rs @@ -1,11 +1,14 @@ use bench_vortex::reader::{take_parquet, take_vortex}; use bench_vortex::taxi_data::{taxi_data_parquet, taxi_data_vortex}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use mimalloc::MiMalloc; use tokio::runtime::Runtime; +#[global_allocator] +static GLOBAL: MiMalloc = MiMalloc; + fn random_access(c: &mut Criterion) { let mut group = c.benchmark_group("random access"); - group.sample_size(10); let indices = [10, 11, 12, 13, 100_000, 3_000_000]; @@ -16,7 +19,7 @@ fn random_access(c: &mut Criterion) { }); let taxi_parquet = taxi_data_parquet(); - group.bench_function("arrow", |b| { + group.sample_size(10).bench_function("arrow", |b| { b.iter(|| black_box(take_parquet(&taxi_parquet, &indices).unwrap())) }); } From a430fa61c88e869d291ea2f95ef8713b9cd55370 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 13:28:34 +0000 Subject: [PATCH 2/3] Update Rust crate mimalloc to v0.1.43 (#405) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [![Mend Renovate](https://app.renovatebot.com/images/banner.svg)](https://renovatebot.com) This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [mimalloc](https://togithub.com/purpleprotocol/mimalloc_rust) | workspace.dependencies | patch | `0.1.42` -> `0.1.43` | --- ### Release Notes
**purpleprotocol/mimalloc_rust (mimalloc)** ### [`v0.1.43`](https://togithub.com/purpleprotocol/mimalloc_rust/releases/tag/v0.1.43): Version 0.1.43 [Compare Source](https://togithub.com/purpleprotocol/mimalloc_rust/compare/v0.1.42...v0.1.43) ##### Changes - Mimalloc `v2.1.7` - Fix static builds (credits [@BlackDex](https://togithub.com/BlackDex))
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 40315168f..c23c47024 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1964,9 +1964,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libmimalloc-sys" -version = "0.1.38" +version = "0.1.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7bb23d733dfcc8af652a78b7bf232f0e967710d044732185e561e47c0336b6" +checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" dependencies = [ "cc", "libc", ] [[package]] name = "mimalloc" -version = "0.1.42" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9186d86b79b52f4a77af65604b51225e8db1d6ee7e3f41aec1e40829c71a176" +checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" dependencies = [ "libmimalloc-sys", ] From c01871798f4ce3c0e6238f647eef13f244c9cef1 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 24 Jun 2024 15:22:35 +0100 Subject: [PATCH 3/3] Rename flatten -> canonicalize + bugfix + a secret third thing (#402) Have a couple of threads in progress, figured I'd break this out into its own PR to make reviews easier 1. Rename `*Flatten*` -> `Canonical` / `canonicalize` 2. Fix a bug with `struct_array_to_arrow` where it wasn't doing the conversion deeply (test added) 3. Bit of a driveby, but implemented the `ArrayAccessor` for `BoolArray` (test added) 4.
Add an extension to DataFusion `SessionContext` to load a Vortex array into a dataframe --- Cargo.lock | 68 +-- bench-vortex/src/lib.rs | 6 +- bench-vortex/src/reader.rs | 4 +- bench-vortex/src/vortex_utils.rs | 2 +- encodings/alp/src/array.rs | 8 +- encodings/alp/src/compress.rs | 15 +- encodings/alp/src/compute.rs | 2 +- encodings/datetime-parts/src/array.rs | 8 +- encodings/datetime-parts/src/compress.rs | 8 +- encodings/datetime-parts/src/compute.rs | 12 +- encodings/dict/src/compress.rs | 2 +- encodings/dict/src/compute.rs | 46 +- encodings/dict/src/dict.rs | 17 +- encodings/dict/src/lib.rs | 4 + .../fastlanes/src/bitpacking/compress.rs | 10 +- .../fastlanes/src/bitpacking/compute/mod.rs | 17 +- encodings/fastlanes/src/bitpacking/mod.rs | 16 +- encodings/fastlanes/src/delta/compress.rs | 10 +- encodings/fastlanes/src/delta/mod.rs | 8 +- encodings/fastlanes/src/for/compress.rs | 14 +- encodings/fastlanes/src/for/compute.rs | 4 +- encodings/fastlanes/src/for/mod.rs | 8 +- encodings/roaring/src/boolean/compress.rs | 4 +- encodings/roaring/src/boolean/compute.rs | 2 +- encodings/roaring/src/boolean/mod.rs | 10 +- encodings/roaring/src/integer/compress.rs | 3 +- encodings/roaring/src/integer/compute.rs | 2 +- encodings/roaring/src/integer/mod.rs | 16 +- encodings/runend/src/compress.rs | 4 +- encodings/runend/src/compute.rs | 15 +- encodings/runend/src/ree.rs | 30 +- encodings/zigzag/src/compute.rs | 2 +- encodings/zigzag/src/zigzag.rs | 6 +- pyvortex/src/array.rs | 2 +- pyvortex/src/vortex_arrow.rs | 6 +- vortex-array/benches/scalar_subtract.rs | 8 +- vortex-array/src/accessor.rs | 8 +- vortex-array/src/array/bool/accessors.rs | 59 +++ .../src/array/bool/compute/compare.rs | 17 +- vortex-array/src/array/bool/compute/fill.rs | 6 +- vortex-array/src/array/bool/compute/mod.rs | 4 +- .../src/array/bool/compute/scalar_at.rs | 2 +- vortex-array/src/array/bool/compute/take.rs | 4 +- vortex-array/src/array/bool/mod.rs | 11 +- vortex-array/src/array/bool/stats.rs | 4 +- .../chunked/{flatten.rs => canonical.rs} | 47 +- vortex-array/src/array/chunked/compute/mod.rs | 4 +- .../src/array/chunked/compute/take.rs | 11 +- vortex-array/src/array/chunked/mod.rs | 27 +- vortex-array/src/array/constant/compute.rs | 2 +- vortex-array/src/array/constant/flatten.rs | 10 +- vortex-array/src/array/extension/compute.rs | 4 +- vortex-array/src/array/extension/mod.rs | 8 +- vortex-array/src/array/mod.rs | 2 +- vortex-array/src/array/null/compute.rs | 8 +- vortex-array/src/array/null/mod.rs | 8 +- .../src/array/primitive/compute/cast.rs | 14 +- .../src/array/primitive/compute/compare.rs | 17 +- .../src/array/primitive/compute/fill.rs | 14 +- .../array/primitive/compute/filter_indices.rs | 37 +- .../src/array/primitive/compute/mod.rs | 8 +- .../src/array/primitive/compute/scalar_at.rs | 2 +- .../primitive/compute/subtract_scalar.rs | 22 +- .../src/array/primitive/compute/take.rs | 4 +- vortex-array/src/array/primitive/mod.rs | 12 +- vortex-array/src/array/primitive/stats.rs | 4 +- vortex-array/src/array/sparse/compress.rs | 3 +- vortex-array/src/array/sparse/compute/mod.rs | 26 +- vortex-array/src/array/sparse/flatten.rs | 40 +- vortex-array/src/array/sparse/mod.rs | 32 +- .../src/array/{struct => struct_}/compress.rs | 0 .../src/array/{struct => struct_}/compute.rs | 4 +- .../src/array/{struct => struct_}/mod.rs | 24 +- vortex-array/src/array/varbin/accessor.rs | 5 +- vortex-array/src/array/varbin/builder.rs | 2 +- vortex-array/src/array/varbin/compute/mod.rs | 2 +- vortex-array/src/array/varbin/compute/take.rs | 8 
+- vortex-array/src/array/varbin/flatten.rs | 8 +- vortex-array/src/array/varbin/mod.rs | 8 +- vortex-array/src/array/varbinview/accessor.rs | 7 +- vortex-array/src/array/varbinview/compute.rs | 2 +- vortex-array/src/array/varbinview/mod.rs | 30 +- vortex-array/src/arrow/array.rs | 2 +- vortex-array/src/arrow/recordbatch.rs | 2 +- vortex-array/src/canonical.rs | 469 ++++++++++++++++++ vortex-array/src/compress.rs | 8 +- vortex-array/src/compute/compare.rs | 15 +- vortex-array/src/compute/filter_indices.rs | 4 +- vortex-array/src/compute/mod.rs | 21 +- vortex-array/src/compute/patch.rs | 28 -- vortex-array/src/compute/search_sorted.rs | 2 +- vortex-array/src/compute/take.rs | 4 +- vortex-array/src/compute/{ => unary}/cast.rs | 5 +- .../{fill.rs => unary/fill_forward.rs} | 0 vortex-array/src/compute/unary/mod.rs | 4 + .../src/compute/{ => unary}/scalar_at.rs | 0 .../compute/{ => unary}/scalar_subtract.rs | 4 +- vortex-array/src/context.rs | 2 +- vortex-array/src/encoding.rs | 8 +- vortex-array/src/flatten.rs | 264 ---------- vortex-array/src/implementation.rs | 5 +- vortex-array/src/lib.rs | 8 +- vortex-array/src/stream/take_rows.rs | 8 +- vortex-array/src/validity.rs | 25 +- vortex-datafusion/src/lib.rs | 89 ++-- vortex-dtype/src/dtype.rs | 4 + vortex-ipc/src/chunked_reader/take_rows.rs | 35 +- vortex-ipc/src/lib.rs | 4 +- 108 files changed, 1226 insertions(+), 774 deletions(-) create mode 100644 vortex-array/src/array/bool/accessors.rs rename vortex-array/src/array/chunked/{flatten.rs => canonical.rs} (80%) rename vortex-array/src/array/{struct => struct_}/compress.rs (100%) rename vortex-array/src/array/{struct => struct_}/compute.rs (94%) rename vortex-array/src/array/{struct => struct_}/mod.rs (88%) create mode 100644 vortex-array/src/canonical.rs delete mode 100644 vortex-array/src/compute/patch.rs rename vortex-array/src/compute/{ => unary}/cast.rs (73%) rename vortex-array/src/compute/{fill.rs => unary/fill_forward.rs} (100%) create mode 100644 vortex-array/src/compute/unary/mod.rs rename vortex-array/src/compute/{ => unary}/scalar_at.rs (100%) rename vortex-array/src/compute/{ => unary}/scalar_subtract.rs (91%) delete mode 100644 vortex-array/src/flatten.rs diff --git a/Cargo.lock b/Cargo.lock index c23c47024..09e1105b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -360,7 +360,7 @@ checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -386,7 +386,7 @@ checksum = "62f7df18977a1ee03650ee4b31b4aefed6d56bac188760b6e37610400fe8d4bb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -605,9 +605,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.99" +version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" +checksum = "c891175c3fb232128f48de6590095e59198bbeb8620c310be349bfc3afd12c7b" dependencies = [ "jobserver", "libc", @@ -1263,7 +1263,7 @@ checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -1304,7 +1304,7 @@ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -1466,7 +1466,7 @@ checksum = 
"87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2118,7 +2118,7 @@ checksum = "176a5f5e69613d9e88337cf2a65e11135332b4efbcc628404a7c555e4452084c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2286,7 +2286,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2363,7 +2363,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2544,7 +2544,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2618,7 +2618,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2642,9 +2642,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -2676,7 +2676,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.66", + "syn 2.0.68", "tempfile", ] @@ -2690,7 +2690,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2760,7 +2760,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -2773,7 +2773,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -3139,7 +3139,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -3289,7 +3289,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -3300,9 +3300,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strum" -version = "0.26.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ "strum_macros", ] @@ -3317,14 +3317,14 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "0d0208408ba0c3df17ed26eb06992cb1a1268d41b2c0e12e65203fbe3972cee5" [[package]] name = "syn" @@ -3339,9 +3339,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.66" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ 
"proc-macro2", "quote", @@ -3429,7 +3429,7 @@ checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -3537,7 +3537,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -3648,7 +3648,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] @@ -4074,7 +4074,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", "wasm-bindgen-shared", ] @@ -4108,7 +4108,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4373,7 +4373,7 @@ dependencies = [ "async-trait", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", "wasm-bindgen", "wasm-bindgen-futures", "wasm-bindgen-macro-support", @@ -4418,7 +4418,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.68", ] [[package]] diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index b6781ad33..0a7fc6407 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -216,7 +216,7 @@ mod test { use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use vortex::arrow::FromArrowArray; use vortex::compress::Compressor; - use vortex::{ArrayData, IntoArray}; + use vortex::{ArrayData, IntoArray, IntoCanonical}; use crate::taxi_data::taxi_data_parquet; use crate::{compress_taxi_data, setup_logger, CTX}; @@ -239,7 +239,7 @@ mod test { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); let vortex_array = ArrayData::from_arrow(arrow_array.clone(), false).into_array(); - let vortex_as_arrow = vortex_array.flatten().unwrap().into_arrow(); + let vortex_as_arrow = vortex_array.into_canonical().unwrap().into_arrow(); assert_eq!(vortex_as_arrow.deref(), arrow_array.deref()); } } @@ -259,7 +259,7 @@ mod test { let vortex_array = ArrayData::from_arrow(arrow_array.clone(), false).into_array(); let compressed = Compressor::new(&CTX).compress(&vortex_array, None).unwrap(); - let compressed_as_arrow = compressed.flatten().unwrap().into_arrow(); + let compressed_as_arrow = compressed.into_canonical().unwrap().into_arrow(); assert_eq!(compressed_as_arrow.deref(), arrow_array.deref()); } } diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs index c4cdb1791..67f7f1270 100644 --- a/bench-vortex/src/reader.rs +++ b/bench-vortex/src/reader.rs @@ -24,7 +24,7 @@ use vortex::array::primitive::PrimitiveArray; use vortex::arrow::FromArrowType; use vortex::compress::Compressor; use vortex::stream::ArrayStreamExt; -use vortex::{Array, IntoArray, ToArrayData, ViewContext}; +use vortex::{Array, IntoArray, IntoCanonical, ToArrayData, ViewContext}; use vortex_buffer::Buffer; use vortex_dtype::DType; use vortex_error::{vortex_err, VortexResult}; @@ -166,7 +166,7 @@ pub async fn take_vortex(path: &Path, indices: &[u64]) -> VortexResult { let indices_array = indices.to_vec().into_array(); let taken = reader.take_rows(&indices_array).await?; // For equivalence.... we flatten to make sure we're not cheating too much. 
- Ok(taken.flatten()?.into_array()) + Ok(taken.into_canonical()?.into_array()) } pub fn take_parquet(path: &Path, indices: &[u64]) -> VortexResult { diff --git a/bench-vortex/src/vortex_utils.rs b/bench-vortex/src/vortex_utils.rs index b24f14a6f..88c24b3ad 100644 --- a/bench-vortex/src/vortex_utils.rs +++ b/bench-vortex/src/vortex_utils.rs @@ -3,7 +3,7 @@ use std::os::unix::prelude::MetadataExt; use std::path::Path; use vortex::array::chunked::ChunkedArray; -use vortex::array::r#struct::StructArray; +use vortex::array::struct_::StructArray; use vortex::ArrayDType; use vortex_dtype::DType; use vortex_error::VortexResult; diff --git a/encodings/alp/src/array.rs b/encodings/alp/src/array.rs index 8a10c1c9b..e00376059 100644 --- a/encodings/alp/src/array.rs +++ b/encodings/alp/src/array.rs @@ -3,7 +3,7 @@ use vortex::array::primitive::PrimitiveArray; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_dtype::PType; use vortex_error::vortex_bail; @@ -88,9 +88,9 @@ impl ArrayValidity for ALPArray { } } -impl ArrayFlatten for ALPArray { - fn flatten(self) -> VortexResult { - decompress(self).map(Flattened::Primitive) +impl IntoCanonical for ALPArray { + fn into_canonical(self) -> VortexResult { + decompress(self).map(Canonical::Primitive) } } diff --git a/encodings/alp/src/compress.rs b/encodings/alp/src/compress.rs index 2eed560c8..fc04c3ae0 100644 --- a/encodings/alp/src/compress.rs +++ b/encodings/alp/src/compress.rs @@ -3,7 +3,7 @@ use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::{Sparse, SparseArray}; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; use vortex::validity::Validity; -use vortex::{Array, ArrayDType, ArrayDef, AsArray, IntoArray}; +use vortex::{Array, ArrayDType, ArrayDef, AsArray, IntoArray, IntoArrayVariant}; use vortex_dtype::{NativePType, PType}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_scalar::Scalar; @@ -94,12 +94,13 @@ where exponents, PrimitiveArray::from_vec(encoded, values.validity()).into_array(), (!exc.is_empty()).then(|| { - SparseArray::new( + SparseArray::try_new( PrimitiveArray::from(exc_pos).into_array(), PrimitiveArray::from_vec(exc, Validity::AllValid).into_array(), len, Scalar::null(values.dtype().as_nullable()), ) + .unwrap() .into_array() }), ) @@ -115,7 +116,7 @@ pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { } pub fn decompress(array: ALPArray) -> VortexResult { - let encoded = array.encoded().flatten_primitive()?; + let encoded = array.encoded().into_primitive()?; let decoded = match_each_alp_float_ptype!(array.dtype().try_into().unwrap(), |$T| { PrimitiveArray::from_vec( @@ -138,7 +139,7 @@ fn patch_decoded(array: PrimitiveArray, patches: &Array) -> VortexResult())? + typed_patches.values().into_primitive()?.maybe_null_slice::<$T>())? 
}) } _ => panic!("can't patch ALP array with {}", patches), @@ -165,7 +166,7 @@ mod tests { let encoded = alp_encode(&array).unwrap(); assert!(encoded.patches().is_none()); assert_eq!( - encoded.encoded().into_primitive().maybe_null_slice::(), + encoded.encoded().as_primitive().maybe_null_slice::(), vec![1234; 1025] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); @@ -183,7 +184,7 @@ mod tests { let encoded = alp_encode(&array).unwrap(); assert!(encoded.patches().is_none()); assert_eq!( - encoded.encoded().into_primitive().maybe_null_slice::(), + encoded.encoded().as_primitive().maybe_null_slice::(), vec![0, 1234, 0] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); @@ -201,7 +202,7 @@ mod tests { let encoded = alp_encode(&array).unwrap(); assert!(encoded.patches().is_some()); assert_eq!( - encoded.encoded().into_primitive().maybe_null_slice::(), + encoded.encoded().as_primitive().maybe_null_slice::(), vec![1234i64, 2718, 2718, 4000] // fill forward ); assert_eq!(encoded.exponents(), &Exponents { e: 3, f: 0 }); diff --git a/encodings/alp/src/compute.rs b/encodings/alp/src/compute.rs index 19f7e3636..048caae73 100644 --- a/encodings/alp/src/compute.rs +++ b/encodings/alp/src/compute.rs @@ -1,6 +1,6 @@ -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::{Array, ArrayDType, IntoArray}; use vortex_error::VortexResult; diff --git a/encodings/datetime-parts/src/array.rs b/encodings/datetime-parts/src/array.rs index 0c8e153b1..cc2f04397 100644 --- a/encodings/datetime-parts/src/array.rs +++ b/encodings/datetime-parts/src/array.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_error::vortex_bail; use crate::compute::decode_to_localdatetime; @@ -76,9 +76,9 @@ impl DateTimePartsArray { } } -impl ArrayFlatten for DateTimePartsArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::Extension( +impl IntoCanonical for DateTimePartsArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Extension( decode_to_localdatetime(&self.into_array())?.try_into()?, )) } diff --git a/encodings/datetime-parts/src/compress.rs b/encodings/datetime-parts/src/compress.rs index 0be21f5b0..0957b39cd 100644 --- a/encodings/datetime-parts/src/compress.rs +++ b/encodings/datetime-parts/src/compress.rs @@ -1,8 +1,8 @@ use vortex::array::datetime::{LocalDateTimeArray, TimeUnit}; use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; -use vortex::compute::cast::cast; -use vortex::{Array, ArrayTrait, IntoArray}; +use vortex::compute::unary::cast::try_cast; +use vortex::{Array, ArrayTrait, IntoArray, IntoCanonical}; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -39,7 +39,9 @@ fn compress_localdatetime( like: Option, ctx: Compressor, ) -> VortexResult { - let timestamps = cast(&array.timestamps(), PType::I64.into())?.flatten_primitive()?; + let timestamps = try_cast(&array.timestamps(), PType::I64.into())? + .into_canonical()? 
+ .into_primitive()?; let divisor = match array.time_unit() { TimeUnit::Ns => 1_000_000_000, diff --git a/encodings/datetime-parts/src/compute.rs b/encodings/datetime-parts/src/compute.rs index 42f0496c0..10f772021 100644 --- a/encodings/datetime-parts/src/compute.rs +++ b/encodings/datetime-parts/src/compute.rs @@ -1,11 +1,11 @@ use vortex::array::datetime::{try_parse_time_unit, LocalDateTimeArray, TimeUnit}; use vortex::array::primitive::PrimitiveArray; -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::validity::ArrayValidity; -use vortex::{Array, ArrayDType, IntoArray}; +use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; use vortex_dtype::DType; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; @@ -104,9 +104,9 @@ pub fn decode_to_localdatetime(array: &Array) -> VortexResult 1, }; - let days_buf = array.days().flatten()?.into_array().as_primitive(); - let seconds_buf = array.seconds().flatten()?.into_array().as_primitive(); - let subsecond_buf = array.subsecond().flatten()?.into_array().as_primitive(); + let days_buf = array.days().into_primitive()?; + let seconds_buf = array.seconds().into_primitive()?; + let subsecond_buf = array.subsecond().into_primitive()?; // TODO(aduffy): replace with vectorized implementation? let values = days_buf @@ -127,7 +127,7 @@ pub fn decode_to_localdatetime(array: &Array) -> VortexResult(&reference); let dict = DictArray::try_new(codes.into_array(), values.into_array()).unwrap(); - let flattened_dict = dict.to_array().flatten_primitive().unwrap(); + let flattened_dict = dict + .to_array() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(); assert_eq!(flattened_dict.buffer(), reference.buffer()); } @@ -79,18 +84,43 @@ mod test { ); let (codes, values) = dict_encode_varbin(&reference); let dict = DictArray::try_new(codes.into_array(), values.into_array()).unwrap(); - let flattened_dict = dict.to_array().flatten_varbin().unwrap(); + let flattened_dict = dict + .to_array() + .into_canonical() + .unwrap() + .into_varbin() + .unwrap(); assert_eq!( flattened_dict .offsets() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .buffer(), - reference.offsets().flatten_primitive().unwrap().buffer() + reference + .offsets() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap() + .buffer() ); assert_eq!( - flattened_dict.bytes().flatten_primitive().unwrap().buffer(), - reference.bytes().flatten_primitive().unwrap().buffer() + flattened_dict + .bytes() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap() + .buffer(), + reference + .bytes() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap() + .buffer() ); } } diff --git a/encodings/dict/src/dict.rs b/encodings/dict/src/dict.rs index 366db6ddc..5de3eea39 100644 --- a/encodings/dict/src/dict.rs +++ b/encodings/dict/src/dict.rs @@ -1,11 +1,11 @@ use serde::{Deserialize, Serialize}; use vortex::accessor::ArrayAccessor; use vortex::array::bool::BoolArray; -use vortex::compute::scalar_at::scalar_at; use vortex::compute::take::take; +use vortex::compute::unary::scalar_at::scalar_at; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, 
IntoCanonical}; use vortex_dtype::match_each_integer_ptype; use vortex_error::vortex_bail; @@ -44,9 +44,9 @@ impl DictArray { } } -impl ArrayFlatten for DictArray { - fn flatten(self) -> VortexResult { - take(&self.values(), &self.codes())?.flatten() +impl IntoCanonical for DictArray { + fn into_canonical(self) -> VortexResult { + take(&self.values(), &self.codes())?.into_canonical() } } @@ -62,7 +62,12 @@ impl ArrayValidity for DictArray { fn logical_validity(&self) -> LogicalValidity { if self.dtype().is_nullable() { - let primitive_codes = self.codes().flatten_primitive().unwrap(); + let primitive_codes = self + .codes() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(); match_each_integer_ptype!(primitive_codes.ptype(), |$P| { ArrayAccessor::<$P>::with_iterator(&primitive_codes, |iter| { LogicalValidity::Array( diff --git a/encodings/dict/src/lib.rs b/encodings/dict/src/lib.rs index d8c8ab392..26bd4b80c 100644 --- a/encodings/dict/src/lib.rs +++ b/encodings/dict/src/lib.rs @@ -1,3 +1,7 @@ +//! Implementation of Dictionary encoding. +//! +//! Expose a [DictArray] which is zero-copy equivalent to Arrow's +//! [arrow_array::array::DictionaryArray] type. pub use compress::*; pub use dict::*; diff --git a/encodings/fastlanes/src/bitpacking/compress.rs b/encodings/fastlanes/src/bitpacking/compress.rs index f8c9476ec..01b8adb9a 100644 --- a/encodings/fastlanes/src/bitpacking/compress.rs +++ b/encodings/fastlanes/src/bitpacking/compress.rs @@ -6,6 +6,7 @@ use vortex::array::sparse::{Sparse, SparseArray}; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; use vortex::stats::ArrayStatistics; use vortex::validity::Validity; +use vortex::IntoArrayVariant; use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray}; use vortex_dtype::{ match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType, PType, @@ -188,7 +189,7 @@ pub fn unpack(array: BitPackedArray) -> VortexResult { let bit_width = array.bit_width(); let length = array.len(); let offset = array.offset(); - let packed = array.packed().flatten_primitive()?; + let packed = array.packed().into_primitive()?; let ptype = packed.ptype(); let mut unpacked = match_each_unsigned_integer_ptype!(ptype, |$P| { @@ -217,7 +218,7 @@ fn patch_unpacked(array: PrimitiveArray, patches: &Array) -> VortexResult()) + typed_patches.values().into_primitive()?.maybe_null_slice::<$T>()) }) } _ => panic!("can't patch bitpacked array with {}", patches), @@ -292,7 +293,7 @@ pub fn unpack_primitive( pub fn unpack_single(array: &BitPackedArray, index: usize) -> VortexResult { let bit_width = array.bit_width(); - let packed = array.packed().flatten_primitive()?; + let packed = array.packed().into_primitive()?; let index_in_encoded = index + array.offset(); let scalar: Scalar = match_each_unsigned_integer_ptype!(packed.ptype(), |$P| { unsafe { @@ -364,6 +365,7 @@ fn count_exceptions(bit_width: usize, bit_width_freq: &[usize]) -> usize { #[cfg(test)] mod test { use vortex::encoding::ArrayEncoding; + use vortex::IntoArrayVariant; use vortex::{Context, ToArray}; use vortex_scalar::PrimitiveScalar; @@ -407,7 +409,7 @@ mod test { .compress(values.array(), None) .unwrap(); let compressed = BitPackedArray::try_from(compressed).unwrap(); - let decompressed = compressed.to_array().flatten_primitive().unwrap(); + let decompressed = compressed.to_array().into_primitive().unwrap(); assert_eq!( decompressed.maybe_null_slice::(), values.maybe_null_slice::() diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs 
b/encodings/fastlanes/src/bitpacking/compute/mod.rs index 695abd572..184819fec 100644 --- a/encodings/fastlanes/src/bitpacking/compute/mod.rs +++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs @@ -6,11 +6,11 @@ use itertools::Itertools; use vortex::array::constant::ConstantArray; use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; -use vortex::{Array, ArrayDType, ArrayTrait, IntoArray}; +use vortex::{Array, ArrayDType, ArrayTrait, IntoArray, IntoArrayVariant, IntoCanonical}; use vortex_dtype::{ match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType, PType, }; @@ -67,7 +67,7 @@ impl TakeFn for BitPackedArray { }; } - let indices = indices.clone().flatten_primitive()?; + let indices = indices.clone().into_primitive()?; let taken = match_each_unsigned_integer_ptype!(ptype, |$T| { PrimitiveArray::from_vec(take_primitive::<$T>(self, &indices)?, taken_validity) }); @@ -92,7 +92,7 @@ fn take_primitive( let bit_width = array.bit_width(); - let packed = array.packed().flatten_primitive()?; + let packed = array.packed().into_primitive()?; let packed = packed.maybe_null_slice::(); let patches = array.patches().map(SparseArray::try_from).transpose()?; @@ -163,7 +163,8 @@ fn do_patch_for_take_primitive( let base_index = output.len() - indices.len(); let output_patches = taken_patches .values() - .flatten_primitive()? + .into_canonical()? + .into_primitive()? .reinterpret_cast(T::PTYPE); taken_patches .resolved_indices() @@ -185,9 +186,9 @@ mod test { use vortex::array::primitive::{Primitive, PrimitiveArray}; use vortex::array::sparse::SparseArray; use vortex::compress::Compressor; - use vortex::compute::scalar_at::scalar_at; use vortex::compute::take::take; - use vortex::{ArrayDef, Context, IntoArray}; + use vortex::compute::unary::scalar_at::scalar_at; + use vortex::{ArrayDef, Context, IntoArray, IntoArrayVariant}; use crate::{BitPackedArray, BitPackedEncoding}; @@ -209,7 +210,7 @@ mod test { let result = take(&bitpacked, &indices).unwrap(); assert_eq!(result.encoding().id(), Primitive::ID); - let primitive_result = result.flatten_primitive().unwrap(); + let primitive_result = result.into_primitive().unwrap(); let res_bytes = primitive_result.maybe_null_slice::(); assert_eq!(res_bytes, &[0, 62, 31, 33, 9, 18]); } diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs index daf5874f3..a722e2d15 100644 --- a/encodings/fastlanes/src/bitpacking/mod.rs +++ b/encodings/fastlanes/src/bitpacking/mod.rs @@ -4,7 +4,7 @@ use vortex::array::primitive::{Primitive, PrimitiveArray}; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_dtype::{Nullability, PType}; use vortex_error::{vortex_bail, vortex_err}; @@ -124,9 +124,9 @@ impl BitPackedArray { } } -impl ArrayFlatten for BitPackedArray { - fn flatten(self) -> VortexResult { - unpack(self).map(Flattened::Primitive) +impl IntoCanonical for BitPackedArray { + fn into_canonical(self) -> VortexResult { + unpack(self).map(Canonical::Primitive) } } @@ -170,9 +170,9 
@@ impl ArrayTrait for BitPackedArray { #[cfg(test)] mod test { use vortex::array::primitive::PrimitiveArray; - use vortex::compute::scalar_at::scalar_at; use vortex::compute::slice::slice; - use vortex::IntoArray; + use vortex::compute::unary::scalar_at::scalar_at; + use vortex::{IntoArray, IntoCanonical}; use crate::BitPackedArray; @@ -222,7 +222,9 @@ mod test { let expected = &[1, 0, 1, 0, 1, 0, u64::MAX]; let results = packed .into_array() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); diff --git a/encodings/fastlanes/src/delta/compress.rs b/encodings/fastlanes/src/delta/compress.rs index 022bc35d2..c6b27abee 100644 --- a/encodings/fastlanes/src/delta/compress.rs +++ b/encodings/fastlanes/src/delta/compress.rs @@ -3,9 +3,9 @@ use fastlanes::{Delta, Transpose}; use num_traits::{WrappingAdd, WrappingSub}; use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; -use vortex::compute::fill::fill_forward; +use vortex::compute::unary::fill_forward::fill_forward; use vortex::validity::Validity; -use vortex::{Array, IntoArray}; +use vortex::{Array, IntoArray, IntoArrayVariant}; use vortex_dtype::NativePType; use vortex_dtype::{match_each_unsigned_integer_ptype, Nullability}; use vortex_error::VortexResult; @@ -41,7 +41,7 @@ impl EncodingCompression for DeltaEncoding { let validity = ctx.compress_validity(parray.validity())?; // Fill forward nulls - let filled = fill_forward(array)?.flatten_primitive()?; + let filled = fill_forward(array)?.into_primitive()?; // Compress the filled array let (bases, deltas) = match_each_unsigned_integer_ptype!(parray.ptype(), |$T| { @@ -137,8 +137,8 @@ where } pub fn decompress(array: DeltaArray) -> VortexResult { - let bases = array.bases().flatten_primitive()?; - let deltas = array.deltas().flatten_primitive()?; + let bases = array.bases().into_primitive()?; + let deltas = array.deltas().into_primitive()?; let decoded = match_each_unsigned_integer_ptype!(deltas.ptype(), |$T| { PrimitiveArray::from_vec( decompress_primitive::<$T>(bases.maybe_null_slice(), deltas.maybe_null_slice()), diff --git a/encodings/fastlanes/src/delta/mod.rs b/encodings/fastlanes/src/delta/mod.rs index 716ac70ef..7ca5ca138 100644 --- a/encodings/fastlanes/src/delta/mod.rs +++ b/encodings/fastlanes/src/delta/mod.rs @@ -3,7 +3,7 @@ use vortex::stats::ArrayStatisticsCompute; use vortex::validity::ValidityMetadata; use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_dtype::match_each_unsigned_integer_ptype; use vortex_error::vortex_bail; @@ -94,9 +94,9 @@ impl DeltaArray { } } -impl ArrayFlatten for DeltaArray { - fn flatten(self) -> VortexResult { - decompress(self).map(Flattened::Primitive) +impl IntoCanonical for DeltaArray { + fn into_canonical(self) -> VortexResult { + decompress(self).map(Canonical::Primitive) } } diff --git a/encodings/fastlanes/src/for/compress.rs b/encodings/fastlanes/src/for/compress.rs index 1a219b499..1372d56c9 100644 --- a/encodings/fastlanes/src/for/compress.rs +++ b/encodings/fastlanes/src/for/compress.rs @@ -5,7 +5,7 @@ use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; use vortex::stats::{ArrayStatistics, Stat}; use vortex::validity::ArrayValidity; -use 
vortex::{Array, ArrayDType, ArrayTrait, IntoArray}; +use vortex::{Array, ArrayDType, ArrayTrait, IntoArray, IntoArrayVariant}; use vortex_dtype::{match_each_integer_ptype, NativePType, PType}; use vortex_error::{vortex_err, VortexResult}; use vortex_scalar::Scalar; @@ -103,7 +103,7 @@ fn compress_primitive( pub fn decompress(array: FoRArray) -> VortexResult { let shift = array.shift(); let ptype: PType = array.dtype().try_into()?; - let encoded = array.encoded().flatten_primitive()?; + let encoded = array.encoded().into_primitive()?; Ok(match_each_integer_ptype!(ptype, |$T| { let reference: $T = array.reference().try_into()?; PrimitiveArray::from_vec( @@ -148,9 +148,9 @@ fn trailing_zeros(array: &Array) -> u8 { #[cfg(test)] mod test { - use vortex::compute::scalar_at::ScalarAtFn; + use vortex::compute::unary::scalar_at::ScalarAtFn; use vortex::encoding::{ArrayEncoding, EncodingRef}; - use vortex::Context; + use vortex::{Context, IntoArrayVariant}; use super::*; use crate::BitPackedEncoding; @@ -184,7 +184,7 @@ mod test { .unwrap(); assert_eq!(compressed.encoding().id(), FoREncoding.id()); - let decompressed = compressed.flatten_primitive().unwrap(); + let decompressed = compressed.into_primitive().unwrap(); assert_eq!( decompressed.maybe_null_slice::(), array.maybe_null_slice::() @@ -201,12 +201,12 @@ mod test { let compressed = FoRArray::try_from(compressed).unwrap(); assert_eq!(i8::MIN, i8::try_from(compressed.reference()).unwrap()); - let encoded = compressed.encoded().flatten_primitive().unwrap(); + let encoded = compressed.encoded().into_primitive().unwrap(); let bitcast: &[u8] = unsafe { std::mem::transmute(encoded.maybe_null_slice::()) }; let unsigned: Vec = (0..u8::MAX).collect_vec(); assert_eq!(bitcast, unsigned.as_slice()); - let decompressed = compressed.array().clone().flatten_primitive().unwrap(); + let decompressed = compressed.array().clone().into_primitive().unwrap(); assert_eq!( decompressed.maybe_null_slice::(), array.maybe_null_slice::() diff --git a/encodings/fastlanes/src/for/compute.rs b/encodings/fastlanes/src/for/compute.rs index 463ae7c6c..8c12a2b63 100644 --- a/encodings/fastlanes/src/for/compute.rs +++ b/encodings/fastlanes/src/for/compute.rs @@ -1,6 +1,6 @@ -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::{Array, IntoArray}; use vortex_dtype::match_each_integer_ptype; @@ -68,7 +68,7 @@ impl SliceFn for FoRArray { mod test { use vortex::array::primitive::PrimitiveArray; use vortex::compress::{Compressor, EncodingCompression}; - use vortex::compute::scalar_at::scalar_at; + use vortex::compute::unary::scalar_at::scalar_at; use vortex::Context; use crate::FoREncoding; diff --git a/encodings/fastlanes/src/for/mod.rs b/encodings/fastlanes/src/for/mod.rs index 088c0da55..c1732ff2b 100644 --- a/encodings/fastlanes/src/for/mod.rs +++ b/encodings/fastlanes/src/for/mod.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_error::vortex_bail; use vortex_scalar::Scalar; @@ -61,9 +61,9 @@ impl ArrayValidity for FoRArray { } } -impl ArrayFlatten for FoRArray { - fn flatten(self) -> 
VortexResult { - decompress(self).map(Flattened::Primitive) +impl IntoCanonical for FoRArray { + fn into_canonical(self) -> VortexResult { + decompress(self).map(Canonical::Primitive) } } diff --git a/encodings/roaring/src/boolean/compress.rs b/encodings/roaring/src/boolean/compress.rs index 28d8f4c8d..9bd23a46c 100644 --- a/encodings/roaring/src/boolean/compress.rs +++ b/encodings/roaring/src/boolean/compress.rs @@ -1,7 +1,7 @@ use croaring::Bitmap; use vortex::array::bool::BoolArray; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; -use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray}; +use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, IntoArrayVariant}; use vortex_dtype::DType; use vortex_dtype::Nullability::NonNullable; use vortex_error::VortexResult; @@ -37,7 +37,7 @@ impl EncodingCompression for RoaringBoolEncoding { _like: Option<&Array>, _ctx: Compressor, ) -> VortexResult { - roaring_encode(array.clone().flatten_bool()?).map(move |a| a.into_array()) + roaring_encode(array.clone().into_bool()?).map(move |a| a.into_array()) } } diff --git a/encodings/roaring/src/boolean/compute.rs b/encodings/roaring/src/boolean/compute.rs index d57a6c2a3..7b0c19ea0 100644 --- a/encodings/roaring/src/boolean/compute.rs +++ b/encodings/roaring/src/boolean/compute.rs @@ -1,6 +1,6 @@ use croaring::Bitmap; -use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::slice::SliceFn; +use vortex::compute::unary::scalar_at::ScalarAtFn; use vortex::compute::ArrayCompute; use vortex::{Array, IntoArray}; use vortex_error::VortexResult; diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs index 7ff08b8fb..c682ee3cf 100644 --- a/encodings/roaring/src/boolean/mod.rs +++ b/encodings/roaring/src/boolean/mod.rs @@ -7,7 +7,7 @@ use vortex::array::bool::{Bool, BoolArray}; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::Nullability::Nullable; @@ -86,8 +86,8 @@ impl ArrayValidity for RoaringBoolArray { } } -impl ArrayFlatten for RoaringBoolArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for RoaringBoolArray { + fn into_canonical(self) -> VortexResult { // TODO(ngates): benchmark the fastest conversion from BitMap. // Via bitset requires two copies. 
let bitset = self @@ -97,7 +97,7 @@ impl ArrayFlatten for RoaringBoolArray { let bytes = &bitset.as_slice()[0..bitset.size_in_bytes()]; let buffer = ArrowBuffer::from_slice_ref(bytes); - Ok(Flattened::Bool(BoolArray::try_new( + Ok(Canonical::Bool(BoolArray::try_new( BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), match self.dtype().nullability() { NonNullable => Validity::NonNullable, @@ -110,7 +110,7 @@ impl ArrayFlatten for RoaringBoolArray { #[cfg(test)] mod test { use vortex::array::bool::BoolArray; - use vortex::compute::scalar_at::scalar_at; + use vortex::compute::unary::scalar_at::scalar_at; use vortex::IntoArray; use vortex_error::VortexResult; use vortex_scalar::Scalar; diff --git a/encodings/roaring/src/integer/compress.rs b/encodings/roaring/src/integer/compress.rs index f0615d221..8c2a8d124 100644 --- a/encodings/roaring/src/integer/compress.rs +++ b/encodings/roaring/src/integer/compress.rs @@ -4,6 +4,7 @@ use num_traits::NumCast; use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, Compressor, EncodingCompression}; use vortex::stats::ArrayStatistics; +use vortex::IntoArrayVariant; use vortex::{Array, ArrayDType, ArrayDef, IntoArray}; use vortex_dtype::{NativePType, PType}; use vortex_error::VortexResult; @@ -52,7 +53,7 @@ impl EncodingCompression for RoaringIntEncoding { _like: Option<&Array>, _ctx: Compressor, ) -> VortexResult { - let parray = array.clone().flatten_primitive()?; + let parray = array.clone().into_primitive()?; Ok(roaring_encode(parray).into_array()) } } diff --git a/encodings/roaring/src/integer/compute.rs b/encodings/roaring/src/integer/compute.rs index fa6bf0f9b..f9f097c1f 100644 --- a/encodings/roaring/src/integer/compute.rs +++ b/encodings/roaring/src/integer/compute.rs @@ -1,4 +1,4 @@ -use vortex::compute::scalar_at::ScalarAtFn; +use vortex::compute::unary::scalar_at::ScalarAtFn; use vortex::compute::ArrayCompute; use vortex_dtype::PType; use vortex_error::VortexResult; diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index d8020240b..7ac447331 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -5,7 +5,7 @@ use vortex::array::primitive::{Primitive, PrimitiveArray}; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayFlatten}; +use vortex::{impl_encoding, Canonical, IntoCanonical}; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::PType; @@ -71,17 +71,17 @@ impl RoaringIntArray { } impl ArrayValidity for RoaringIntArray { - fn logical_validity(&self) -> LogicalValidity { - LogicalValidity::AllValid(self.bitmap().iter().count()) - } - fn is_valid(&self, _index: usize) -> bool { true } + + fn logical_validity(&self) -> LogicalValidity { + LogicalValidity::AllValid(self.bitmap().iter().count()) + } } -impl ArrayFlatten for RoaringIntArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for RoaringIntArray { + fn into_canonical(self) -> VortexResult { todo!() } } @@ -103,7 +103,7 @@ impl ArrayTrait for RoaringIntArray { #[cfg(test)] mod test { use vortex::array::primitive::PrimitiveArray; - use vortex::compute::scalar_at::scalar_at; + use vortex::compute::unary::scalar_at::scalar_at; use vortex::IntoArray; use vortex_error::VortexResult; diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 84d9653b6..54cf6ad9d 100644 
--- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -207,8 +207,8 @@ mod test { .unwrap(); let decoded = ree_decode( - &arr.ends().into_primitive(), - &arr.values().into_primitive(), + &arr.ends().as_primitive(), + &arr.values().as_primitive(), arr.validity(), 0, arr.len(), diff --git a/encodings/runend/src/compute.rs b/encodings/runend/src/compute.rs index 3948f4252..25c70a8ad 100644 --- a/encodings/runend/src/compute.rs +++ b/encodings/runend/src/compute.rs @@ -1,9 +1,9 @@ use vortex::array::primitive::PrimitiveArray; -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; -use vortex::{Array, IntoArray}; +use vortex::{Array, IntoArray, IntoArrayVariant}; use vortex_dtype::match_each_integer_ptype; use vortex_error::VortexResult; use vortex_scalar::Scalar; @@ -32,7 +32,7 @@ impl ScalarAtFn for REEArray { impl TakeFn for REEArray { fn take(&self, indices: &Array) -> VortexResult { - let primitive_indices = indices.clone().flatten_primitive()?; + let primitive_indices = indices.clone().into_primitive()?; let physical_indices = match_each_integer_ptype!(primitive_indices.ptype(), |$P| { primitive_indices .maybe_null_slice::<$P>() @@ -69,7 +69,7 @@ impl SliceFn for REEArray { mod test { use vortex::array::primitive::PrimitiveArray; use vortex::compute::take::take; - use vortex::ToArray; + use vortex::{IntoCanonical, ToArray}; use crate::REEArray; @@ -81,7 +81,12 @@ mod test { .unwrap(); let taken = take(ree.array(), PrimitiveArray::from(vec![9, 8, 1, 3]).array()).unwrap(); assert_eq!( - taken.flatten_primitive().unwrap().maybe_null_slice::(), + taken + .into_canonical() + .unwrap() + .into_primitive() + .unwrap() + .maybe_null_slice::(), &[5, 5, 1, 4] ); } diff --git a/encodings/runend/src/ree.rs b/encodings/runend/src/ree.rs index bacb86aeb..2f513d220 100644 --- a/encodings/runend/src/ree.rs +++ b/encodings/runend/src/ree.rs @@ -1,11 +1,11 @@ use serde::{Deserialize, Serialize}; use vortex::array::primitive::{Primitive, PrimitiveArray}; -use vortex::compute::scalar_at::scalar_at; use vortex::compute::search_sorted::{search_sorted, SearchSortedSide}; +use vortex::compute::unary::scalar_at::scalar_at; use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute}; use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoArrayVariant, IntoCanonical}; use vortex_error::vortex_bail; use crate::compress::{ree_decode, ree_encode}; @@ -107,12 +107,12 @@ impl ArrayValidity for REEArray { } } -impl ArrayFlatten for REEArray { - fn flatten(self) -> VortexResult { - let pends = self.ends().flatten_primitive()?; - let pvalues = self.values().flatten_primitive()?; +impl IntoCanonical for REEArray { + fn into_canonical(self) -> VortexResult { + let pends = self.ends().into_primitive()?; + let pvalues = self.values().into_primitive()?; ree_decode(&pends, &pvalues, self.validity(), self.offset(), self.len()) - .map(Flattened::Primitive) + .map(Canonical::Primitive) } } @@ -134,10 +134,10 @@ impl ArrayTrait for REEArray { #[cfg(test)] mod test { - use vortex::compute::scalar_at::scalar_at; use vortex::compute::slice::slice; + use vortex::compute::unary::scalar_at::scalar_at; use 
vortex::validity::Validity; - use vortex::{ArrayDType, ArrayTrait, IntoArray}; + use vortex::{ArrayDType, ArrayTrait, IntoArray, IntoCanonical}; use vortex_dtype::{DType, Nullability, PType}; use crate::REEArray; @@ -186,7 +186,11 @@ mod test { assert_eq!(arr.len(), 5); assert_eq!( - arr.flatten_primitive().unwrap().maybe_null_slice::(), + arr.into_canonical() + .unwrap() + .into_primitive() + .unwrap() + .maybe_null_slice::(), vec![2, 2, 3, 3, 3] ); } @@ -199,9 +203,11 @@ mod test { Validity::NonNullable, ) .unwrap(); + assert_eq!( - arr.into_array() - .flatten_primitive() + arr.into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::(), vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3] diff --git a/encodings/zigzag/src/compute.rs b/encodings/zigzag/src/compute.rs index 009fbdb63..57a51413e 100644 --- a/encodings/zigzag/src/compute.rs +++ b/encodings/zigzag/src/compute.rs @@ -1,5 +1,5 @@ -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; +use vortex::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::{Array, IntoArray}; use vortex_dtype::PType; diff --git a/encodings/zigzag/src/zigzag.rs b/encodings/zigzag/src/zigzag.rs index c00c76e23..0a83fc569 100644 --- a/encodings/zigzag/src/zigzag.rs +++ b/encodings/zigzag/src/zigzag.rs @@ -3,7 +3,7 @@ use vortex::array::primitive::PrimitiveArray; use vortex::stats::ArrayStatisticsCompute; use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, ArrayDType, ArrayFlatten}; +use vortex::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; use vortex_dtype::PType; use vortex_error::{vortex_bail, vortex_err}; @@ -66,8 +66,8 @@ impl AcceptArrayVisitor for ZigZagArray { impl ArrayStatisticsCompute for ZigZagArray {} -impl ArrayFlatten for ZigZagArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for ZigZagArray { + fn into_canonical(self) -> VortexResult { todo!("ZigZagArray::flatten") } } diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 75ecb60f1..521571894 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -4,8 +4,8 @@ use vortex::array::bool::{Bool, BoolArray, BoolEncoding}; use vortex::array::chunked::{Chunked, ChunkedArray, ChunkedEncoding}; use vortex::array::constant::{Constant, ConstantArray, ConstantEncoding}; use vortex::array::primitive::{Primitive, PrimitiveArray, PrimitiveEncoding}; -use vortex::array::r#struct::{Struct, StructArray, StructEncoding}; use vortex::array::sparse::{Sparse, SparseArray, SparseEncoding}; +use vortex::array::struct_::{Struct, StructArray, StructEncoding}; use vortex::array::varbin::{VarBin, VarBinArray, VarBinEncoding}; use vortex::array::varbinview::{VarBinView, VarBinViewArray, VarBinViewEncoding}; use vortex::compute::take::take; diff --git a/pyvortex/src/vortex_arrow.rs b/pyvortex/src/vortex_arrow.rs index f9cb05c1a..cdb1d05a7 100644 --- a/pyvortex/src/vortex_arrow.rs +++ b/pyvortex/src/vortex_arrow.rs @@ -5,7 +5,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyList}; use vortex::array::chunked::ChunkedArray; -use vortex::Array; +use vortex::{Array, IntoCanonical}; pub fn map_arrow_err(error: ArrowError) -> PyErr { PyValueError::new_err(error.to_string()) @@ -17,10 +17,10 @@ pub fn export_array<'py>(py: Python<'py>, array: &Array) -> PyResult = if let Ok(chunked_array) = ChunkedArray::try_from(array) { chunked_array .chunks() - 
.map(|chunk| chunk.flatten().unwrap().into_arrow()) + .map(|chunk| chunk.into_canonical().unwrap().into_arrow()) .collect() } else { - vec![array.clone().flatten().unwrap().into_arrow()] + vec![array.clone().into_canonical().unwrap().into_arrow()] }; if chunks.is_empty() { return Err(PyValueError::new_err("No chunks in array")); } diff --git a/vortex-array/benches/scalar_subtract.rs b/vortex-array/benches/scalar_subtract.rs index 70dd88af5..90096d301 100644 --- a/vortex-array/benches/scalar_subtract.rs +++ b/vortex-array/benches/scalar_subtract.rs @@ -26,9 +26,11 @@ fn scalar_subtract(c: &mut Criterion) { group.bench_function("vortex", |b| { b.iter(|| { - let array = - vortex::compute::scalar_subtract::subtract_scalar(&chunked, &to_subtract.into()) - .unwrap(); + let array = vortex::compute::unary::scalar_subtract::subtract_scalar( + &chunked, + &to_subtract.into(), + ) + .unwrap(); let chunked = ChunkedArray::try_from(array).unwrap(); black_box(chunked); diff --git a/vortex-array/src/accessor.rs b/vortex-array/src/accessor.rs index fe61bdf76..0f29ee949 100644 --- a/vortex-array/src/accessor.rs +++ b/vortex-array/src/accessor.rs @@ -1,7 +1,13 @@ use vortex_error::VortexResult; +/// Trait for arrays that support iterative access to their elements. pub trait ArrayAccessor<T: ?Sized> { + /// Iterate over each element of the array, in-order. + /// + /// The function `f` will be passed an [`Iterator`]; it can call [`next`][Iterator::next] on the + /// iterator [`len`][crate::Array::len] times. Iterator elements are `Option` types, regardless + /// of the nullability of the underlying array data. fn with_iterator<F, R>(&self, f: F) -> VortexResult<R> where - F: for<'a> FnOnce(&mut (dyn Iterator<Item = Option<&'a T>>)) -> R; + F: for<'a> FnOnce(&mut dyn Iterator<Item = Option<&'a T>>) -> R; } diff --git a/vortex-array/src/array/bool/accessors.rs b/vortex-array/src/array/bool/accessors.rs new file mode 100644 index 000000000..e35b25f8b --- /dev/null +++ b/vortex-array/src/array/bool/accessors.rs @@ -0,0 +1,57 @@ +use vortex_error::VortexResult; + +use crate::accessor::ArrayAccessor; +use crate::array::bool::BoolArray; +use crate::validity::Validity; +use crate::{ArrayTrait, IntoArrayVariant}; + +static TRUE: bool = true; +static FALSE: bool = false; + +impl ArrayAccessor<bool> for BoolArray { + fn with_iterator<F, R>(&self, f: F) -> VortexResult<R> + where + F: for<'a> FnOnce(&mut dyn Iterator<Item = Option<&'a bool>>) -> R, + { + let bools = self.boolean_buffer(); + match self.validity() { + Validity::NonNullable | Validity::AllValid => Ok(f(&mut bools + .iter() + .map(|b| Some(if b { &TRUE } else { &FALSE })))), + Validity::AllInvalid => Ok(f(&mut (0..self.len()).map(|_| None))), + Validity::Array(valid) => { + let valids = valid.into_bool()?.boolean_buffer(); + let mut iter = valids.iter().zip(bools.iter()).map(|(is_valid, value)| { + if is_valid { + Some(if value { &TRUE } else { &FALSE }) + } else { + None + } + }); + + Ok(f(&mut iter)) + } + } + } +} + +#[cfg(test)] +mod test { + use crate::accessor::ArrayAccessor; + use crate::array::bool::BoolArray; + + #[test] + fn test_bool_accessor() { + let original = vec![Some(true), None, Some(false), None]; + let array = BoolArray::from_iter(original.clone()); + + let bool_vec: Vec<Option<bool>> = + ArrayAccessor::<bool>::with_iterator(&array, |values_iter| { + values_iter + .map(|b| b.cloned()) + .collect::<Vec<Option<bool>>>() + }) + .unwrap(); + assert_eq!(bool_vec, original); + } +} diff --git a/vortex-array/src/array/bool/compute/compare.rs b/vortex-array/src/array/bool/compute/compare.rs index 
d359cfa40..d333c9cc3 100644 --- a/vortex-array/src/array/bool/compute/compare.rs +++ b/vortex-array/src/array/bool/compute/compare.rs @@ -5,11 +5,11 @@ use vortex_expr::Operator; use crate::array::bool::BoolArray; use crate::compute::compare::CompareFn; -use crate::{Array, ArrayTrait, IntoArray}; +use crate::{Array, ArrayTrait, IntoArray, IntoArrayVariant}; impl CompareFn for BoolArray { fn compare(&self, other: &Array, op: Operator) -> VortexResult { - let flattened = other.clone().flatten_bool()?; + let flattened = other.clone().into_bool()?; let lhs = self.boolean_buffer(); let rhs = flattened.boolean_buffer(); let result_buf = match op { @@ -39,6 +39,7 @@ mod test { use super::*; use crate::compute::compare::compare; use crate::validity::Validity; + use crate::IntoArrayVariant; fn to_int_indices(indices_bits: BoolArray) -> Vec { let filtered = indices_bits @@ -58,10 +59,10 @@ mod test { ) .into_array(); - let matches = compare(&arr, &arr, Operator::Eq)?.flatten_bool()?; + let matches = compare(&arr, &arr, Operator::Eq)?.into_bool()?; assert_eq!(to_int_indices(matches), [1u64, 2, 3, 4]); - let matches = compare(&arr, &arr, Operator::NotEq)?.flatten_bool()?; + let matches = compare(&arr, &arr, Operator::NotEq)?.into_bool()?; let empty: [u64; 0] = []; assert_eq!(to_int_indices(matches), empty); @@ -71,16 +72,16 @@ mod test { ) .into_array(); - let matches = compare(&arr, &other, Operator::Lte)?.flatten_bool()?; + let matches = compare(&arr, &other, Operator::Lte)?.into_bool()?; assert_eq!(to_int_indices(matches), [2u64, 3, 4]); - let matches = compare(&arr, &other, Operator::Lt)?.flatten_bool()?; + let matches = compare(&arr, &other, Operator::Lt)?.into_bool()?; assert_eq!(to_int_indices(matches), [4u64]); - let matches = compare(&other, &arr, Operator::Gte)?.flatten_bool()?; + let matches = compare(&other, &arr, Operator::Gte)?.into_bool()?; assert_eq!(to_int_indices(matches), [2u64, 3, 4]); - let matches = compare(&other, &arr, Operator::Gt)?.flatten_bool()?; + let matches = compare(&other, &arr, Operator::Gt)?.into_bool()?; assert_eq!(to_int_indices(matches), [4u64]); Ok(()) } diff --git a/vortex-array/src/array/bool/compute/fill.rs b/vortex-array/src/array/bool/compute/fill.rs index f4980601f..43a96dcc4 100644 --- a/vortex-array/src/array/bool/compute/fill.rs +++ b/vortex-array/src/array/bool/compute/fill.rs @@ -2,7 +2,7 @@ use vortex_dtype::Nullability; use vortex_error::VortexResult; use crate::array::bool::BoolArray; -use crate::compute::fill::FillForwardFn; +use crate::compute::unary::fill_forward::FillForwardFn; use crate::validity::ArrayValidity; use crate::{Array, ArrayDType, IntoArray, ToArrayData}; @@ -39,7 +39,9 @@ mod test { fn fill_forward() { let barr = BoolArray::from_iter(vec![None, Some(false), None, Some(true), None]).into_array(); - let filled_bool = BoolArray::try_from(compute::fill::fill_forward(&barr).unwrap()).unwrap(); + let filled_bool = + BoolArray::try_from(compute::unary::fill_forward::fill_forward(&barr).unwrap()) + .unwrap(); assert_eq!( filled_bool.boolean_buffer().iter().collect::>(), vec![false, false, false, true, true] diff --git a/vortex-array/src/array/bool/compute/mod.rs b/vortex-array/src/array/bool/compute/mod.rs index baea8e09c..39100fa22 100644 --- a/vortex-array/src/array/bool/compute/mod.rs +++ b/vortex-array/src/array/bool/compute/mod.rs @@ -1,9 +1,9 @@ use crate::array::bool::BoolArray; use crate::compute::compare::CompareFn; -use crate::compute::fill::FillForwardFn; -use crate::compute::scalar_at::ScalarAtFn; use 
crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; +use crate::compute::unary::fill_forward::FillForwardFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; mod compare; diff --git a/vortex-array/src/array/bool/compute/scalar_at.rs b/vortex-array/src/array/bool/compute/scalar_at.rs index 68c6570b7..f6971cdc7 100644 --- a/vortex-array/src/array/bool/compute/scalar_at.rs +++ b/vortex-array/src/array/bool/compute/scalar_at.rs @@ -2,7 +2,7 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::bool::BoolArray; -use crate::compute::scalar_at::ScalarAtFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::validity::ArrayValidity; use crate::ArrayDType; diff --git a/vortex-array/src/array/bool/compute/take.rs b/vortex-array/src/array/bool/compute/take.rs index 008e8368b..39318faa4 100644 --- a/vortex-array/src/array/bool/compute/take.rs +++ b/vortex-array/src/array/bool/compute/take.rs @@ -6,13 +6,13 @@ use vortex_error::VortexResult; use crate::array::bool::BoolArray; use crate::compute::take::TakeFn; use crate::Array; -use crate::AsArray; use crate::IntoArray; +use crate::{AsArray, IntoArrayVariant}; impl TakeFn for BoolArray { fn take(&self, indices: &Array) -> VortexResult { let validity = self.validity(); - let indices = indices.clone().flatten_primitive()?; + let indices = indices.clone().into_primitive()?; match_each_integer_ptype!(indices.ptype(), |$I| { Ok(BoolArray::from_vec( take_bool(&self.boolean_buffer(), indices.maybe_null_slice::<$I>()), diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs index a6931825a..79246f9f0 100644 --- a/vortex-array/src/array/bool/mod.rs +++ b/vortex-array/src/array/bool/mod.rs @@ -6,8 +6,9 @@ use vortex_buffer::Buffer; use crate::validity::{ArrayValidity, ValidityMetadata}; use crate::validity::{LogicalValidity, Validity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayFlatten}; +use crate::{impl_encoding, Canonical, IntoCanonical}; +mod accessors; mod compute; mod stats; @@ -92,9 +93,9 @@ impl ArrayTrait for BoolArray { } } -impl ArrayFlatten for BoolArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::Bool(self)) +impl IntoCanonical for BoolArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Bool(self)) } } @@ -120,7 +121,7 @@ impl EncodingCompression for BoolEncoding {} #[cfg(test)] mod tests { use crate::array::bool::BoolArray; - use crate::compute::scalar_at::scalar_at; + use crate::compute::unary::scalar_at::scalar_at; use crate::IntoArray; #[test] diff --git a/vortex-array/src/array/bool/stats.rs b/vortex-array/src/array/bool/stats.rs index 9a54eb29d..22588e9f5 100644 --- a/vortex-array/src/array/bool/stats.rs +++ b/vortex-array/src/array/bool/stats.rs @@ -6,7 +6,7 @@ use vortex_error::VortexResult; use crate::array::bool::BoolArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use crate::validity::{ArrayValidity, LogicalValidity}; -use crate::{ArrayDType, ArrayTrait}; +use crate::{ArrayDType, ArrayTrait, IntoArrayVariant}; impl ArrayStatisticsCompute for BoolArray { fn compute_statistics(&self, stat: Stat) -> VortexResult { @@ -19,7 +19,7 @@ impl ArrayStatisticsCompute for BoolArray { LogicalValidity::AllInvalid(v) => Ok(StatsSet::nulls(v, self.dtype())), LogicalValidity::Array(a) => NullableBools( &self.boolean_buffer(), - &a.clone().flatten_bool()?.boolean_buffer(), + &a.clone().into_bool()?.boolean_buffer(), ) .compute_statistics(stat), } diff 
--git a/vortex-array/src/array/chunked/flatten.rs b/vortex-array/src/array/chunked/canonical.rs similarity index 80% rename from vortex-array/src/array/chunked/flatten.rs rename to vortex-array/src/array/chunked/canonical.rs index 7a374ae0a..a7b4bd5ca 100644 --- a/vortex-array/src/array/chunked/flatten.rs +++ b/vortex-array/src/array/chunked/canonical.rs @@ -9,19 +9,22 @@ use crate::array::chunked::ChunkedArray; use crate::array::extension::ExtensionArray; use crate::array::null::NullArray; use crate::array::primitive::PrimitiveArray; -use crate::array::r#struct::StructArray; +use crate::array::struct_::StructArray; use crate::array::varbin::builder::VarBinBuilder; use crate::array::varbin::VarBinArray; use crate::validity::Validity; -use crate::{Array, ArrayDType, ArrayFlatten, ArrayTrait, ArrayValidity, Flattened, IntoArray}; - -impl ArrayFlatten for ChunkedArray { - fn flatten(self) -> VortexResult { - try_flatten_chunks(self.chunks().collect(), self.dtype().clone()) +use crate::{ + Array, ArrayDType, ArrayTrait, ArrayValidity, Canonical, IntoArray, IntoArrayVariant, + IntoCanonical, +}; + +impl IntoCanonical for ChunkedArray { + fn into_canonical(self) -> VortexResult { + try_canonicalize_chunks(self.chunks().collect(), self.dtype().clone()) } } -pub(crate) fn try_flatten_chunks(chunks: Vec, dtype: DType) -> VortexResult { +pub(crate) fn try_canonicalize_chunks(chunks: Vec, dtype: DType) -> VortexResult { let mismatched = chunks .iter() .filter(|chunk| !chunk.dtype().eq(&dtype)) @@ -35,7 +38,7 @@ pub(crate) fn try_flatten_chunks(chunks: Vec, dtype: DType) -> VortexResu // one level internally without copying or decompressing any data. DType::Struct(struct_dtype, _) => { let struct_array = swizzle_struct_chunks(chunks.as_slice(), struct_dtype)?; - Ok(Flattened::Struct(struct_array)) + Ok(Canonical::Struct(struct_array)) } // Extension arrays wrap an internal storage array, which can hold a ChunkedArray until @@ -46,34 +49,34 @@ pub(crate) fn try_flatten_chunks(chunks: Vec, dtype: DType) -> VortexResu ChunkedArray::try_new(chunks, dtype.clone())?.into_array(), ); - Ok(Flattened::Extension(ext_array)) + Ok(Canonical::Extension(ext_array)) } - // Lists just flatten into their inner PType + // TODO(aduffy): better list support DType::List(..) => { todo!() } DType::Bool(nullability) => { let bool_array = pack_bools(chunks.as_slice(), *nullability)?; - Ok(Flattened::Bool(bool_array)) + Ok(Canonical::Bool(bool_array)) } DType::Primitive(ptype, nullability) => { let prim_array = pack_primitives(chunks.as_slice(), *ptype, *nullability)?; - Ok(Flattened::Primitive(prim_array)) + Ok(Canonical::Primitive(prim_array)) } DType::Utf8(nullability) => { let varbin_array = pack_varbin(chunks.as_slice(), &dtype, *nullability)?; - Ok(Flattened::VarBin(varbin_array)) + Ok(Canonical::VarBin(varbin_array)) } DType::Binary(nullability) => { let varbin_array = pack_varbin(chunks.as_slice(), &dtype, *nullability)?; - Ok(Flattened::VarBin(varbin_array)) + Ok(Canonical::VarBin(varbin_array)) } DType::Null => { let len = chunks.iter().map(|chunk| chunk.len()).sum(); let null_array = NullArray::new(len); - Ok(Flattened::Null(null_array)) + Ok(Canonical::Null(null_array)) } } } @@ -81,7 +84,7 @@ pub(crate) fn try_flatten_chunks(chunks: Vec, dtype: DType) -> VortexResu /// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single /// StructArray, where the Array for each Field is a ChunkedArray. 
/// -/// It is expected this function is only called from [try_flatten_chunks], and thus all chunks have +/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have /// been checked to have the same DType already. fn swizzle_struct_chunks( chunks: &[Array], @@ -115,14 +118,14 @@ fn swizzle_struct_chunks( /// Builds a new [BoolArray] by repacking the values from the chunks in a single contiguous array. /// -/// It is expected this function is only called from [try_flatten_chunks], and thus all chunks have +/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have /// been checked to have the same DType already. fn pack_bools(chunks: &[Array], nullability: Nullability) -> VortexResult { let len = chunks.iter().map(|chunk| chunk.len()).sum(); let validity = validity_from_chunks(chunks, nullability); let mut bools = Vec::with_capacity(len); for chunk in chunks { - let chunk = chunk.clone().flatten_bool()?; + let chunk = chunk.clone().into_bool()?; bools.extend(chunk.boolean_buffer().iter()); } @@ -132,7 +135,7 @@ fn pack_bools(chunks: &[Array], nullability: Nullability) -> VortexResult::with_capacity(len); for chunk in chunks { - let chunk = chunk.clone().flatten_varbin()?; + let chunk = chunk.clone().into_varbin()?; chunk.with_iterator(|iter| { for datum in iter { builder.push(datum); diff --git a/vortex-array/src/array/chunked/compute/mod.rs b/vortex-array/src/array/chunked/compute/mod.rs index 0469a11b9..e165d4bdd 100644 --- a/vortex-array/src/array/chunked/compute/mod.rs +++ b/vortex-array/src/array/chunked/compute/mod.rs @@ -2,10 +2,10 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::chunked::ChunkedArray; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::scalar_subtract::SubtractScalarFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; +use crate::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; +use crate::compute::unary::scalar_subtract::SubtractScalarFn; use crate::compute::ArrayCompute; mod slice; diff --git a/vortex-array/src/array/chunked/compute/take.rs b/vortex-array/src/array/chunked/compute/take.rs index c57e8e10e..08647131d 100644 --- a/vortex-array/src/array/chunked/compute/take.rs +++ b/vortex-array/src/array/chunked/compute/take.rs @@ -2,8 +2,9 @@ use vortex_dtype::PType; use vortex_error::VortexResult; use crate::array::chunked::ChunkedArray; -use crate::compute::cast::cast; +use crate::array::primitive::PrimitiveArray; use crate::compute::take::{take, TakeFn}; +use crate::compute::unary::cast::try_cast; use crate::{Array, IntoArray, ToArray}; use crate::{ArrayDType, ArrayTrait}; @@ -13,7 +14,7 @@ impl TakeFn for ChunkedArray { return Ok(self.to_array()); } - let indices = cast(indices, PType::U64.into())?.flatten_primitive()?; + let indices = PrimitiveArray::try_from(try_cast(indices, PType::U64.into())?)?; // While the chunk idx remains the same, accumulate a list of chunk indices. 
let mut chunks = Vec::new(); @@ -54,7 +55,7 @@ impl TakeFn for ChunkedArray { mod test { use crate::array::chunked::ChunkedArray; use crate::compute::take::take; - use crate::{ArrayDType, ArrayTrait, AsArray, IntoArray}; + use crate::{ArrayDType, ArrayTrait, AsArray, IntoArray, IntoCanonical}; #[test] fn test_take() { @@ -68,7 +69,9 @@ mod test { let result = &ChunkedArray::try_from(take(arr.as_array_ref(), &indices).unwrap()) .unwrap() .into_array() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap(); assert_eq!(result.maybe_null_slice::(), &[1, 1, 1, 2]); } diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index 1b6eb9150..080e5acb5 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -1,3 +1,6 @@ +//! First-class chunked arrays. +//! +//! Vortex is a chunked array library that's able to use futures_util::stream; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -6,9 +9,9 @@ use vortex_error::vortex_bail; use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; -use crate::compute::scalar_at::scalar_at; -use crate::compute::scalar_subtract::{subtract_scalar, SubtractScalarFn}; use crate::compute::search_sorted::{search_sorted, SearchResult, SearchSortedSide}; +use crate::compute::unary::scalar_at::scalar_at; +use crate::compute::unary::scalar_subtract::{subtract_scalar, SubtractScalarFn}; use crate::iter::{ArrayIterator, ArrayIteratorAdapter}; use crate::stream::{ArrayStream, ArrayStreamAdapter}; use crate::validity::Validity::NonNullable; @@ -16,8 +19,8 @@ use crate::validity::{ArrayValidity, LogicalValidity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::{impl_encoding, ArrayDType}; +mod canonical; mod compute; -mod flatten; mod stats; impl_encoding!("vortex.chunked", Chunked); @@ -156,9 +159,9 @@ mod test { use vortex_dtype::{NativePType, PType}; use crate::array::chunked::ChunkedArray; - use crate::compute::scalar_subtract::subtract_scalar; use crate::compute::slice::slice; - use crate::{Array, IntoArray, ToArray}; + use crate::compute::unary::scalar_subtract::subtract_scalar; + use crate::{Array, IntoArray, IntoArrayVariant, IntoCanonical, ToArray}; fn chunked_array() -> ChunkedArray { ChunkedArray::try_new( @@ -177,7 +180,7 @@ mod test { ChunkedArray::try_from(arr) .unwrap() .chunks() - .map(|a| a.flatten_primitive().unwrap()) + .map(|a| a.into_primitive().unwrap()) .for_each(|a| values.extend_from_slice(a.maybe_null_slice::())); assert_eq!(values, slice); } @@ -222,7 +225,9 @@ mod test { let results = chunks_out .next() .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); @@ -230,7 +235,9 @@ mod test { let results = chunks_out .next() .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); @@ -238,7 +245,9 @@ mod test { let results = chunks_out .next() .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index 103b1693f..0402af880 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -2,8 +2,8 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::constant::ConstantArray; -use crate::compute::scalar_at::ScalarAtFn; use 
crate::compute::take::TakeFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::{Array, IntoArray}; diff --git a/vortex-array/src/array/constant/flatten.rs b/vortex-array/src/array/constant/flatten.rs index cde9ec3ba..661eaeaa7 100644 --- a/vortex-array/src/array/constant/flatten.rs +++ b/vortex-array/src/array/constant/flatten.rs @@ -7,10 +7,10 @@ use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; use crate::validity::Validity; use crate::{ArrayDType, ArrayTrait}; -use crate::{ArrayFlatten, Flattened}; +use crate::{Canonical, IntoCanonical}; -impl ArrayFlatten for ConstantArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for ConstantArray { + fn into_canonical(self) -> VortexResult { let validity = match self.dtype().nullability() { Nullability::NonNullable => Validity::NonNullable, Nullability::Nullable => match self.scalar().is_null() { @@ -20,7 +20,7 @@ impl ArrayFlatten for ConstantArray { }; if let Ok(b) = BoolScalar::try_from(self.scalar()) { - return Ok(Flattened::Bool(BoolArray::from_vec( + return Ok(Canonical::Bool(BoolArray::from_vec( vec![b.value().unwrap_or_default(); self.len()], validity, ))); @@ -28,7 +28,7 @@ impl ArrayFlatten for ConstantArray { if let Ok(ptype) = PType::try_from(self.scalar().dtype()) { return match_each_native_ptype!(ptype, |$P| { - Ok(Flattened::Primitive(PrimitiveArray::from_vec::<$P>( + Ok(Canonical::Primitive(PrimitiveArray::from_vec::<$P>( vec![$P::try_from(self.scalar()).unwrap_or_else(|_| $P::default()); self.len()], validity, ))) diff --git a/vortex-array/src/array/extension/compute.rs b/vortex-array/src/array/extension/compute.rs index fb007fee7..9e603d1ad 100644 --- a/vortex-array/src/array/extension/compute.rs +++ b/vortex-array/src/array/extension/compute.rs @@ -2,10 +2,10 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::extension::ExtensionArray; -use crate::compute::cast::CastFn; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::slice::{slice, SliceFn}; use crate::compute::take::{take, TakeFn}; +use crate::compute::unary::cast::CastFn; +use crate::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::ArrayCompute; use crate::{Array, IntoArray}; diff --git a/vortex-array/src/array/extension/mod.rs b/vortex-array/src/array/extension/mod.rs index a8d505984..9853f6e4d 100644 --- a/vortex-array/src/array/extension/mod.rs +++ b/vortex-array/src/array/extension/mod.rs @@ -4,7 +4,7 @@ use vortex_dtype::{ExtDType, ExtID}; use crate::stats::ArrayStatisticsCompute; use crate::validity::{ArrayValidity, LogicalValidity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayDType, ArrayFlatten}; +use crate::{impl_encoding, ArrayDType, Canonical, IntoCanonical}; mod compute; @@ -49,9 +49,9 @@ impl ExtensionArray { } } -impl ArrayFlatten for ExtensionArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::Extension(self)) +impl IntoCanonical for ExtensionArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Extension(self)) } } diff --git a/vortex-array/src/array/mod.rs b/vortex-array/src/array/mod.rs index 9a5a9ed35..31487ef3a 100644 --- a/vortex-array/src/array/mod.rs +++ b/vortex-array/src/array/mod.rs @@ -6,6 +6,6 @@ pub mod extension; pub mod null; pub mod primitive; pub mod sparse; -pub mod r#struct; +pub mod struct_; pub mod varbin; pub mod varbinview; diff --git a/vortex-array/src/array/null/compute.rs 
b/vortex-array/src/array/null/compute.rs index c627f29b8..acf6e2118 100644 --- a/vortex-array/src/array/null/compute.rs +++ b/vortex-array/src/array/null/compute.rs @@ -3,11 +3,11 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::null::NullArray; -use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; -use crate::{Array, ArrayTrait, IntoArray}; +use crate::{Array, ArrayTrait, IntoArray, IntoArrayVariant}; impl ArrayCompute for NullArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -40,7 +40,7 @@ impl ScalarAtFn for NullArray { impl TakeFn for NullArray { fn take(&self, indices: &Array) -> VortexResult { - let indices = indices.clone().flatten_primitive()?; + let indices = indices.clone().into_primitive()?; // Enforce all indices are valid match_each_integer_ptype!(indices.ptype(), |$T| { @@ -58,9 +58,9 @@ mod test { use vortex_dtype::DType; use crate::array::null::NullArray; - use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; use crate::compute::take::take; + use crate::compute::unary::scalar_at::scalar_at; use crate::validity::{ArrayValidity, LogicalValidity}; use crate::{ArrayTrait, IntoArray}; diff --git a/vortex-array/src/array/null/mod.rs b/vortex-array/src/array/null/mod.rs index fb0d93261..5be6d97f6 100644 --- a/vortex-array/src/array/null/mod.rs +++ b/vortex-array/src/array/null/mod.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::stats::{ArrayStatisticsCompute, Stat}; use crate::validity::{ArrayValidity, LogicalValidity, Validity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayFlatten}; +use crate::{impl_encoding, Canonical, IntoCanonical}; mod compute; @@ -26,9 +26,9 @@ impl NullArray { } } -impl ArrayFlatten for NullArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::Null(self)) +impl IntoCanonical for NullArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Null(self)) } } diff --git a/vortex-array/src/array/primitive/compute/cast.rs b/vortex-array/src/array/primitive/compute/cast.rs index 2e5e69d62..e1339571c 100644 --- a/vortex-array/src/array/primitive/compute/cast.rs +++ b/vortex-array/src/array/primitive/compute/cast.rs @@ -3,7 +3,7 @@ use vortex_dtype::{NativePType, PType}; use vortex_error::{vortex_err, VortexResult}; use crate::array::primitive::PrimitiveArray; -use crate::compute::cast::CastFn; +use crate::compute::unary::cast::CastFn; use crate::validity::Validity; use crate::IntoArray; use crate::{Array, ArrayDType}; @@ -57,25 +57,27 @@ mod test { #[test] fn cast_u32_u8() { let arr = vec![0u32, 10, 200].into_array(); - let p = compute::cast::cast(&arr, PType::U8.into()) + let p = compute::unary::cast::try_cast(&arr, PType::U8.into()) .unwrap() - .into_primitive(); + .as_primitive(); assert_eq!(p.maybe_null_slice::(), vec![0u8, 10, 200]); } #[test] fn cast_u32_f32() { let arr = vec![0u32, 10, 200].into_array(); - let u8arr = compute::cast::cast(&arr, PType::F32.into()) + let u8arr = compute::unary::cast::try_cast(&arr, PType::F32.into()) .unwrap() - .into_primitive(); + .as_primitive(); assert_eq!(u8arr.maybe_null_slice::(), vec![0.0f32, 10., 200.]); } #[test] fn cast_i32_u32() { let arr = vec![-1i32].into_array(); - let error = compute::cast::cast(&arr, PType::U32.into()).err().unwrap(); + let error = compute::unary::cast::try_cast(&arr, PType::U32.into()) + .err() + .unwrap(); 
let VortexError::ComputeError(s, _) = error else { unreachable!() }; diff --git a/vortex-array/src/array/primitive/compute/compare.rs b/vortex-array/src/array/primitive/compute/compare.rs index 5605b51c4..2feb092ec 100644 --- a/vortex-array/src/array/primitive/compute/compare.rs +++ b/vortex-array/src/array/primitive/compute/compare.rs @@ -8,11 +8,11 @@ use vortex_expr::Operator; use crate::array::bool::BoolArray; use crate::array::primitive::PrimitiveArray; use crate::compute::compare::CompareFn; -use crate::{Array, ArrayTrait, IntoArray}; +use crate::{Array, ArrayTrait, IntoArray, IntoArrayVariant}; impl CompareFn for PrimitiveArray { fn compare(&self, other: &Array, predicate: Operator) -> VortexResult { - let flattened = other.clone().flatten_primitive()?; + let flattened = other.clone().into_primitive()?; let matching_idxs = match_each_native_ptype!(self.ptype(), |$T| { let predicate_fn = &predicate.to_predicate::<$T>(); @@ -49,6 +49,7 @@ mod test { use super::*; use crate::compute::compare::compare; + use crate::IntoArrayVariant; fn to_int_indices(indices_bits: BoolArray) -> Vec { let filtered = indices_bits @@ -78,10 +79,10 @@ mod test { ]) .into_array(); - let matches = compare(&arr, &arr, Operator::Eq)?.flatten_bool()?; + let matches = compare(&arr, &arr, Operator::Eq)?.into_bool()?; assert_eq!(to_int_indices(matches), [0u64, 1, 2, 3, 5, 6, 7, 8, 10]); - let matches = compare(&arr, &arr, Operator::NotEq)?.flatten_bool()?; + let matches = compare(&arr, &arr, Operator::NotEq)?.into_bool()?; let empty: [u64; 0] = []; assert_eq!(to_int_indices(matches), empty); @@ -101,16 +102,16 @@ mod test { ]) .into_array(); - let matches = compare(&arr, &other, Operator::Lte)?.flatten_bool()?; + let matches = compare(&arr, &other, Operator::Lte)?.into_bool()?; assert_eq!(to_int_indices(matches), [0u64, 1, 2, 3, 5, 6, 7, 8, 10]); - let matches = compare(&arr, &other, Operator::Lt)?.flatten_bool()?; + let matches = compare(&arr, &other, Operator::Lt)?.into_bool()?; assert_eq!(to_int_indices(matches), [5u64, 6, 7, 8, 10]); - let matches = compare(&other, &arr, Operator::Gte)?.flatten_bool()?; + let matches = compare(&other, &arr, Operator::Gte)?.into_bool()?; assert_eq!(to_int_indices(matches), [0u64, 1, 2, 3, 5, 6, 7, 8, 10]); - let matches = compare(&other, &arr, Operator::Gt)?.flatten_bool()?; + let matches = compare(&other, &arr, Operator::Gt)?.into_bool()?; assert_eq!(to_int_indices(matches), [5u64, 6, 7, 8, 10]); Ok(()) } diff --git a/vortex-array/src/array/primitive/compute/fill.rs b/vortex-array/src/array/primitive/compute/fill.rs index e4114580e..b391efdbc 100644 --- a/vortex-array/src/array/primitive/compute/fill.rs +++ b/vortex-array/src/array/primitive/compute/fill.rs @@ -2,7 +2,7 @@ use vortex_dtype::match_each_native_ptype; use vortex_error::VortexResult; use crate::array::primitive::PrimitiveArray; -use crate::compute::fill::FillForwardFn; +use crate::compute::unary::fill_forward::FillForwardFn; use crate::validity::ArrayValidity; use crate::{Array, IntoArray, ToArrayData}; @@ -41,7 +41,9 @@ mod test { fn leading_none() { let arr = PrimitiveArray::from_nullable_vec(vec![None, Some(8u8), None, Some(10), None]) .into_array(); - let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + let p = compute::unary::fill_forward::fill_forward(&arr) + .unwrap() + .as_primitive(); assert_eq!(p.maybe_null_slice::(), vec![0, 8, 8, 10, 10]); assert!(p.logical_validity().all_valid()); } @@ -52,7 +54,9 @@ mod test { PrimitiveArray::from_nullable_vec(vec![Option::::None, None, None, None, 
None]) .into_array(); - let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + let p = compute::unary::fill_forward::fill_forward(&arr) + .unwrap() + .as_primitive(); assert_eq!(p.maybe_null_slice::(), vec![0, 0, 0, 0, 0]); assert!(p.logical_validity().all_valid()); } @@ -64,7 +68,9 @@ mod test { Validity::Array(BoolArray::from(vec![true, true, true, true, true]).into_array()), ) .into_array(); - let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + let p = compute::unary::fill_forward::fill_forward(&arr) + .unwrap() + .as_primitive(); assert_eq!(p.maybe_null_slice::(), vec![8, 10, 12, 14, 16]); assert!(p.logical_validity().all_valid()); } diff --git a/vortex-array/src/array/primitive/compute/filter_indices.rs b/vortex-array/src/array/primitive/compute/filter_indices.rs index d77f07987..bc5ece028 100644 --- a/vortex-array/src/array/primitive/compute/filter_indices.rs +++ b/vortex-array/src/array/primitive/compute/filter_indices.rs @@ -73,6 +73,7 @@ mod test { use super::*; use crate::validity::Validity; + use crate::IntoCanonical; fn apply_conjunctive_filter(arr: &PrimitiveArray, conj: Conjunction) -> VortexResult { arr.filter_indices(&Disjunction::from_iter([conj])) @@ -107,7 +108,9 @@ mod test { let filtered_primitive = apply_conjunctive_filter(&arr, Conjunction::from(field.lt(lit(5u32)))) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [0u64, 1, 2, 3]); @@ -115,7 +118,9 @@ mod test { let filtered_primitive = apply_conjunctive_filter(&arr, Conjunction::from(field.gt(lit(5u32)))) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [6u64, 7, 8, 10]); @@ -123,7 +128,9 @@ mod test { let filtered_primitive = apply_conjunctive_filter(&arr, Conjunction::from(field.equal(lit(5u32)))) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [5u64]); @@ -131,7 +138,9 @@ mod test { let filtered_primitive = apply_conjunctive_filter(&arr, Conjunction::from(field.gte(lit(5u32)))) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [5u64, 6, 7, 8, 10]); @@ -139,7 +148,9 @@ mod test { let filtered_primitive = apply_conjunctive_filter(&arr, Conjunction::from(field.lte(lit(5u32)))) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [0u64, 1, 2, 3, 5]); @@ -155,7 +166,9 @@ mod test { Conjunction::from_iter([field.lt(lit(5u32)), field.gt(lit(2u32))]), ) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [2u64, 3]) @@ -171,7 +184,9 @@ mod test { Conjunction::from_iter([field.lt(lit(5u32)), field.gt(lit(5u32))]), ) .unwrap() - .flatten_bool() + .into_canonical() + .unwrap() + .into_bool() .unwrap(); let filtered = to_int_indices(filtered_primitive); let expected: [u64; 0] = []; @@ -187,7 +202,13 @@ mod test { let c2 = Conjunction::from(field.gt(lit(5u32))); let disj = Disjunction::from_iter([c1, c2]); - let filtered_primitive = arr.filter_indices(&disj).unwrap().flatten_bool().unwrap(); + let filtered_primitive = arr + .filter_indices(&disj) + 
.unwrap() + .into_canonical() + .unwrap() + .into_bool() + .unwrap(); let filtered = to_int_indices(filtered_primitive); assert_eq!(filtered, [0u64, 1, 2, 3, 5, 6, 7, 8, 9]) } diff --git a/vortex-array/src/array/primitive/compute/mod.rs b/vortex-array/src/array/primitive/compute/mod.rs index e1ca9f3b0..293b28e9c 100644 --- a/vortex-array/src/array/primitive/compute/mod.rs +++ b/vortex-array/src/array/primitive/compute/mod.rs @@ -1,13 +1,13 @@ use crate::array::primitive::PrimitiveArray; -use crate::compute::cast::CastFn; use crate::compute::compare::CompareFn; -use crate::compute::fill::FillForwardFn; use crate::compute::filter_indices::FilterIndicesFn; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::scalar_subtract::SubtractScalarFn; use crate::compute::search_sorted::SearchSortedFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; +use crate::compute::unary::cast::CastFn; +use crate::compute::unary::fill_forward::FillForwardFn; +use crate::compute::unary::scalar_at::ScalarAtFn; +use crate::compute::unary::scalar_subtract::SubtractScalarFn; use crate::compute::ArrayCompute; mod cast; diff --git a/vortex-array/src/array/primitive/compute/scalar_at.rs b/vortex-array/src/array/primitive/compute/scalar_at.rs index 212842a62..b256ad5d8 100644 --- a/vortex-array/src/array/primitive/compute/scalar_at.rs +++ b/vortex-array/src/array/primitive/compute/scalar_at.rs @@ -3,7 +3,7 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; -use crate::compute::scalar_at::ScalarAtFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::validity::ArrayValidity; use crate::ArrayDType; diff --git a/vortex-array/src/array/primitive/compute/subtract_scalar.rs b/vortex-array/src/array/primitive/compute/subtract_scalar.rs index c5599a160..e724cb0be 100644 --- a/vortex-array/src/array/primitive/compute/subtract_scalar.rs +++ b/vortex-array/src/array/primitive/compute/subtract_scalar.rs @@ -8,7 +8,7 @@ use vortex_scalar::Scalar; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; -use crate::compute::scalar_subtract::SubtractScalarFn; +use crate::compute::unary::scalar_subtract::SubtractScalarFn; use crate::stats::{ArrayStatistics, Stat}; use crate::validity::ArrayValidity; use crate::{Array, ArrayDType, ArrayTrait, IntoArray}; @@ -105,15 +105,17 @@ mod test { use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; - use crate::compute::scalar_subtract::subtract_scalar; - use crate::{ArrayTrait, IntoArray}; + use crate::compute::unary::scalar_subtract::subtract_scalar; + use crate::{ArrayTrait, IntoArray, IntoCanonical}; #[test] fn test_scalar_subtract_unsigned() { let values = vec![1u16, 2, 3].into_array(); let results = subtract_scalar(&values, &1u16.into()) .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); @@ -125,7 +127,9 @@ mod test { let values = vec![1i64, 2, 3].into_array(); let results = subtract_scalar(&values, &(-1i64).into()) .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); @@ -138,7 +142,9 @@ mod test { .into_array(); let flattened = subtract_scalar(&values, &Some(1u16).into()) .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap(); let results = flattened.maybe_null_slice::().to_vec(); @@ -160,7 +166,9 @@ mod test { let to_subtract = -1f64; let results = 
subtract_scalar(&values, &to_subtract.into()) .unwrap() - .flatten_primitive() + .into_canonical() + .unwrap() + .into_primitive() .unwrap() .maybe_null_slice::() .to_vec(); diff --git a/vortex-array/src/array/primitive/compute/take.rs b/vortex-array/src/array/primitive/compute/take.rs index 2f22bdf00..669c3126c 100644 --- a/vortex-array/src/array/primitive/compute/take.rs +++ b/vortex-array/src/array/primitive/compute/take.rs @@ -6,12 +6,12 @@ use vortex_error::VortexResult; use crate::array::primitive::PrimitiveArray; use crate::compute::take::TakeFn; use crate::Array; -use crate::IntoArray; +use crate::{IntoArray, IntoArrayVariant}; impl TakeFn for PrimitiveArray { fn take(&self, indices: &Array) -> VortexResult { let validity = self.validity(); - let indices = indices.clone().flatten_primitive()?; + let indices = indices.clone().into_primitive()?; match_each_native_ptype!(self.ptype(), |$T| { match_each_integer_ptype!(indices.ptype(), |$I| { Ok(PrimitiveArray::from_vec( diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index 562e51d1f..7b3e18cdb 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -8,8 +8,8 @@ use vortex_error::vortex_bail; use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::ArrayFlatten; use crate::{impl_encoding, ArrayDType}; +use crate::{Canonical, IntoCanonical}; mod accessor; mod compute; @@ -177,9 +177,9 @@ impl IntoArray for Vec { } } -impl ArrayFlatten for PrimitiveArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::Primitive(self)) +impl IntoCanonical for PrimitiveArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Primitive(self)) } } @@ -207,10 +207,6 @@ impl AcceptArrayVisitor for PrimitiveArray { } impl Array { - pub fn into_primitive(self) -> PrimitiveArray { - PrimitiveArray::try_from(self).expect("expected primitive array") - } - pub fn as_primitive(&self) -> PrimitiveArray { PrimitiveArray::try_from(self).expect("expected primitive array") } diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs index 14790dae2..d86f6cfd9 100644 --- a/vortex-array/src/array/primitive/stats.rs +++ b/vortex-array/src/array/primitive/stats.rs @@ -12,7 +12,7 @@ use crate::array::primitive::PrimitiveArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use crate::validity::ArrayValidity; use crate::validity::LogicalValidity; -use crate::ArrayDType; +use crate::{ArrayDType, IntoArrayVariant}; trait PStatsType: NativePType + Into + BitWidth {} @@ -26,7 +26,7 @@ impl ArrayStatisticsCompute for PrimitiveArray { LogicalValidity::AllInvalid(v) => Ok(StatsSet::nulls(v, self.dtype())), LogicalValidity::Array(a) => NullableValues( self.maybe_null_slice::<$P>(), - &a.clone().flatten_bool()?.boolean_buffer(), + &a.clone().into_bool()?.boolean_buffer(), ) .compute_statistics(stat), } diff --git a/vortex-array/src/array/sparse/compress.rs b/vortex-array/src/array/sparse/compress.rs index 5cb076cf4..f484a05dc 100644 --- a/vortex-array/src/array/sparse/compress.rs +++ b/vortex-array/src/array/sparse/compress.rs @@ -25,7 +25,7 @@ impl EncodingCompression for SparseEncoding { ) -> VortexResult { let sparse_array = SparseArray::try_from(array)?; let sparse_like = like.map(|la| SparseArray::try_from(la).unwrap()); - Ok(SparseArray::new( + Ok(SparseArray::try_new( ctx.auxiliary("indices").compress( &sparse_array.indices(), 
sparse_like.as_ref().map(|sa| sa.indices()).as_ref(), @@ -37,6 +37,7 @@ impl EncodingCompression for SparseEncoding { sparse_array.len(), sparse_array.fill_value().clone(), ) + .unwrap() .into_array()) } } diff --git a/vortex-array/src/array/sparse/compute/mod.rs b/vortex-array/src/array/sparse/compute/mod.rs index 2ab88bbed..27e43ab79 100644 --- a/vortex-array/src/array/sparse/compute/mod.rs +++ b/vortex-array/src/array/sparse/compute/mod.rs @@ -7,11 +7,11 @@ use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::slice::SliceFn; use crate::compute::take::{take, TakeFn}; +use crate::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::ArrayCompute; -use crate::{Array, ArrayDType, IntoArray}; +use crate::{Array, ArrayDType, IntoArray, IntoArrayVariant}; mod slice; @@ -40,7 +40,7 @@ impl ScalarAtFn for SparseArray { impl TakeFn for SparseArray { fn take(&self, indices: &Array) -> VortexResult { - let flat_indices = indices.clone().flatten_primitive()?; + let flat_indices = indices.clone().into_primitive()?; // if we are taking a lot of values we should build a hashmap let (positions, physical_take_indices) = if indices.len() > 128 { take_map(self, &flat_indices)? @@ -50,12 +50,13 @@ impl TakeFn for SparseArray { let taken_values = take(&self.values(), &physical_take_indices.into_array())?; - Ok(Self::new( + Ok(Self::try_new( positions.into_array(), taken_values, indices.len(), self.fill_value().clone(), ) + .unwrap() .into_array()) } } @@ -123,13 +124,14 @@ mod test { use crate::{Array, ArrayTrait, IntoArray}; fn sparse_array() -> Array { - SparseArray::new( + SparseArray::try_new( PrimitiveArray::from(vec![0u64, 37, 47, 99]).into_array(), PrimitiveArray::from_vec(vec![1.23f64, 0.47, 9.99, 3.5], Validity::AllValid) .into_array(), 100, Scalar::null(DType::Primitive(PType::F64, Nullability::Nullable)), ) + .unwrap() .into_array() } @@ -140,11 +142,11 @@ mod test { SparseArray::try_from(take(&sparse, &vec![0, 47, 47, 0, 99].into_array()).unwrap()) .unwrap(); assert_eq!( - taken.indices().into_primitive().maybe_null_slice::(), + taken.indices().as_primitive().maybe_null_slice::(), [0, 1, 2, 3, 4] ); assert_eq!( - taken.values().into_primitive().maybe_null_slice::(), + taken.values().as_primitive().maybe_null_slice::(), [1.23f64, 9.99, 9.99, 1.23, 3.5] ); } @@ -155,12 +157,12 @@ mod test { let taken = SparseArray::try_from(take(&sparse, &vec![69].into_array()).unwrap()).unwrap(); assert!(taken .indices() - .into_primitive() + .as_primitive() .maybe_null_slice::() .is_empty()); assert!(taken .values() - .into_primitive() + .as_primitive() .maybe_null_slice::() .is_empty()); } @@ -171,11 +173,11 @@ mod test { let taken = SparseArray::try_from(take(&sparse, &vec![69, 37].into_array()).unwrap()).unwrap(); assert_eq!( - taken.indices().into_primitive().maybe_null_slice::(), + taken.indices().as_primitive().maybe_null_slice::(), [1] ); assert_eq!( - taken.values().into_primitive().maybe_null_slice::(), + taken.values().as_primitive().maybe_null_slice::(), [0.47f64] ); assert_eq!(taken.len(), 2); @@ -188,7 +190,7 @@ mod test { let (positions, patch_indices) = take_map(&sparse, &indices).unwrap(); assert_eq!( positions.maybe_null_slice::(), - sparse.indices().into_primitive().maybe_null_slice::() + sparse.indices().as_primitive().maybe_null_slice::() ); assert_eq!(patch_indices.maybe_null_slice::(), [0u64, 1, 2, 3]); } diff --git 
a/vortex-array/src/array/sparse/flatten.rs b/vortex-array/src/array/sparse/flatten.rs index 7ef8fa75b..bf382b4a6 100644 --- a/vortex-array/src/array/sparse/flatten.rs +++ b/vortex-array/src/array/sparse/flatten.rs @@ -8,10 +8,10 @@ use crate::array::bool::BoolArray; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; use crate::validity::Validity; -use crate::{ArrayDType, ArrayFlatten, ArrayTrait, Flattened}; +use crate::{ArrayDType, ArrayTrait, Canonical, IntoArrayVariant, IntoCanonical}; -impl ArrayFlatten for SparseArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for SparseArray { + fn into_canonical(self) -> VortexResult { // Resolve our indices into a vector of usize applying the offset let indices = self.resolved_indices(); @@ -19,12 +19,16 @@ impl ArrayFlatten for SparseArray { validity.append_n(self.len(), false); if matches!(self.dtype(), DType::Bool(_)) { - let values = self.values().flatten_bool()?.boolean_buffer(); - flatten_sparse_bools(values, &indices, self.len(), self.fill_value(), validity) + let values = self + .values() + .into_canonical()? + .into_bool()? + .boolean_buffer(); + canonicalize_sparse_bools(values, &indices, self.len(), self.fill_value(), validity) } else { - let values = self.values().flatten_primitive()?; + let values = self.values().into_primitive()?; match_each_native_ptype!(values.ptype(), |$P| { - flatten_sparse_primitives( + canonicalize_sparse_primitives( values.maybe_null_slice::<$P>(), &indices, self.len(), @@ -36,13 +40,13 @@ impl ArrayFlatten for SparseArray { } } -fn flatten_sparse_bools( +fn canonicalize_sparse_bools( values: BooleanBuffer, indices: &[usize], len: usize, fill_value: &Scalar, mut validity: BooleanBufferBuilder, -) -> VortexResult { +) -> VortexResult { let fill_bool: bool = if fill_value.is_null() { bool::default() } else { @@ -57,16 +61,18 @@ fn flatten_sparse_bools( let validity = Validity::from(validity.finish()); let bool_values = BoolArray::from_vec(flat_bools, validity); - Ok(Flattened::Bool(bool_values)) + Ok(Canonical::Bool(bool_values)) } -fn flatten_sparse_primitives TryFrom<&'a Scalar, Error = VortexError>>( +fn canonicalize_sparse_primitives< + T: NativePType + for<'a> TryFrom<&'a Scalar, Error = VortexError>, +>( values: &[T], indices: &[usize], len: usize, fill_value: &Scalar, mut validity: BooleanBufferBuilder, -) -> VortexResult { +) -> VortexResult { let primitive_fill = if fill_value.is_null() { T::default() } else { @@ -85,7 +91,7 @@ fn flatten_sparse_primitives TryFrom<&'a Scalar, Error } else { PrimitiveArray::from(result) }; - Ok(Flattened::Primitive(array)) + Ok(Canonical::Primitive(array)) } #[cfg(test)] @@ -95,15 +101,15 @@ mod test { use crate::array::bool::BoolArray; use crate::array::sparse::SparseArray; use crate::validity::Validity; - use crate::{ArrayDType, ArrayFlatten, Flattened, IntoArray}; + use crate::{ArrayDType, Canonical, IntoArray, IntoCanonical}; #[test] fn test_sparse_bool() { let indices = vec![0u64].into_array(); let values = BoolArray::from_vec(vec![true], Validity::NonNullable).into_array(); - let sparse_bools = SparseArray::new(indices, values, 10, true.into()); + let sparse_bools = SparseArray::try_new(indices, values, 10, true.into()).unwrap(); assert_eq!(*sparse_bools.dtype(), DType::Bool(Nullability::NonNullable)); - let flat_bools = sparse_bools.flatten().unwrap(); - assert!(matches!(flat_bools, Flattened::Bool(_))); + let flat_bools = sparse_bools.into_canonical().unwrap(); + assert!(matches!(flat_bools, Canonical::Bool(_))); } 
} diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index 0e62c5a16..617f70bd0 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -8,7 +8,7 @@ use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; use crate::stats::ArrayStatisticsCompute; use crate::validity::{ArrayValidity, LogicalValidity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayDType}; +use crate::{impl_encoding, ArrayDType, IntoCanonical}; mod compress; mod compute; @@ -26,10 +26,6 @@ pub struct SparseMetadata { } impl SparseArray { - pub fn new(indices: Array, values: Array, len: usize, fill_value: Scalar) -> Self { - Self::try_new(indices, values, len, fill_value).unwrap() - } - pub fn try_new( indices: Array, values: Array, @@ -108,7 +104,12 @@ impl SparseArray { /// Return indices as a vector of usize with the indices_offset applied. pub fn resolved_indices(&self) -> Vec { - let flat_indices = self.indices().flatten_primitive().unwrap(); + let flat_indices = self + .indices() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(); match_each_integer_ptype!(flat_indices.ptype(), |$P| { flat_indices .maybe_null_slice::<$P>() @@ -181,10 +182,10 @@ mod test { use crate::accessor::ArrayAccessor; use crate::array::sparse::SparseArray; - use crate::compute::cast::cast; - use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; - use crate::{Array, IntoArray}; + use crate::compute::unary::cast::try_cast; + use crate::compute::unary::scalar_at::scalar_at; + use crate::{Array, IntoArray, IntoCanonical}; fn nullable_fill() -> Scalar { Scalar::null(DType::Primitive(PType::I32, Nullable)) @@ -198,14 +199,21 @@ mod test { fn sparse_array(fill_value: Scalar) -> Array { // merged array: [null, null, 100, null, null, 200, null, null, 300, null] let mut values = vec![100i32, 200, 300].into_array(); - values = cast(&values, fill_value.dtype()).unwrap(); + values = try_cast(&values, fill_value.dtype()).unwrap(); - SparseArray::new(vec![2u64, 5, 8].into_array(), values, 10, fill_value).into_array() + SparseArray::try_new(vec![2u64, 5, 8].into_array(), values, 10, fill_value) + .unwrap() + .into_array() } fn assert_sparse_array(sparse: &Array, values: &[Option]) { let sparse_arrow = ArrayAccessor::::with_iterator( - &sparse.clone().flatten_primitive().unwrap(), + &sparse + .clone() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(), |iter| iter.map(|v| v.cloned()).collect_vec(), ) .unwrap(); diff --git a/vortex-array/src/array/struct/compress.rs b/vortex-array/src/array/struct_/compress.rs similarity index 100% rename from vortex-array/src/array/struct/compress.rs rename to vortex-array/src/array/struct_/compress.rs diff --git a/vortex-array/src/array/struct/compute.rs b/vortex-array/src/array/struct_/compute.rs similarity index 94% rename from vortex-array/src/array/struct/compute.rs rename to vortex-array/src/array/struct_/compute.rs index d873c9a7a..a6cdf9f95 100644 --- a/vortex-array/src/array/struct/compute.rs +++ b/vortex-array/src/array/struct_/compute.rs @@ -2,10 +2,10 @@ use itertools::Itertools; use vortex_error::VortexResult; use vortex_scalar::Scalar; -use crate::array::r#struct::StructArray; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; +use crate::array::struct_::StructArray; use crate::compute::slice::{slice, SliceFn}; use crate::compute::take::{take, TakeFn}; +use crate::compute::unary::scalar_at::{scalar_at, ScalarAtFn}; use 
crate::compute::ArrayCompute; use crate::{Array, ArrayDType, IntoArray}; diff --git a/vortex-array/src/array/struct/mod.rs b/vortex-array/src/array/struct_/mod.rs similarity index 88% rename from vortex-array/src/array/struct/mod.rs rename to vortex-array/src/array/struct_/mod.rs index 042ec63ef..86a74c509 100644 --- a/vortex-array/src/array/struct/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -1,12 +1,12 @@ use serde::{Deserialize, Serialize}; -use vortex_dtype::{FieldNames, Nullability, StructDType}; +use vortex_dtype::{FieldName, FieldNames, Nullability, StructDType}; use vortex_error::{vortex_bail, vortex_err}; use crate::stats::ArrayStatisticsCompute; use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::ArrayFlatten; use crate::{impl_encoding, ArrayDType}; +use crate::{Canonical, IntoCanonical}; mod compute; @@ -96,6 +96,18 @@ impl StructArray { StatsSet::new(), ) } + + pub fn from_fields>(items: &[(N, Array)]) -> Self { + let names: Vec = items + .iter() + .map(|(name, _)| FieldName::from(name.as_ref())) + .collect(); + let fields: Vec = items.iter().map(|(_, array)| array.clone()).collect(); + let len = fields.first().unwrap().len(); + + Self::try_new(FieldNames::from(names), fields, len, Validity::NonNullable) + .expect("building StructArray with helper") + } } impl StructArray { @@ -128,10 +140,10 @@ impl StructArray { } } -impl ArrayFlatten for StructArray { +impl IntoCanonical for StructArray { /// StructEncoding is the canonical form for a [DType::Struct] array, so return self. - fn flatten(self) -> VortexResult { - Ok(Flattened::Struct(self)) + fn into_canonical(self) -> VortexResult { + Ok(Canonical::Struct(self)) } } @@ -171,7 +183,7 @@ mod test { use crate::array::bool::BoolArray; use crate::array::primitive::PrimitiveArray; - use crate::array::r#struct::StructArray; + use crate::array::struct_::StructArray; use crate::array::varbin::VarBinArray; use crate::validity::Validity; use crate::{ArrayTrait, IntoArray}; diff --git a/vortex-array/src/array/varbin/accessor.rs b/vortex-array/src/array/varbin/accessor.rs index c6ff09d4c..ecccf17e3 100644 --- a/vortex-array/src/array/varbin/accessor.rs +++ b/vortex-array/src/array/varbin/accessor.rs @@ -4,6 +4,7 @@ use vortex_error::VortexResult; use crate::accessor::ArrayAccessor; use crate::array::varbin::VarBinArray; use crate::validity::ArrayValidity; +use crate::IntoArrayVariant; impl ArrayAccessor<[u8]> for VarBinArray { fn with_iterator(&self, f: F) -> VortexResult @@ -11,8 +12,8 @@ impl ArrayAccessor<[u8]> for VarBinArray { F: for<'a> FnOnce(&mut (dyn Iterator>)) -> R, { // TODO(ngates): what happens if bytes is much larger than sliced_bytes? 
- let primitive = self.bytes().flatten_primitive()?; - let offsets = self.offsets().flatten_primitive()?; + let primitive = self.bytes().into_primitive()?; + let offsets = self.offsets().into_primitive()?; let validity = self.logical_validity().to_null_buffer()?; match_each_integer_ptype!(offsets.ptype(), |$T| { diff --git a/vortex-array/src/array/varbin/builder.rs b/vortex-array/src/array/varbin/builder.rs index f48a79c19..b70292907 100644 --- a/vortex-array/src/array/varbin/builder.rs +++ b/vortex-array/src/array/varbin/builder.rs @@ -72,7 +72,7 @@ mod test { use vortex_scalar::Scalar; use crate::array::varbin::builder::VarBinBuilder; - use crate::compute::scalar_at::scalar_at; + use crate::compute::unary::scalar_at::scalar_at; use crate::{ArrayDType, IntoArray}; #[test] diff --git a/vortex-array/src/array/varbin/compute/mod.rs b/vortex-array/src/array/varbin/compute/mod.rs index 732271728..198d482a2 100644 --- a/vortex-array/src/array/varbin/compute/mod.rs +++ b/vortex-array/src/array/varbin/compute/mod.rs @@ -2,9 +2,9 @@ use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::varbin::{varbin_scalar, VarBinArray}; -use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::validity::ArrayValidity; use crate::ArrayDType; diff --git a/vortex-array/src/array/varbin/compute/take.rs b/vortex-array/src/array/varbin/compute/take.rs index 4f67cb845..16bdf8fb2 100644 --- a/vortex-array/src/array/varbin/compute/take.rs +++ b/vortex-array/src/array/varbin/compute/take.rs @@ -9,8 +9,8 @@ use crate::array::varbin::VarBinArray; use crate::compute::take::TakeFn; use crate::validity::Validity; use crate::Array; -use crate::ArrayDType; use crate::IntoArray; +use crate::{ArrayDType, IntoArrayVariant}; impl TakeFn for VarBinArray { fn take(&self, indices: &Array) -> VortexResult { @@ -20,9 +20,9 @@ impl TakeFn for VarBinArray { "indices.len() must be less than i32::MAX" ); - let offsets = self.offsets().flatten_primitive()?; - let data = self.bytes().flatten_primitive()?; - let indices = indices.clone().flatten_primitive()?; + let offsets = self.offsets().into_primitive()?; + let data = self.bytes().into_primitive()?; + let indices = indices.clone().into_primitive()?; match_each_integer_ptype!(offsets.ptype(), |$O| { match_each_integer_ptype!(indices.ptype(), |$I| { Ok(take( diff --git a/vortex-array/src/array/varbin/flatten.rs b/vortex-array/src/array/varbin/flatten.rs index cc5230485..d00d03290 100644 --- a/vortex-array/src/array/varbin/flatten.rs +++ b/vortex-array/src/array/varbin/flatten.rs @@ -1,10 +1,10 @@ use vortex_error::VortexResult; use crate::array::varbin::VarBinArray; -use crate::{ArrayFlatten, Flattened}; +use crate::{Canonical, IntoCanonical}; -impl ArrayFlatten for VarBinArray { - fn flatten(self) -> VortexResult { - Ok(Flattened::VarBin(self)) +impl IntoCanonical for VarBinArray { + fn into_canonical(self) -> VortexResult { + Ok(Canonical::VarBin(self)) } } diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index a77f96b09..2ddfc92b9 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -9,10 +9,10 @@ use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::builder::VarBinBuilder; -use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; +use 
crate::compute::unary::scalar_at::scalar_at; use crate::validity::{Validity, ValidityMetadata}; -use crate::{impl_encoding, ArrayDType}; +use crate::{impl_encoding, ArrayDType, IntoArrayVariant}; mod accessor; mod array; @@ -153,7 +153,7 @@ impl VarBinArray { let start = self.offset_at(index); let end = self.offset_at(index + 1); let sliced = slice(&self.bytes(), start, end)?; - Ok(sliced.flatten_primitive()?.buffer().clone()) + Ok(sliced.into_primitive()?.buffer().clone()) } } @@ -222,8 +222,8 @@ mod test { use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; - use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; + use crate::compute::unary::scalar_at::scalar_at; use crate::validity::Validity; use crate::{Array, IntoArray}; diff --git a/vortex-array/src/array/varbinview/accessor.rs b/vortex-array/src/array/varbinview/accessor.rs index 6aee3b951..cf2ef0557 100644 --- a/vortex-array/src/array/varbinview/accessor.rs +++ b/vortex-array/src/array/varbinview/accessor.rs @@ -4,6 +4,7 @@ use crate::accessor::ArrayAccessor; use crate::array::primitive::PrimitiveArray; use crate::array::varbinview::VarBinViewArray; use crate::validity::ArrayValidity; +use crate::{Canonical, IntoCanonical}; impl ArrayAccessor<[u8]> for VarBinViewArray { fn with_iterator FnOnce(&mut dyn Iterator>) -> R, R>( @@ -12,7 +13,11 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { ) -> VortexResult { let views = self.view_slice(); let bytes: Vec = (0..self.metadata().n_children) - .map(|i| self.bytes(i).flatten_primitive()) + .map(|i| { + self.bytes(i) + .into_canonical() + .and_then(Canonical::into_primitive) + }) .collect::>>()?; let validity = self.logical_validity().to_null_buffer()?; diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index c5ecba578..cf9a06b35 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -3,8 +3,8 @@ use vortex_scalar::Scalar; use crate::array::varbin::varbin_scalar; use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE}; -use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::{slice, SliceFn}; +use crate::compute::unary::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::validity::ArrayValidity; use crate::{Array, ArrayDType, IntoArray, IntoArrayData}; diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 7c30513cb..60ba73c70 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -18,7 +18,7 @@ use crate::compute::slice::slice; use crate::validity::Validity; use crate::validity::{ArrayValidity, LogicalValidity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayDType, ArrayData, ArrayFlatten}; +use crate::{impl_encoding, ArrayDType, ArrayData, Canonical, IntoCanonical}; mod accessor; mod builder; @@ -215,7 +215,8 @@ impl VarBinViewArray { view._ref.offset as usize, (view._ref.size + view._ref.offset) as usize, )? - .flatten_primitive()?; + .into_canonical()? 
+ .into_primitive()?; Ok(data_buf.maybe_null_slice::().to_vec()) } else { Ok(view.inlined.data[..view.inlined.size as usize].to_vec()) @@ -224,15 +225,15 @@ impl VarBinViewArray { } } -impl ArrayFlatten for VarBinViewArray { - fn flatten(self) -> VortexResult { +impl IntoCanonical for VarBinViewArray { + fn into_canonical(self) -> VortexResult { let nullable = self.dtype().is_nullable(); let arrow_self = as_arrow(self); let arrow_varbin = arrow_cast::cast(arrow_self.deref(), &DataType::Utf8) .expect("Utf8View must cast to Ut8f"); let vortex_array = ArrayData::from_arrow(arrow_varbin, nullable).into_array(); - Ok(Flattened::VarBin(VarBinArray::try_from(&vortex_array)?)) + Ok(Canonical::VarBin(VarBinArray::try_from(&vortex_array)?)) } } @@ -240,7 +241,9 @@ fn as_arrow(var_bin_view: VarBinViewArray) -> ArrayRef { // Views should be buffer of u8 let views = var_bin_view .views() - .flatten_primitive() + .into_canonical() + .expect("into_canonical") + .into_primitive() .expect("views must be primitive"); assert_eq!(views.ptype(), PType::U8); let nulls = var_bin_view @@ -249,7 +252,12 @@ fn as_arrow(var_bin_view: VarBinViewArray) -> ArrayRef { .expect("null buffer"); let data = (0..var_bin_view.metadata().n_children) - .map(|i| var_bin_view.bytes(i).flatten_primitive()) + .map(|i| { + var_bin_view + .bytes(i) + .into_canonical() + .and_then(Canonical::into_primitive) + }) .collect::>>() .expect("bytes arrays must be primitive"); if !data.is_empty() { @@ -359,9 +367,9 @@ mod test { use vortex_scalar::Scalar; use crate::array::varbinview::VarBinViewArray; - use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; - use crate::{ArrayFlatten, ArrayTrait, Flattened, IntoArray}; + use crate::compute::unary::scalar_at::scalar_at; + use crate::{ArrayTrait, Canonical, IntoArray, IntoCanonical}; #[test] pub fn varbin_view() { @@ -397,8 +405,8 @@ mod test { pub fn flatten_array() { let binary_arr = VarBinViewArray::from(vec!["string1", "string2"]); - let flattened = binary_arr.flatten().unwrap(); - assert!(matches!(flattened, Flattened::VarBin(_))); + let flattened = binary_arr.into_canonical().unwrap(); + assert!(matches!(flattened, Canonical::VarBin(_))); let var_bin = flattened.into_array(); assert_eq!(scalar_at(&var_bin, 0).unwrap(), Scalar::from("string1")); diff --git a/vortex-array/src/arrow/array.rs b/vortex-array/src/arrow/array.rs index 470ac4682..47bb9f5f4 100644 --- a/vortex-array/src/arrow/array.rs +++ b/vortex-array/src/arrow/array.rs @@ -27,7 +27,7 @@ use crate::array::bool::BoolArray; use crate::array::datetime::LocalDateTimeArray; use crate::array::null::NullArray; use crate::array::primitive::PrimitiveArray; -use crate::array::r#struct::StructArray; +use crate::array::struct_::StructArray; use crate::array::varbin::VarBinArray; use crate::array::varbinview::VarBinViewArray; use crate::arrow::FromArrowArray; diff --git a/vortex-array/src/arrow/recordbatch.rs b/vortex-array/src/arrow/recordbatch.rs index b157b5a4d..5845f66fe 100644 --- a/vortex-array/src/arrow/recordbatch.rs +++ b/vortex-array/src/arrow/recordbatch.rs @@ -1,7 +1,7 @@ use arrow_array::RecordBatch; use itertools::Itertools; -use crate::array::r#struct::StructArray; +use crate::array::struct_::StructArray; use crate::arrow::FromArrowArray; use crate::validity::Validity; use crate::{ArrayData, IntoArray, IntoArrayData, ToArrayData}; diff --git a/vortex-array/src/canonical.rs b/vortex-array/src/canonical.rs new file mode 100644 index 000000000..6fbe21b25 --- /dev/null +++ b/vortex-array/src/canonical.rs @@ 
-0,0 +1,469 @@ +use std::sync::Arc; + +use arrow_array::types::{ + Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_array::{ + ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray as ArrowBoolArray, LargeBinaryArray, + LargeStringArray, NullArray as ArrowNullArray, PrimitiveArray as ArrowPrimitiveArray, + StringArray, StructArray as ArrowStructArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow_buffer::ScalarBuffer; +use arrow_schema::{Field, Fields}; +use vortex_dtype::{DType, PType}; +use vortex_error::{vortex_bail, VortexResult}; + +use crate::array::bool::BoolArray; +use crate::array::datetime::{LocalDateTimeArray, TimeUnit}; +use crate::array::extension::ExtensionArray; +use crate::array::null::NullArray; +use crate::array::primitive::PrimitiveArray; +use crate::array::struct_::StructArray; +use crate::array::varbin::VarBinArray; +use crate::arrow::wrappers::as_offset_buffer; +use crate::compute::unary::cast::try_cast; +use crate::encoding::ArrayEncoding; +use crate::validity::ArrayValidity; +use crate::{Array, ArrayDType, ArrayTrait, IntoArray, ToArray}; + +/// The set of canonical array encodings, also the set of encodings that can be transferred to +/// Arrow with zero-copy. +/// +/// Note that a canonical form is not recursive, i.e. a StructArray may contain non-canonical +/// child arrays, which may themselves need to be [canonicalized](IntoCanonical). +/// +/// # Logical vs. Physical encodings +/// +/// Vortex separates logical and physical types, however this creates ambiguity with Arrow, there is +/// no separation. Thus, if you receive an Arrow array, compress it using Vortex, and then +/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array +/// variants to hold the data. +/// +/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which +/// will correspond to an arrow-rs [`arrow_schema::DataType`]. +/// +/// # Views support +/// +/// Binary and String views are a new, better encoding format for nearly all use-cases. For now, +/// because DataFusion does not include pervasive support for compute over StringView, we opt to use +/// the [`VarBinArray`] as the canonical encoding (which corresponds to the Arrow +/// [`BinaryViewArray`]). +/// +/// We expect to change this soon once DataFusion is able to finish up some initial support, which +/// is tracked in https://github.com/apache/datafusion/issues/10918. +#[derive(Debug, Clone)] +pub enum Canonical { + Null(NullArray), + Bool(BoolArray), + Primitive(PrimitiveArray), + Struct(StructArray), + VarBin(VarBinArray), + // TODO(aduffy): switch to useing VarBinView instead of VarBin + // VarBinView(VarBinViewArray), + Extension(ExtensionArray), +} + +impl Canonical { + /// Convert a canonical array into its equivalent [ArrayRef](Arrow array). + /// + /// Scalar arrays such as Bool and Primitive canonical arrays should convert with + /// zero copies, while more complex variants such as Struct may require allocations if its child + /// arrays require decompression. 
+ pub fn into_arrow(self) -> ArrayRef { + match self { + Canonical::Null(a) => null_to_arrow(a), + Canonical::Bool(a) => bool_to_arrow(a), + Canonical::Primitive(a) => primitive_to_arrow(a), + Canonical::Struct(a) => struct_to_arrow(a), + Canonical::VarBin(a) => varbin_to_arrow(a), + Canonical::Extension(a) => match a.id().as_ref() { + "vortex.localdatetime" => local_date_time_to_arrow( + LocalDateTimeArray::try_from(&a.into_array()).expect("localdatetime"), + ), + _ => panic!("unsupported extension dtype with ID {}", a.id().as_ref()), + }, + } + } +} + +// Unwrap canonical type back down to specialized type. +impl Canonical { + pub fn into_null(self) -> VortexResult { + match self { + Canonical::Null(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap NullArray from {:?}", &self), + } + } + + pub fn into_bool(self) -> VortexResult { + match self { + Canonical::Bool(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap BoolArray from {:?}", &self), + } + } + + pub fn into_primitive(self) -> VortexResult { + match self { + Canonical::Primitive(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap PrimitiveArray from {:?}", &self), + } + } + + pub fn into_struct(self) -> VortexResult { + match self { + Canonical::Struct(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap StructArray from {:?}", &self), + } + } + + pub fn into_varbin(self) -> VortexResult { + match self { + Canonical::VarBin(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap VarBinArray from {:?}", &self), + } + } + + pub fn into_extension(self) -> VortexResult { + match self { + Canonical::Extension(a) => Ok(a), + _ => vortex_bail!(InvalidArgument: "cannot unwrap ExtensionArray from {:?}", &self), + } + } +} + +fn null_to_arrow(null_array: NullArray) -> ArrayRef { + Arc::new(ArrowNullArray::new(null_array.len())) +} + +fn bool_to_arrow(bool_array: BoolArray) -> ArrayRef { + Arc::new(ArrowBoolArray::new( + bool_array.boolean_buffer(), + bool_array + .logical_validity() + .to_null_buffer() + .expect("null buffer"), + )) +} + +fn primitive_to_arrow(primitive_array: PrimitiveArray) -> ArrayRef { + fn as_arrow_array_primitive( + array: &PrimitiveArray, + ) -> ArrowPrimitiveArray { + ArrowPrimitiveArray::new( + ScalarBuffer::::new(array.buffer().clone().into(), 0, array.len()), + array + .logical_validity() + .to_null_buffer() + .expect("null buffer"), + ) + } + + match primitive_array.ptype() { + PType::U8 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::U16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::U32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::U64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::I8 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::I16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::I32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::I64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::F16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::F32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + PType::F64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), + } +} + +fn struct_to_arrow(struct_array: StructArray) -> ArrayRef { + let field_arrays: Vec = struct_array + .children() + .map(|f| { + let canonical = f.into_canonical().unwrap(); + match canonical { + // visit nested structs recursively + Canonical::Struct(a) => struct_to_arrow(a), + _ => 
canonical.into_arrow(), + } + }) + .collect(); + + let arrow_fields: Fields = struct_array + .names() + .iter() + .zip(field_arrays.iter()) + .zip(struct_array.dtypes().iter()) + .map(|((name, arrow_field), vortex_field)| { + Field::new( + &**name, + arrow_field.data_type().clone(), + vortex_field.is_nullable(), + ) + }) + .map(Arc::new) + .collect(); + + Arc::new(ArrowStructArray::new(arrow_fields, field_arrays, None)) +} + +fn varbin_to_arrow(varbin_array: VarBinArray) -> ArrayRef { + let offsets = varbin_array + .offsets() + .into_canonical() + .and_then(Canonical::into_primitive) + .expect("flatten_primitive"); + let offsets = match offsets.ptype() { + PType::I32 | PType::I64 => offsets, + // Unless it's u64, everything else can be converted into an i32. + // FIXME(ngates): do not copy offsets again + PType::U64 => try_cast(&offsets.to_array(), PType::I64.into()) + .expect("cast to i64") + .into_canonical() + .and_then(Canonical::into_primitive) + .expect("flatten_primitive"), + _ => try_cast(&offsets.to_array(), PType::I32.into()) + .expect("cast to i32") + .into_canonical() + .and_then(Canonical::into_primitive) + .expect("flatten_primitive"), + }; + let nulls = varbin_array + .logical_validity() + .to_null_buffer() + .expect("null buffer"); + + let data = varbin_array + .bytes() + .into_canonical() + .and_then(Canonical::into_primitive) + .expect("flatten_primitive"); + assert_eq!(data.ptype(), PType::U8); + let data = data.buffer(); + + // Switch on Arrow DType. + match varbin_array.dtype() { + DType::Binary(_) => match offsets.ptype() { + PType::I32 => Arc::new(BinaryArray::new( + as_offset_buffer::(offsets), + data.into(), + nulls, + )), + PType::I64 => Arc::new(LargeBinaryArray::new( + as_offset_buffer::(offsets), + data.into(), + nulls, + )), + _ => panic!("Invalid offsets type"), + }, + DType::Utf8(_) => match offsets.ptype() { + PType::I32 => Arc::new(StringArray::new( + as_offset_buffer::(offsets), + data.into(), + nulls, + )), + PType::I64 => Arc::new(LargeStringArray::new( + as_offset_buffer::(offsets), + data.into(), + nulls, + )), + _ => panic!("Invalid offsets type"), + }, + _ => panic!( + "expected utf8 or binary instead of {}", + varbin_array.dtype() + ), + } +} + +fn local_date_time_to_arrow(local_date_time_array: LocalDateTimeArray) -> ArrayRef { + // A LocalDateTime maps to an Arrow Timestamp array with no timezone. + let timestamps = try_cast(&local_date_time_array.timestamps(), PType::I64.into()) + .expect("timestamps must cast to i64") + .into_canonical() + .and_then(Canonical::into_primitive) + .expect("must be i64 array"); + let validity = timestamps + .logical_validity() + .to_null_buffer() + .expect("null buffer"); + let timestamps_len = timestamps.len(); + let buffer = ScalarBuffer::::new(timestamps.into_buffer().into(), 0, timestamps_len); + + match local_date_time_array.time_unit() { + TimeUnit::Ns => Arc::new(TimestampNanosecondArray::new(buffer, validity)), + TimeUnit::Us => Arc::new(TimestampMicrosecondArray::new(buffer, validity)), + TimeUnit::Ms => Arc::new(TimestampMillisecondArray::new(buffer, validity)), + TimeUnit::S => Arc::new(TimestampSecondArray::new(buffer, validity)), + } +} + +/// Support trait for transmuting an array into its [vortex_dtype::DType]'s canonical encoding. +/// +/// This conversion ensures that the array's encoding matches one of the builtin canonical +/// encodings, each of which has a corresponding [Canonical] variant. +/// +/// # Invariants +/// +/// The DType of the array will be unchanged by canonicalization. 
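The `IntoCanonical` trait declared just below replaces `ArrayFlatten`, and `Canonical::into_arrow` above is the hand-off point to arrow-rs. A minimal sketch of the conversion path, assuming the downstream crate paths used elsewhere in this patch; the `into_primitive` shorthand comes from the `IntoArrayVariant` blanket impl that the earlier hunks migrate `flatten_primitive()` calls onto:

```rust
use vortex::array::primitive::PrimitiveArray;
use vortex::validity::Validity;
use vortex::{IntoArray, IntoArrayVariant, IntoCanonical};
use vortex_error::VortexResult;

fn canonicalize_example() -> VortexResult<()> {
    let array = PrimitiveArray::from_vec(vec![1i32, 2, 3], Validity::NonNullable).into_array();

    // Decompress to the canonical encoding for the array's DType, then convert to an
    // arrow-rs ArrayRef; for primitive arrays this is zero-copy.
    let _arrow_ref = array.clone().into_canonical()?.into_arrow();

    // Shorthand: IntoArrayVariant canonicalizes and unwraps the variant in one call.
    let primitive = array.into_primitive()?;
    assert_eq!(primitive.maybe_null_slice::<i32>(), &[1, 2, 3]);
    Ok(())
}
```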
+pub trait IntoCanonical { + fn into_canonical(self) -> VortexResult; +} + +/// Trait for types that can be converted from an owned type into an owned array variant. +/// +/// # Canonicalization +/// +/// This trait has a blanket implementation for all types implementing [IntoCanonical]. +pub trait IntoArrayVariant { + fn into_null(self) -> VortexResult; + + fn into_bool(self) -> VortexResult; + + fn into_primitive(self) -> VortexResult; + + fn into_struct(self) -> VortexResult; + + fn into_varbin(self) -> VortexResult; + + fn into_extension(self) -> VortexResult; +} + +impl IntoArrayVariant for T +where + T: IntoCanonical, +{ + fn into_null(self) -> VortexResult { + self.into_canonical()?.into_null() + } + + fn into_bool(self) -> VortexResult { + self.into_canonical()?.into_bool() + } + + fn into_primitive(self) -> VortexResult { + self.into_canonical()?.into_primitive() + } + + fn into_struct(self) -> VortexResult { + self.into_canonical()?.into_struct() + } + + fn into_varbin(self) -> VortexResult { + self.into_canonical()?.into_varbin() + } + + fn into_extension(self) -> VortexResult { + self.into_canonical()?.into_extension() + } +} + +/// IntoCanonical implementation for Array. +/// +/// Canonicalizing an array requires potentially decompressing, so this requires a roundtrip through +/// the array's internal codec. +impl IntoCanonical for Array { + fn into_canonical(self) -> VortexResult { + ArrayEncoding::canonicalize(self.encoding(), self) + } +} + +/// Implement the IntoArray for the [Canonical] type. +/// +/// This conversion is always "free" and should not touch underlying data. All it does is create an +/// owned pointer to the underlying concrete array type. +/// +/// This combined with the above [IntoCanonical] impl for [Array] allows simple two-way conversions +/// between arbitrary Vortex encodings and canonical Arrow-compatible encodings. +impl IntoArray for Canonical { + fn into_array(self) -> Array { + match self { + Self::Null(a) => a.into_array(), + Self::Bool(a) => a.into_array(), + Self::Primitive(a) => a.into_array(), + Self::Struct(a) => a.into_array(), + Self::VarBin(a) => a.into_array(), + Self::Extension(a) => a.into_array(), + } + } +} + +#[cfg(test)] +mod test { + use arrow_array::types::{Int64Type, UInt64Type}; + use arrow_array::{ + Array, PrimitiveArray as ArrowPrimitiveArray, StructArray as ArrowStructArray, + }; + use vortex_dtype::Nullability; + use vortex_scalar::Scalar; + + use crate::array::primitive::PrimitiveArray; + use crate::array::sparse::SparseArray; + use crate::array::struct_::StructArray; + use crate::validity::Validity; + use crate::{IntoArray, IntoCanonical}; + + #[test] + fn test_canonicalize_nested_struct() { + // Create a struct array with multiple internal components. + let nested_struct_array = StructArray::from_fields(&[ + ( + "a", + PrimitiveArray::from_vec(vec![1u64], Validity::NonNullable).into_array(), + ), + ( + "b", + StructArray::from_fields(&[( + "inner_a", + // The nested struct contains a SparseArray representing the primitive array + // [100i64, 100i64, 100i64] + // SparseArray is not a canonical type, so converting `into_arrow()` should map + // this to the nearest canonical type (PrimitiveArray). 
+ SparseArray::try_new( + PrimitiveArray::from_vec(vec![0u64; 1], Validity::NonNullable).into_array(), + PrimitiveArray::from_vec(vec![100i64], Validity::NonNullable).into_array(), + 1, + Scalar::primitive(0i64, Nullability::NonNullable), + ) + .unwrap() + .into_array(), + )]) + .into_array(), + ), + ]); + + let arrow_struct = nested_struct_array + .into_canonical() + .unwrap() + .into_arrow() + .as_any() + .downcast_ref::() + .cloned() + .unwrap(); + + assert!(arrow_struct + .column(0) + .as_any() + .downcast_ref::>() + .is_some()); + + let inner_struct = arrow_struct + .column(1) + .clone() + .as_any() + .downcast_ref::() + .cloned() + .unwrap() + .clone(); + + let inner_a = inner_struct + .column(0) + .as_any() + .downcast_ref::>(); + assert!(inner_a.is_some()); + + assert_eq!( + inner_a.cloned().unwrap(), + ArrowPrimitiveArray::from(vec![100i64]), + ); + } +} diff --git a/vortex-array/src/compress.rs b/vortex-array/src/compress.rs index 8e1d0843d..d4575f986 100644 --- a/vortex-array/src/compress.rs +++ b/vortex-array/src/compress.rs @@ -6,14 +6,14 @@ use vortex_error::{vortex_bail, VortexResult}; use crate::array::chunked::{Chunked, ChunkedArray}; use crate::array::constant::{Constant, ConstantArray}; -use crate::array::r#struct::{Struct, StructArray}; -use crate::compute::scalar_at::scalar_at; +use crate::array::struct_::{Struct, StructArray}; use crate::compute::slice::slice; +use crate::compute::unary::scalar_at::scalar_at; use crate::encoding::{ArrayEncoding, EncodingRef}; use crate::sampling::stratified_slices; use crate::stats::ArrayStatistics; use crate::validity::Validity; -use crate::{Array, ArrayDType, ArrayDef, ArrayFlatten, ArrayTrait, Context, IntoArray}; +use crate::{Array, ArrayDType, ArrayDef, ArrayTrait, Context, IntoArray, IntoCanonical}; pub trait EncodingCompression: ArrayEncoding { fn cost(&self) -> u8 { @@ -322,7 +322,7 @@ pub fn sampled_compression(array: &Array, compressor: &Compressor) -> VortexResu .collect::>>()?, array.dtype().clone(), )? - .flatten()? + .into_canonical()? .into_array(); find_best_compression(candidates, &sample, compressor)? diff --git a/vortex-array/src/compute/compare.rs b/vortex-array/src/compute/compare.rs index ceeede9fb..4231cca34 100644 --- a/vortex-array/src/compute/compare.rs +++ b/vortex-array/src/compute/compare.rs @@ -2,28 +2,29 @@ use vortex_dtype::DType; use vortex_error::{vortex_err, VortexResult}; use vortex_expr::Operator; -use crate::{Array, ArrayDType}; +use crate::{Array, ArrayDType, IntoArrayVariant}; pub trait CompareFn { fn compare(&self, array: &Array, predicate: Operator) -> VortexResult; } -pub fn compare(array: &Array, other: &Array, predicate: Operator) -> VortexResult { +pub fn compare(left: &Array, right: &Array, predicate: Operator) -> VortexResult { if let Some(matching_indices) = - array.with_dyn(|c| c.compare().map(|t| t.compare(other, predicate))) + left.with_dyn(|lhs| lhs.compare().map(|rhs| rhs.compare(right, predicate))) { return matching_indices; } + // if compare is not implemented for the given array type, but the array has a numeric // DType, we can flatten the array and apply filter to the flattened primitive array - match array.dtype() { + match left.dtype() { DType::Primitive(..) 
=> { - let flat = array.clone().flatten_primitive()?; - flat.compare(other, predicate) + let flat = left.clone().into_primitive()?; + flat.compare(right, predicate) } _ => Err(vortex_err!( NotImplemented: "compare", - array.encoding().id() + left.encoding().id() )), } } diff --git a/vortex-array/src/compute/filter_indices.rs b/vortex-array/src/compute/filter_indices.rs index cf40e30c0..d78b4da2b 100644 --- a/vortex-array/src/compute/filter_indices.rs +++ b/vortex-array/src/compute/filter_indices.rs @@ -2,7 +2,7 @@ use vortex_dtype::DType; use vortex_error::{vortex_err, VortexResult}; use vortex_expr::Disjunction; -use crate::{Array, ArrayDType}; +use crate::{Array, ArrayDType, IntoArrayVariant}; pub trait FilterIndicesFn { fn filter_indices(&self, predicate: &Disjunction) -> VortexResult; @@ -18,7 +18,7 @@ pub fn filter_indices(array: &Array, predicate: &Disjunction) -> VortexResult { - let flat = array.clone().flatten_primitive()?; + let flat = array.clone().into_primitive()?; flat.filter_indices(predicate) } _ => Err(vortex_err!( diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index 56fd00489..057bd6cdd 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -1,25 +1,20 @@ -use cast::CastFn; use compare::CompareFn; -use fill::FillForwardFn; -use patch::PatchFn; -use scalar_at::ScalarAtFn; use search_sorted::SearchSortedFn; use slice::SliceFn; use take::TakeFn; -use crate::compute::filter_indices::FilterIndicesFn; -use crate::compute::scalar_subtract::SubtractScalarFn; +use self::filter_indices::FilterIndicesFn; +use self::unary::cast::CastFn; +use self::unary::fill_forward::FillForwardFn; +use self::unary::scalar_at::ScalarAtFn; +use self::unary::scalar_subtract::SubtractScalarFn; -pub mod cast; pub mod compare; -pub mod fill; pub mod filter_indices; -pub mod patch; -pub mod scalar_at; -pub mod scalar_subtract; pub mod search_sorted; pub mod slice; pub mod take; +pub mod unary; pub trait ArrayCompute { fn cast(&self) -> Option<&dyn CastFn> { @@ -38,10 +33,6 @@ pub trait ArrayCompute { None } - fn patch(&self) -> Option<&dyn PatchFn> { - None - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { None } diff --git a/vortex-array/src/compute/patch.rs b/vortex-array/src/compute/patch.rs deleted file mode 100644 index e77792594..000000000 --- a/vortex-array/src/compute/patch.rs +++ /dev/null @@ -1,28 +0,0 @@ -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::{Array, ArrayDType}; - -pub trait PatchFn { - fn patch(&self, patch: &Array) -> VortexResult; -} - -/// Returns a new array where the non-null values from the patch array are replaced in the original. 
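The `compute/mod.rs` hunk above folds the element-wise kernels into a new `compute::unary` module and drops `patch` from `ArrayCompute` (the deleted `compute/patch.rs` continues below). A sketch of the migrated import path, assuming a downstream crate mirroring the vortex-ipc hunk later in this patch; only the module path changes, not the kernel's behaviour:

```rust
use vortex::array::primitive::PrimitiveArray;
// was: use vortex::compute::scalar_at::scalar_at;
use vortex::compute::unary::scalar_at::scalar_at;
use vortex::validity::Validity;
use vortex::IntoArray;
use vortex_error::VortexResult;
use vortex_scalar::Scalar;

fn first_element() -> VortexResult<Scalar> {
    let array = PrimitiveArray::from_vec(vec![7i32, 8, 9], Validity::NonNullable).into_array();
    // Same kernel, new path under compute::unary.
    scalar_at(&array, 0)
}
```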
-pub fn patch(array: &Array, patch: &Array) -> VortexResult { - if array.len() != patch.len() { - vortex_bail!( - "patch array {} must have the same length as the original array {}", - patch, - array - ); - } - - if array.dtype().as_nullable() != patch.dtype().as_nullable() { - vortex_bail!(MismatchedTypes: array.dtype(), patch.dtype()); - } - - array.with_dyn(|a| { - a.patch() - .map(|t| t.patch(patch)) - .unwrap_or_else(|| Err(vortex_err!(NotImplemented: "take", array.encoding().id()))) - }) -} diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 9dd6834fd..a60bfaf0a 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -4,7 +4,7 @@ use std::cmp::Ordering::{Equal, Greater, Less}; use vortex_error::{vortex_err, VortexResult}; use vortex_scalar::Scalar; -use crate::compute::scalar_at::scalar_at; +use crate::compute::unary::scalar_at::scalar_at; use crate::{Array, ArrayDType}; #[derive(Debug, Copy, Clone)] diff --git a/vortex-array/src/compute/take.rs b/vortex-array/src/compute/take.rs index 8dfa6b62c..ca8489da9 100644 --- a/vortex-array/src/compute/take.rs +++ b/vortex-array/src/compute/take.rs @@ -1,7 +1,7 @@ use log::info; use vortex_error::{vortex_err, VortexResult}; -use crate::{Array, IntoArray}; +use crate::{Array, IntoArray, IntoCanonical}; pub trait TakeFn { fn take(&self, indices: &Array) -> VortexResult; @@ -15,7 +15,7 @@ pub fn take(array: &Array, indices: &Array) -> VortexResult { // Otherwise, flatten and try again. info!("TakeFn not implemented for {}, flattening", array); - array.clone().flatten()?.into_array().with_dyn(|a| { + array.clone().into_canonical()?.into_array().with_dyn(|a| { a.take() .map(|t| t.take(indices)) .unwrap_or_else(|| Err(vortex_err!(NotImplemented: "take", array.encoding().id()))) diff --git a/vortex-array/src/compute/cast.rs b/vortex-array/src/compute/unary/cast.rs similarity index 73% rename from vortex-array/src/compute/cast.rs rename to vortex-array/src/compute/unary/cast.rs index 27d5e6f4f..332e95409 100644 --- a/vortex-array/src/compute/cast.rs +++ b/vortex-array/src/compute/unary/cast.rs @@ -7,7 +7,10 @@ pub trait CastFn { fn cast(&self, dtype: &DType) -> VortexResult; } -pub fn cast(array: &Array, dtype: &DType) -> VortexResult { +/// Attempt to cast an array to a desired DType. +/// +/// Some array support the ability to narrow or upcast. 
+pub fn try_cast(array: &Array, dtype: &DType) -> VortexResult { if array.dtype() == dtype { return Ok(array.clone()); } diff --git a/vortex-array/src/compute/fill.rs b/vortex-array/src/compute/unary/fill_forward.rs similarity index 100% rename from vortex-array/src/compute/fill.rs rename to vortex-array/src/compute/unary/fill_forward.rs diff --git a/vortex-array/src/compute/unary/mod.rs b/vortex-array/src/compute/unary/mod.rs new file mode 100644 index 000000000..f85f1cf74 --- /dev/null +++ b/vortex-array/src/compute/unary/mod.rs @@ -0,0 +1,4 @@ +pub mod cast; +pub mod fill_forward; +pub mod scalar_at; +pub mod scalar_subtract; diff --git a/vortex-array/src/compute/scalar_at.rs b/vortex-array/src/compute/unary/scalar_at.rs similarity index 100% rename from vortex-array/src/compute/scalar_at.rs rename to vortex-array/src/compute/unary/scalar_at.rs diff --git a/vortex-array/src/compute/scalar_subtract.rs b/vortex-array/src/compute/unary/scalar_subtract.rs similarity index 91% rename from vortex-array/src/compute/scalar_subtract.rs rename to vortex-array/src/compute/unary/scalar_subtract.rs index 887d072c8..f875b4f8f 100644 --- a/vortex-array/src/compute/scalar_subtract.rs +++ b/vortex-array/src/compute/unary/scalar_subtract.rs @@ -2,7 +2,7 @@ use vortex_dtype::DType; use vortex_error::{vortex_err, VortexResult}; use vortex_scalar::Scalar; -use crate::{Array, ArrayDType}; +use crate::{Array, ArrayDType, IntoArrayVariant}; pub trait SubtractScalarFn { fn subtract_scalar(&self, to_subtract: &Scalar) -> VortexResult; @@ -20,7 +20,7 @@ pub fn subtract_scalar(array: &Array, to_subtract: &Scalar) -> VortexResult { // TODO(@jcasale): pass array instead of ref to get rid of clone? // downside is that subtract_scalar then consumes the array, which is not great - let flat = array.clone().flatten_primitive()?; + let flat = array.clone().into_primitive()?; flat.subtract_scalar(to_subtract) } _ => Err(vortex_err!( diff --git a/vortex-array/src/context.rs b/vortex-array/src/context.rs index 99fbecb3f..16652d3e5 100644 --- a/vortex-array/src/context.rs +++ b/vortex-array/src/context.rs @@ -5,8 +5,8 @@ use crate::array::chunked::ChunkedEncoding; use crate::array::constant::ConstantEncoding; use crate::array::extension::ExtensionEncoding; use crate::array::primitive::PrimitiveEncoding; -use crate::array::r#struct::StructEncoding; use crate::array::sparse::SparseEncoding; +use crate::array::struct_::StructEncoding; use crate::array::varbin::VarBinEncoding; use crate::array::varbinview::VarBinViewEncoding; use crate::encoding::EncodingRef; diff --git a/vortex-array/src/encoding.rs b/vortex-array/src/encoding.rs index 9309b283f..ed5505f6f 100644 --- a/vortex-array/src/encoding.rs +++ b/vortex-array/src/encoding.rs @@ -4,8 +4,8 @@ use std::hash::{Hash, Hasher}; use vortex_error::VortexResult; +use crate::canonical::{Canonical, IntoCanonical}; use crate::compress::EncodingCompression; -use crate::flatten::{ArrayFlatten, Flattened}; use crate::ArrayDef; use crate::{Array, ArrayTrait}; @@ -39,7 +39,7 @@ pub trait ArrayEncoding: 'static + Sync + Send + Debug { fn id(&self) -> EncodingId; /// Flatten the given array. 
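Above, `cast` is renamed to `try_cast` as part of the move under `compute::unary`: it returns the input unchanged when the `DType` already matches and otherwise dispatches to the encoding's `CastFn`. A minimal sketch mirroring the call sites in the vortex-ipc hunk later in this patch (downstream crate paths assumed), before the diff continues with the `ArrayEncoding::flatten` → `canonicalize` rename below:

```rust
use vortex::array::primitive::PrimitiveArray;
use vortex::compute::unary::cast::try_cast;
use vortex::validity::Validity;
use vortex::{Array, IntoArray, IntoArrayVariant};
use vortex_dtype::PType;
use vortex_error::VortexResult;

fn widen_to_u64(array: &Array) -> VortexResult<Vec<u64>> {
    // Cast (here: widen u32 -> u64), then canonicalize to primitive to read the values.
    let widened = try_cast(array, PType::U64.into())?.into_primitive()?;
    Ok(widened.maybe_null_slice::<u64>().to_vec())
}

fn example() -> VortexResult<Vec<u64>> {
    let array = PrimitiveArray::from_vec(vec![1u32, 2, 3], Validity::NonNullable).into_array();
    widen_to_u64(&array)
}
```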
- fn flatten(&self, array: Array) -> VortexResult; + fn canonicalize(&self, array: Array) -> VortexResult; /// Unwrap the provided array into an implementation of ArrayTrait fn with_dyn( @@ -68,9 +68,9 @@ impl Hash for dyn ArrayEncoding + '_ { pub trait ArrayEncodingExt { type D: ArrayDef; - fn flatten(array: Array) -> VortexResult { + fn into_canonical(array: Array) -> VortexResult { let typed = <::Array as TryFrom>::try_from(array)?; - ArrayFlatten::flatten(typed) + IntoCanonical::into_canonical(typed) } #[inline] diff --git a/vortex-array/src/flatten.rs b/vortex-array/src/flatten.rs deleted file mode 100644 index 86cf90722..000000000 --- a/vortex-array/src/flatten.rs +++ /dev/null @@ -1,264 +0,0 @@ -use std::sync::Arc; - -use arrow_array::types::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; -use arrow_array::{ - ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray as ArrowBoolArray, LargeBinaryArray, - LargeStringArray, NullArray as ArrowNullArray, PrimitiveArray as ArrowPrimitiveArray, - StringArray, StructArray as ArrowStructArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, -}; -use arrow_buffer::ScalarBuffer; -use arrow_schema::{Field, Fields}; -use vortex_dtype::{DType, PType}; -use vortex_error::VortexResult; - -use crate::array::bool::BoolArray; -use crate::array::datetime::{LocalDateTimeArray, TimeUnit}; -use crate::array::extension::ExtensionArray; -use crate::array::null::NullArray; -use crate::array::primitive::PrimitiveArray; -use crate::array::r#struct::StructArray; -use crate::array::varbin::VarBinArray; -use crate::arrow::wrappers::as_offset_buffer; -use crate::compute::cast::cast; -use crate::encoding::ArrayEncoding; -use crate::validity::ArrayValidity; -use crate::{Array, ArrayDType, ArrayTrait, IntoArray, ToArray}; - -/// The set of encodings that can be converted to Arrow with zero-copy. -pub enum Flattened { - Null(NullArray), - Bool(BoolArray), - Primitive(PrimitiveArray), - Struct(StructArray), - VarBin(VarBinArray), - // TODO(aduffy): VarBinView is being disabled until execution engines improve their - // support for them, or we build better execution kernels of our own. - // Once DataFusion completes https://github.com/apache/datafusion/issues/10918, we should - // flip this to be the preferred "flat" encoding for all string and binary types. - // VarBinView(VarBinViewArray), - Extension(ExtensionArray), -} - -impl Flattened { - /// Convert a flat array into its equivalent [ArrayRef](Arrow array). - /// - /// Scalar arrays such as Bool and Primitive flattened arrays though should convert with - /// zero copies, while more complex variants such as Struct may require allocations if its child - /// arrays require decompression. 
- pub fn into_arrow(self) -> ArrayRef { - match self { - Flattened::Null(a) => null_to_arrow(a), - Flattened::Bool(a) => bool_to_arrow(a), - Flattened::Primitive(a) => primitive_to_arrow(a), - Flattened::Struct(a) => struct_to_arrow(a), - Flattened::VarBin(a) => varbin_to_arrow(a), - Flattened::Extension(a) => match a.id().as_ref() { - "vortex.localdatetime" => local_date_time_to_arrow( - LocalDateTimeArray::try_from(&a.into_array()).expect("localdatetime"), - ), - _ => panic!("unsupported extension dtype with ID {}", a.id().as_ref()), - }, - } - } -} - -fn null_to_arrow(null_array: NullArray) -> ArrayRef { - Arc::new(ArrowNullArray::new(null_array.len())) -} - -fn bool_to_arrow(bool_array: BoolArray) -> ArrayRef { - Arc::new(ArrowBoolArray::new( - bool_array.boolean_buffer(), - bool_array - .logical_validity() - .to_null_buffer() - .expect("null buffer"), - )) -} - -fn primitive_to_arrow(primitive_array: PrimitiveArray) -> ArrayRef { - fn as_arrow_array_primitive( - array: &PrimitiveArray, - ) -> ArrowPrimitiveArray { - ArrowPrimitiveArray::new( - ScalarBuffer::::new(array.buffer().clone().into(), 0, array.len()), - array - .logical_validity() - .to_null_buffer() - .expect("null buffer"), - ) - } - - match primitive_array.ptype() { - PType::U8 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::U16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::U32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::U64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::I8 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::I16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::I32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::I64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::F16 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::F32 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - PType::F64 => Arc::new(as_arrow_array_primitive::(&primitive_array)), - } -} - -fn struct_to_arrow(struct_array: StructArray) -> ArrayRef { - let field_arrays: Vec = struct_array - .children() - .map(|f| f.flatten().unwrap().into_arrow()) - .collect(); - - let arrow_fields: Fields = struct_array - .names() - .iter() - .zip(field_arrays.iter()) - .zip(struct_array.dtypes().iter()) - .map(|((name, arrow_field), vortex_field)| { - Field::new( - &**name, - arrow_field.data_type().clone(), - vortex_field.is_nullable(), - ) - }) - .map(Arc::new) - .collect(); - - Arc::new(ArrowStructArray::new(arrow_fields, field_arrays, None)) -} - -fn varbin_to_arrow(varbin_array: VarBinArray) -> ArrayRef { - let offsets = varbin_array - .offsets() - .flatten_primitive() - .expect("flatten_primitive"); - let offsets = match offsets.ptype() { - PType::I32 | PType::I64 => offsets, - // Unless it's u64, everything else can be converted into an i32. - // FIXME(ngates): do not copy offsets again - PType::U64 => cast(&offsets.to_array(), PType::I64.into()) - .expect("cast to i64") - .flatten_primitive() - .expect("flatten_primitive"), - _ => cast(&offsets.to_array(), PType::I32.into()) - .expect("cast to i32") - .flatten_primitive() - .expect("flatten_primitive"), - }; - let nulls = varbin_array - .logical_validity() - .to_null_buffer() - .expect("null buffer"); - - let data = varbin_array - .bytes() - .flatten_primitive() - .expect("flatten_primitive"); - assert_eq!(data.ptype(), PType::U8); - let data = data.buffer(); - - // Switch on Arrow DType. 
- match varbin_array.dtype() { - DType::Binary(_) => match offsets.ptype() { - PType::I32 => Arc::new(BinaryArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - PType::I64 => Arc::new(LargeBinaryArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - _ => panic!("Invalid offsets type"), - }, - DType::Utf8(_) => match offsets.ptype() { - PType::I32 => Arc::new(StringArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - PType::I64 => Arc::new(LargeStringArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - _ => panic!("Invalid offsets type"), - }, - _ => panic!( - "expected utf8 or binary instead of {}", - varbin_array.dtype() - ), - } -} - -fn local_date_time_to_arrow(local_date_time_array: LocalDateTimeArray) -> ArrayRef { - // A LocalDateTime maps to an Arrow Timestamp array with no timezone. - let timestamps = cast(&local_date_time_array.timestamps(), PType::I64.into()) - .expect("timestamps must cast to i64") - .flatten_primitive() - .expect("must be i64 array"); - let validity = timestamps - .logical_validity() - .to_null_buffer() - .expect("null buffer"); - let timestamps_len = timestamps.len(); - let buffer = ScalarBuffer::::new(timestamps.into_buffer().into(), 0, timestamps_len); - - match local_date_time_array.time_unit() { - TimeUnit::Ns => Arc::new(TimestampNanosecondArray::new(buffer, validity)), - TimeUnit::Us => Arc::new(TimestampMicrosecondArray::new(buffer, validity)), - TimeUnit::Ms => Arc::new(TimestampMillisecondArray::new(buffer, validity)), - TimeUnit::S => Arc::new(TimestampSecondArray::new(buffer, validity)), - } -} - -/// Support trait for transmuting an array into its [vortex_dtype::DType]'s canonical encoding. -/// -/// Flattening an Array ensures that the array's encoding matches one of the builtin canonical -/// encodings, each of which has a corresponding [Flattened] variant. -/// -/// **Important**: DType remains the same before and after a flatten operation. -pub trait ArrayFlatten { - fn flatten(self) -> VortexResult; -} - -impl Array { - pub fn flatten(self) -> VortexResult { - ArrayEncoding::flatten(self.encoding(), self) - } - - pub fn flatten_extension(self) -> VortexResult { - ExtensionArray::try_from(self.flatten()?.into_array()) - } - - pub fn flatten_bool(self) -> VortexResult { - BoolArray::try_from(self.flatten()?.into_array()) - } - - pub fn flatten_primitive(self) -> VortexResult { - PrimitiveArray::try_from(self.flatten()?.into_array()) - } - - pub fn flatten_varbin(self) -> VortexResult { - VarBinArray::try_from(self.flatten()?.into_array()) - } -} - -impl IntoArray for Flattened { - fn into_array(self) -> Array { - match self { - Self::Null(a) => a.into_array(), - Self::Bool(a) => a.into_array(), - Self::Primitive(a) => a.into_array(), - Self::Struct(a) => a.into_array(), - Self::VarBin(a) => a.into_array(), - Self::Extension(a) => a.into_array(), - } - } -} diff --git a/vortex-array/src/implementation.rs b/vortex-array/src/implementation.rs index d9c53a69e..78a3b2242 100644 --- a/vortex-array/src/implementation.rs +++ b/vortex-array/src/implementation.rs @@ -35,7 +35,6 @@ macro_rules! impl_encoding { ArrayMetadata, ArrayTrait, AsArray, - Flattened, GetArrayMetadata, IntoArray, ToArray, @@ -146,8 +145,8 @@ macro_rules! 
impl_encoding { } #[inline] - fn flatten(&self, array: Array) -> VortexResult { - ::flatten(array) + fn canonicalize(&self, array: Array) -> VortexResult<$crate::Canonical> { + ::into_canonical(array) } #[inline] diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index b62397527..a48683171 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -6,15 +6,15 @@ //! exploits the particular data distribution of the array's values. //! //! Every data type recognized by Vortex also has a canonical physical encoding format, which -//! arrays can be [flattened](Flattened) into for ease of access in compute functions. +//! arrays can be [canonicalized](Canonical) into for ease of access in compute functions. //! use std::fmt::{Debug, Display, Formatter}; use std::future::ready; pub use ::paste; +pub use canonical::*; pub use context::*; pub use data::*; -pub use flatten::*; pub use implementation::*; use itertools::Itertools; pub use metadata::*; @@ -35,12 +35,12 @@ use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; pub mod accessor; pub mod array; pub mod arrow; +mod canonical; pub mod compress; pub mod compute; mod context; mod data; pub mod encoding; -mod flatten; mod implementation; pub mod iter; mod metadata; @@ -223,7 +223,7 @@ pub trait ArrayTrait: ArrayEncodingRef + ArrayCompute + ArrayDType - + ArrayFlatten + + IntoCanonical + ArrayValidity + AcceptArrayVisitor + ArrayStatistics diff --git a/vortex-array/src/stream/take_rows.rs b/vortex-array/src/stream/take_rows.rs index 86f19ac6e..f4fd51a73 100644 --- a/vortex-array/src/stream/take_rows.rs +++ b/vortex-array/src/stream/take_rows.rs @@ -7,14 +7,14 @@ use vortex_dtype::match_each_integer_ptype; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; -use crate::compute::scalar_subtract::subtract_scalar; use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; use crate::compute::slice::slice; use crate::compute::take::take; +use crate::compute::unary::scalar_subtract::subtract_scalar; use crate::stats::{ArrayStatistics, Stat}; use crate::stream::ArrayStream; -use crate::IntoArray; use crate::{Array, ArrayDType}; +use crate::{IntoArray, IntoCanonical}; #[pin_project] pub struct TakeRows<'idx, R: ArrayStream> { @@ -92,7 +92,9 @@ impl<'idx, R: ArrayStream> Stream for TakeRows<'idx, R> { // TODO(ngates): this is probably too heavy to run on the event loop. We should spawn // onto a worker pool. - let indices_for_batch = slice(this.indices, left, right)?.flatten_primitive()?; + let indices_for_batch = slice(this.indices, left, right)? + .into_canonical()? + .into_primitive()?; let shifted_arr = match_each_integer_ptype!(indices_for_batch.ptype(), |$T| { subtract_scalar(&indices_for_batch.into_array(), &Scalar::from(curr_offset as $T))? 
}); diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index 196f0f501..a4353d8af 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -4,11 +4,11 @@ use vortex_dtype::{DType, Nullability}; use vortex_error::{vortex_bail, VortexResult}; use crate::array::bool::BoolArray; -use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; use crate::compute::take::take; +use crate::compute::unary::scalar_at::scalar_at; use crate::stats::ArrayStatistics; -use crate::{Array, IntoArray}; +use crate::{Array, Canonical, IntoArray, IntoArrayVariant, IntoCanonical}; pub trait ArrayValidity { fn is_valid(&self, index: usize) -> bool; @@ -144,8 +144,18 @@ impl PartialEq for Validity { (Self::AllValid, Self::AllValid) => true, (Self::AllInvalid, Self::AllInvalid) => true, (Self::Array(a), Self::Array(b)) => { - a.clone().flatten_bool().unwrap().boolean_buffer() - == b.clone().flatten_bool().unwrap().boolean_buffer() + a.clone() + .into_canonical() + .unwrap() + .into_bool() + .unwrap() + .boolean_buffer() + == b.clone() + .into_canonical() + .unwrap() + .into_bool() + .unwrap() + .boolean_buffer() } _ => false, } @@ -202,7 +212,8 @@ impl FromIterator for Validity { LogicalValidity::AllValid(count) => BooleanBuffer::new_set(count), LogicalValidity::AllInvalid(count) => BooleanBuffer::new_unset(count), LogicalValidity::Array(array) => array - .flatten_bool() + .into_canonical() + .and_then(Canonical::into_bool) .expect("validity must flatten to BoolArray") .boolean_buffer(), }; @@ -234,7 +245,7 @@ impl LogicalValidity { Self::AllValid(_) => Ok(None), Self::AllInvalid(l) => Ok(Some(NullBuffer::new_null(*l))), Self::Array(a) => Ok(Some(NullBuffer::new( - a.clone().flatten_bool()?.boolean_buffer(), + a.clone().into_bool()?.boolean_buffer(), ))), } } @@ -243,7 +254,7 @@ impl LogicalValidity { match self { Self::AllValid(l) => Ok(NullBuffer::new_valid(*l)), Self::AllInvalid(l) => Ok(NullBuffer::new_null(*l)), - Self::Array(a) => Ok(NullBuffer::new(a.clone().flatten_bool()?.boolean_buffer())), + Self::Array(a) => Ok(NullBuffer::new(a.clone().into_bool()?.boolean_buffer())), } } diff --git a/vortex-datafusion/src/lib.rs b/vortex-datafusion/src/lib.rs index 6ce6c2af4..ba7330848 100644 --- a/vortex-datafusion/src/lib.rs +++ b/vortex-datafusion/src/lib.rs @@ -9,9 +9,11 @@ use std::task::{Context, Poll}; use arrow_array::{RecordBatch, StructArray as ArrowStructArray}; use arrow_schema::SchemaRef; use async_trait::async_trait; +use datafusion::dataframe::DataFrame; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::prelude::SessionContext; use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError, Result as DFResult}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType}; use datafusion_physical_expr::EquivalenceProperties; @@ -21,8 +23,8 @@ use datafusion_physical_plan::{ use futures::{Stream, StreamExt}; use pin_project::pin_project; use vortex::array::chunked::ChunkedArray; -use vortex::array::r#struct::StructArray; -use vortex::{Array, ArrayDType, ArrayFlatten, IntoArray}; +use vortex::array::struct_::StructArray; +use vortex::{Array, ArrayDType, IntoArray, IntoCanonical}; use vortex_dtype::DType; use vortex_error::{vortex_bail, VortexResult}; @@ -30,6 +32,24 @@ use crate::datatype::infer_schema; mod datatype; +pub trait SessionContextExt { + fn read_vortex(&self, array: Array) -> 
DFResult; +} + +impl SessionContextExt for SessionContext { + fn read_vortex(&self, array: Array) -> DFResult { + assert!( + matches!(array.dtype(), DType::Struct(_, _)), + "Vortex arrays must have struct type" + ); + + let vortex_table = VortexInMemoryTableProvider::try_new(array) + .map_err(|error| DataFusionError::Internal(format!("vortex error: {error}")))?; + + self.read_table(Arc::new(vortex_table)) + } +} + /// A [`TableProvider`] that exposes an existing Vortex Array to the DataFusion SQL engine. /// /// Only arrays that have a top-level [struct type](vortex_dtype::StructDType) can be exposed as @@ -139,7 +159,7 @@ impl VortexMemoryExec { _context: Arc, ) -> DFResult { let data = array - .flatten() + .into_canonical() .map_err(|vortex_error| DataFusionError::Execution(format!("{}", vortex_error)))? .into_array(); @@ -160,7 +180,7 @@ impl VortexMemoryExec { })?; let batch = RecordBatch::from( projected_struct - .flatten() + .into_canonical() .expect("struct arrays must flatten") .into_arrow() .as_any() @@ -247,61 +267,66 @@ impl ExecutionPlan for VortexMemoryExec { #[cfg(test)] mod test { - use std::sync::Arc; - + use arrow_array::types::Int64Type; use datafusion::arrow::array::AsArray; - use datafusion::arrow::datatypes::UInt64Type; use datafusion::prelude::SessionContext; + use datafusion_expr::{col, count_distinct, lit}; use vortex::array::primitive::PrimitiveArray; - use vortex::array::r#struct::StructArray; + use vortex::array::struct_::StructArray; use vortex::array::varbin::VarBinArray; use vortex::validity::Validity; use vortex::IntoArray; - use vortex_dtype::{DType, FieldName, Nullability}; + use vortex_dtype::{DType, Nullability}; - use crate::VortexInMemoryTableProvider; + use crate::SessionContextExt; #[tokio::test] async fn test_datafusion_simple() { let names = VarBinArray::from_vec( - vec!["Washington", "Adams", "Jefferson", "Madison", "Monroe"], + vec![ + "Washington", + "Adams", + "Jefferson", + "Madison", + "Monroe", + "Adams", + ], DType::Utf8(Nullability::NonNullable), ); - let term_start = - PrimitiveArray::from_vec(vec![1789u16, 1797, 1801, 1809, 1817], Validity::NonNullable); - let presidents = StructArray::try_new( - Arc::new([FieldName::from("president"), FieldName::from("term_start")]), - vec![names.into_array(), term_start.into_array()], - 5, + let term_start = PrimitiveArray::from_vec( + vec![1789u16, 1797, 1801, 1809, 1817, 1825], Validity::NonNullable, - ) - .unwrap(); + ); - let presidents_table = - Arc::new(VortexInMemoryTableProvider::try_new(presidents.into_array()).unwrap()); - let session_ctx = SessionContext::new(); + let presidents = StructArray::from_fields(&[ + ("president", names.into_array()), + ("term_start", term_start.into_array()), + ]) + .into_array(); - session_ctx - .register_table("presidents", presidents_table) - .unwrap(); + let ctx = SessionContext::new(); - let df_term_start = session_ctx - .sql("SELECT SUM(term_start) FROM presidents WHERE president <> 'Madison'") - .await + let df = ctx.read_vortex(presidents).unwrap(); + + let distinct_names = df + .filter(col("term_start").gt_eq(lit(1795))) + .unwrap() + .aggregate(vec![], vec![count_distinct(col("president"))]) .unwrap() .collect() .await .unwrap(); - assert_eq!(df_term_start.len(), 1); + assert_eq!(distinct_names.len(), 1); + assert_eq!( - *df_term_start[0] + *distinct_names[0] .column(0) - .as_primitive::() + .as_primitive::() .values() .first() .unwrap(), - vec![1789u64, 1797, 1801, 1817].into_iter().sum::() + 4i64 ); } } diff --git a/vortex-dtype/src/dtype.rs 
b/vortex-dtype/src/dtype.rs index c4e382207..a7b995a82 100644 --- a/vortex-dtype/src/dtype.rs +++ b/vortex-dtype/src/dtype.rs @@ -13,6 +13,10 @@ pub type FieldNames = Arc<[FieldName]>; pub type Metadata = Vec; +/// Array logical types. +/// +/// Vortex arrays preserve a single logical type, while the encodings allow for multiple +/// physical types to encode that type. #[derive(Debug, Clone, PartialOrd, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum DType { diff --git a/vortex-ipc/src/chunked_reader/take_rows.rs b/vortex-ipc/src/chunked_reader/take_rows.rs index efde6c998..c3ceb242c 100644 --- a/vortex-ipc/src/chunked_reader/take_rows.rs +++ b/vortex-ipc/src/chunked_reader/take_rows.rs @@ -10,14 +10,14 @@ use futures_util::TryStreamExt; use itertools::Itertools; use vortex::array::chunked::ChunkedArray; use vortex::array::primitive::PrimitiveArray; -use vortex::compute::cast::cast; -use vortex::compute::scalar_subtract::subtract_scalar; use vortex::compute::search_sorted::{search_sorted, SearchSortedSide}; use vortex::compute::slice::slice; use vortex::compute::take::take; +use vortex::compute::unary::cast::try_cast; +use vortex::compute::unary::scalar_subtract::subtract_scalar; use vortex::stats::ArrayStatistics; use vortex::stream::ArrayStreamExt; -use vortex::{Array, ArrayDType, IntoArray}; +use vortex::{Array, ArrayDType, IntoArray, IntoCanonical}; use vortex_dtype::PType; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; @@ -72,8 +72,12 @@ impl ChunkedArrayReader { .collect_vec(), ) .into_array(); - let start_rows = take(&self.row_offsets, &start_chunks)?.flatten_primitive()?; - let start_bytes = take(&self.byte_offsets, &start_chunks)?.flatten_primitive()?; + let start_rows = take(&self.row_offsets, &start_chunks)? + .into_canonical()? + .into_primitive()?; + let start_bytes = take(&self.byte_offsets, &start_chunks)? + .into_canonical()? + .into_primitive()?; let stop_chunks = PrimitiveArray::from( coalesced_chunks @@ -82,8 +86,12 @@ impl ChunkedArrayReader { .collect_vec(), ) .into_array(); - let stop_rows = take(&self.row_offsets, &stop_chunks)?.flatten_primitive()?; - let stop_bytes = take(&self.byte_offsets, &stop_chunks)?.flatten_primitive()?; + let stop_rows = take(&self.row_offsets, &stop_chunks)? + .into_canonical()? + .into_primitive()?; + let stop_bytes = take(&self.byte_offsets, &stop_chunks)? + .into_canonical()? + .into_primitive()?; // For each chunk-range, read the data as an ArrayStream and call take on it. let mut chunks = vec![]; @@ -159,9 +167,13 @@ impl ChunkedArrayReader { fn find_chunks(row_offsets: &Array, indices: &Array) -> VortexResult> { // TODO(ngates): lots of optimizations to be had here, potentially lots of push-down. // For now, we just flatten everything into primitive arrays and iterate. - let row_offsets = cast(row_offsets, PType::U64.into())?.flatten_primitive()?; + let row_offsets = try_cast(row_offsets, PType::U64.into())? + .into_canonical()? + .into_primitive()?; let _rows = format!("{:?}", row_offsets.maybe_null_slice::()); - let indices = cast(indices, PType::U64.into())?.flatten_primitive()?; + let indices = try_cast(indices, PType::U64.into())? + .into_canonical()? 
+ .into_primitive()?; let _indices = format!("{:?}", indices.maybe_null_slice::()); if let (Some(last_idx), Some(num_rows)) = ( @@ -213,7 +225,7 @@ mod test { use itertools::Itertools; use vortex::array::chunked::ChunkedArray; use vortex::array::primitive::PrimitiveArray; - use vortex::{ArrayTrait, IntoArray, ViewContext}; + use vortex::{ArrayTrait, IntoArray, IntoCanonical, ViewContext}; use vortex_buffer::Buffer; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -262,7 +274,8 @@ mod test { let result = reader .take_rows(&PrimitiveArray::from(vec![0u64, 10, 10_000 - 1]).into_array()) .await? - .flatten_primitive()?; + .into_canonical()? + .into_primitive()?; assert_eq!(result.len(), 3); assert_eq!(result.maybe_null_slice::(), &[0, 10, 999]); diff --git a/vortex-ipc/src/lib.rs b/vortex-ipc/src/lib.rs index 7a8750a6f..67415794c 100644 --- a/vortex-ipc/src/lib.rs +++ b/vortex-ipc/src/lib.rs @@ -147,7 +147,7 @@ pub mod test { assert_eq!(next.encoding().id(), PrimitiveEncoding.id()); assert_eq!( - next.into_primitive().maybe_null_slice::(), + next.as_primitive().maybe_null_slice::(), vec![2999989, 2999988, 2999987, 2999986, 2899999, 0, 0] ); assert_eq!( @@ -155,7 +155,7 @@ pub mod test { .try_next() .await? .expect("Expected a chunk") - .into_primitive() + .as_primitive() .maybe_null_slice::(), vec![5999999] );
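For reference, the vortex-datafusion hunk earlier in this patch adds a `SessionContextExt::read_vortex` extension so that a struct-typed Vortex array can be queried directly as a DataFusion `DataFrame`. A minimal usage sketch, assuming the `vortex_datafusion` crate name, a tokio runtime, and illustrative column values:

```rust
use datafusion::prelude::SessionContext;
use vortex::array::primitive::PrimitiveArray;
use vortex::array::struct_::StructArray;
use vortex::validity::Validity;
use vortex::IntoArray;
use vortex_datafusion::SessionContextExt;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let values = PrimitiveArray::from_vec(vec![1i64, 2, 3], Validity::NonNullable).into_array();
    // read_vortex asserts the array has a struct DType, so wrap columns in a StructArray.
    let array = StructArray::from_fields(&[("x", values)]).into_array();

    let ctx = SessionContext::new();
    let df = ctx.read_vortex(array)?;
    df.show().await?;
    Ok(())
}
```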