diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 95955d9dfe..5f2943b311 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,8 +3,7 @@ name: CI on: push: branches: [ "develop" ] - pull_request: - branches: [ "develop" ] + pull_request: { } workflow_dispatch: { } permissions: diff --git a/Cargo.lock b/Cargo.lock index 989887765d..3e848e4ac1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,9 +33,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -94,15 +94,6 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b3d0060af21e8d11a926981cc00c6c1541aa91dd64b9f881985c3da1094425f" -[[package]] -name = "argminmax" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52424b59d69d69d5056d508b260553afd91c57e21849579cd1f50ee8b8b88eaa" -dependencies = [ - "num-traits", -] - [[package]] name = "arrayref" version = "0.3.7" @@ -292,7 +283,7 @@ version = "50.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ff3e9c01f7cd169379d269f926892d0e622a704960350d09d331be3ec9e0029" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.5.0", ] [[package]] @@ -325,12 +316,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "atoi_simd" -version = "0.15.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" - [[package]] name = "autocfg" version = "1.1.0" @@ -384,7 +369,7 @@ version = "0.69.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.5.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -397,7 +382,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn", "which", ] @@ -409,15 +394,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "brotli" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -436,29 +421,15 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" [[package]] name = "bytemuck" -version = "1.14.3" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2ef034f05691a48569bd920a96c81b9d91bbad1ab5ac7c4616c1f6ef36cb79f" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.52", -] +checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" [[package]] name = "byteorder" @@ -555,9 +526,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.2" +version = "4.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" dependencies = [ "clap_builder", ] @@ -768,7 +739,7 @@ checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -808,18 +779,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "ethnum" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90ca2580b73ab6a1f724b76ca11ab632df820fd6040c336200d2c1df7b3c82c" - -[[package]] -name = "fast-float" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" - [[package]] name = "fastlanez-sys" version = "0.1.0" @@ -879,12 +838,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" -[[package]] -name = "foreign_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" - [[package]] name = "form_urlencoded" version = "1.2.1" @@ -949,10 +902,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", - "js-sys", "libc", "wasi", - "wasm-bindgen", ] [[package]] @@ -969,9 +920,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "4fbd2820c5e49886948654ab546d0688ff24530286bdcf8fca3cefb16d4618eb" dependencies = [ "bytes", "fnv", @@ -1005,7 +956,6 @@ checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ "ahash", "allocator-api2", - "rayon", ] [[package]] @@ -1344,7 +1294,7 @@ checksum = "adf157a4dc5a29b7b464aa8fe7edeff30076e07e13646a1c3874f58477dc99f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -1425,28 +1375,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "multiversion" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c7b9d7fe61760ce5ea19532ead98541f6b4c495d87247aff9826445cf6872a" -dependencies = [ - "multiversion-macros", - "target-features", -] - -[[package]] -name = "multiversion-macros" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26a83d8500ed06d68877e9de1dde76c1dbb83885dcdbda4ef44ccbc3fbda2ac8" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", - "target-features", -] - [[package]] name = "native-tls" version = "0.2.11" @@ -1557,16 +1485,6 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "num_enum" version = "0.7.2" @@ -1585,7 +1503,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -1624,7 +1542,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.5.0", "cfg-if", "foreign-types", "libc", @@ -1641,7 +1559,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -1790,143 +1708,6 @@ dependencies = [ "plotters-backend", ] -[[package]] -name = "polars-arrow" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faacd21a2548fa6d50c72d6b8d4649a8e029a0f3c6c5545b7f436f0610e49b0f" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "atoi_simd", - "bytemuck", - "chrono", - "dyn-clone", - "either", - "ethnum", - "fast-float", - "foreign_vec", - "getrandom", - "hashbrown", - "itoa", - "multiversion", - "num-traits", - "polars-error", - "polars-utils", - "ryu", - "simdutf8", - "streaming-iterator", - "strength_reduce", - "version_check", -] - -[[package]] -name = "polars-compute" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d9dc87f8003ae0edeef5ad9ac92b2a345480bbe17adad64496113ae84706dd" -dependencies = [ - "bytemuck", - "num-traits", - "polars-arrow", - "polars-error", - "polars-utils", - "version_check", -] - -[[package]] -name = "polars-core" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "befd4d280a82219a01035c4f901319ceba65998c594d0c64f9a439cdee1d7777" -dependencies = [ - "ahash", - "bitflags 2.4.2", - "bytemuck", - "either", - "hashbrown", - "indexmap", - "num-traits", - "once_cell", - "polars-arrow", - "polars-compute", - "polars-error", - "polars-row", - "polars-utils", - "rayon", - "smartstring", - "thiserror", - "version_check", - "xxhash-rust", -] - -[[package]] -name = "polars-error" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f2435b02d1ba36d8c1f6a722cad04e4c0b2705a3112c5706e6960d405d7798" -dependencies = [ - "simdutf8", - "thiserror", -] - -[[package]] -name = "polars-ops" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6395f5fd5e1adf016fd6403c0a493181c1a349a7a145b2687cdf50a0d630310a" -dependencies = [ - "ahash", - "argminmax", - "bytemuck", - "either", - "hashbrown", - "indexmap", - "memchr", - "num-traits", - "polars-arrow", - "polars-compute", - "polars-core", - "polars-error", - "polars-utils", - "rayon", - "regex", - "smartstring", - "version_check", -] - -[[package]] -name = "polars-row" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4984d97aad3d0db92afe76ebcab10b5e37a1216618b5703ae0d2917ccd6168c" -dependencies = [ - "polars-arrow", - "polars-error", - "polars-utils", -] - -[[package]] -name = "polars-utils" -version = "0.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f9c955bb1e9b55d835aeb7fe4e4e8826e01abe5f0ada979ceb7d2b9af7b569" -dependencies = [ - "ahash", - "bytemuck", - "hashbrown", - "indexmap", - "num-traits", - "once_cell", - "polars-error", - "rayon", - "smartstring", - "version_check", -] - [[package]] name = "portable-atomic" version = "1.6.0" @@ -1952,7 +1733,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn", ] [[package]] @@ -1966,9 +1747,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] @@ -2031,7 +1812,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -2044,7 +1825,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -2171,9 +1952,9 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "reqwest" -version = "0.11.24" +version = "0.11.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6920094eb85afde5e4a138be3f2de8bbdf28000f0029e72c45025a56b042251" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" dependencies = [ "base64", "bytes", @@ -2242,11 +2023,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.5.0", "errno", "libc", "linux-raw-sys", @@ -2344,7 +2125,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -2376,12 +2157,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" -[[package]] -name = "simdutf8" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" - [[package]] name = "simplelog" version = "0.12.2" @@ -2409,17 +2184,6 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" -[[package]] -name = "smartstring" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" -dependencies = [ - "autocfg", - "static_assertions", - "version_check", -] - [[package]] name = "snap" version = "1.1.1" @@ -2442,34 +2206,11 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" - -[[package]] -name = "strength_reduce" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" - [[package]] name = "syn" -version = "1.0.109" +version = "2.0.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032" dependencies = [ "proc-macro2", "quote", @@ -2503,12 +2244,6 @@ dependencies = [ "libc", ] -[[package]] -name = "target-features" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfb5fa503293557c5158bd215fdc225695e567a77e453f5d4452a50a193969bd" - [[package]] name = "target-lexicon" version = "0.12.14" @@ -2548,22 +2283,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] @@ -2654,7 +2389,6 @@ dependencies = [ "bytes", "libc", "mio", - "num_cpus", "pin-project-lite", "socket2", "windows-sys 0.48.0", @@ -2822,7 +2556,9 @@ name = "vortex-array" version = "0.1.0" dependencies = [ "allocator-api2", - "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", "dyn-clone", "half", "humansize", @@ -2833,9 +2569,6 @@ dependencies = [ "num-traits", "num_enum", "once_cell", - "polars-arrow", - "polars-core", - "polars-ops", "rand", "rayon", "roaring", @@ -2877,7 +2610,8 @@ dependencies = [ name = "vortex-ree" version = "0.1.0" dependencies = [ - "arrow", + "arrow-array", + "arrow-buffer", "half", "itertools 0.12.1", "linkme", @@ -2952,7 +2686,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn", "wasm-bindgen-shared", ] @@ -2986,7 +2720,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3210,12 +2944,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "xxhash-rust" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927da81e25be1e1a2901d59b81b37dd2efd1fc9c9345a55007f09bf5a2d3ee03" - [[package]] name = "zerocopy" version = "0.7.32" @@ -3233,7 +2961,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn", ] [[package]] diff --git a/LICENSE-APACHE b/LICENSE similarity index 100% rename from LICENSE-APACHE rename to LICENSE diff --git a/README.md b/README.md index 6f6a70ad8f..95c4ea25ca 100644 --- a/README.md +++ b/README.md @@ -5,46 +5,144 @@ [![Documentation](https://docs.rs/vortex-rs/badge.svg)](https://docs.rs/vortex-array) [![Rust](https://img.shields.io/badge/rust-1.76.0%2B-blue.svg?maxAge=3600)](https://github.com/fulcrum-so/vortex) -An in-memory format for 1-dimensional array data. +Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. We are using Vortex to develop a +next-generation file format for multidimensional arrays called Spiral. -Vortex is a maximally [Apache Arrow](https://arrow.apache.org/) compatible data format that aims to separate logical and -physical representation of data, and allow pluggable physical layout. +> [!CAUTION] +> This library is very much a work in progress! -Array operations are separately defined in terms of their semantics, dealing only with logical types and physical layout -that defines exact ways in which values are transformed. +The major components of Vortex are (will be!): -# Logical Types +* **Logical Types** - a schema definition that makes no assertions about physical layout. +* **Encodings** - a pluggable set of physical layouts. Vortex ships with several state-of-the-art lightweight +compression codecs that have the potential to support GPU decompression. +* **Compression** - recursive compression based on stratified samples of the input. +* **Compute** - basic compute kernels that can operate over compressed data. Note that Vortex does not intend to become + a full-fledged compute engine, but rather to provide the ability to implement basic compute operations as may be + required for efficient scanning & pushdown operations. +* **Statistics** - each array carries around lazily computed summary statistics, optionally populated at read-time. + These are available to compute kernels as well as to the compressor. +* **Serde** - zero-copy serialization. Designed to work well both on-disk and over-the-wire. -Vortex type system only conveys semantic meaning of the array data without prescribing physical layout. When operating -over arrays you can focus on semantics of the operation. Separately you can provide low level implementation dependent -on particular physical operation. +## Overview: Logical vs Physical -``` -Null: all null array -Bool: Single bit value -Integer: Fixed width signed/unsigned number. Supports 8, 16, 32, 64 bit widths -Float: Fixed width floating point number. Supports 16, 32, 64 bit float types -Decimal: Fixed width decimal with specified precision (total number of digits) and scale (number of digits after decimal point) -Instant: An instantaneous point on the time-line. Number of seconds/miliseconds/microseconds/nanoseconds from epoch -LocalDate: A date without a time-zone -LocalTime: A time without a time-zone -ZonedDateTime: A data and time including ISO-8601 timezone -List: Sequence of items of same type -Map: Key, value mapping -Struct: Named tuple of types -``` +One of the core principles in Vortex is separation of the logical from the physical. -# Physical Encodings +A Vortex array is defined by a logical data type (i.e., the type of scalar elements) as well as a physical encoding +(the type of the array itself). Vortex ships with several built-in encodings, as well as several extension encodings. -Vortex calls array implementations encodings, they encode the physical layout of the data. Encodings are recurisvely -nested, i.e. encodings contain other encodings. For every array you have their value data type and the its encoding that -defines how operations will be performed. By default necessary encodings to zero copy convert to and from Apache Arrow -are included in the package. +The built-in encodings are primarily designed to model the Apache Arrow in-memory format, enabling us to construct Vortex +arrays with zero-copy from Arrow arrays. There are also several built-in encodings (e.g., `sparse` and `chunked`) that +are useful building blocks for other encodings. +The included extension encodings are mostly designed to model compressed in-memory arrays, such as run-length or +dictionary encoding. -When performing operations they're dispatched on the encodings to provide specialized implementation. +## Components -## Compression +### Logical Types -The advantage of separating physical layout from the semantic of the data is compression. Vortex can compress data -without requiring changes to the logical operations. To support efficient data access we focus on lightweight -compression algorithms only falling back to general purpose compressors for binary data. +The Vortex type-system is still in flux. The current set of logical types is: + +* Null +* Bool +* Integer +* Float +* Decimal +* Binary +* UTF8 +* List +* Struct +* Date/Time/DateTime/Duration: TODO (in-progress, currently partially supported) +* FixedList: TODO +* Union: TODO + +### Canonical/Flat Encodings + +Vortex includes a base set of "flat" encodings that are designed to be zero-copy with Apache Arrow. These are the canonical +representations of each of the logical data types. The canonical encodings currently supported are: + +* Null +* Bool +* Primitive (Integer, Float) +* Struct +* VarBin +* VarBinView +* ...with more to come + +### Compressed Encodings + +Vortex includes a set of compressed encodings that can hold compression in-memory arrays allowing us to defer +compression. These are: + +* BitPacking +* Constant +* Chunked +* Dictionary +* Frame-of-Reference +* Run-end +* RoaringUInt +* RoaringBool +* Sparse +* ZigZag + +### Compression + +Vortex's compression scheme is based on the [BtrBlocks](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf) paper. + +Roughly, for each chunk of data a sample is taken and a set of encodings are attempted. The best-performing encoding +is then chosen to encode the entire chunk. This sounds like it would be very expensive, but given basic statistics +about a chunk, it is possible to cheaply rule out many encodings and ensure the search space does not explode in size. + +### Compute + +Vortex provides the ability for each encoding to override the implementation of a compute function to avoid +decompressing where possible. For example, filtering a dictionary-encoded UTF8 array can be more cheaply performed by +filtering the dictionary first. + +Note that Vortex does not intend to become a full-fledged compute engine, but rather to provide the ability to +implement basic compute operations as may be required for efficient scanning & operation pushdown. + +### Statistics + +Vortex arrays carry lazily-computed summary statistics. Unlike other array libraries, these statistics can be populated +from disk formats such as Parquet and preserved all the way into a compute engine. Statistics are available to compute +kernels as well as to the compressor. + +The current statistics are: + +* BitWidthFreq +* TrailingZeroFreq +* IsConstant +* IsSorted +* IsStrictSorted +* Max +* Min +* RunCount +* TrueCount +* NullCount + +### Serialization / Deserialization (Serde) + +TODO + +## Vs Apache Arrow + +It is important to note that Vortex and Arrow have different design goals. As such, it is somewhat +unfair to make any comparison at all. But given both can be used as array libraries, it is worth noting the differences. + +Vortex is designed to be maximally compatible with Apache Arrow. All Arrow arrays can be converted into Vortex arrays +with zero-copy. And a Vortex array constructed from an Arrow array can be converted back to Arrow, again with zero-copy. + +Vortex explicitly separates logical types from physical encodings, distinguishing it from Arrow. This allows +Vortex to model more complex arrays while still exposing a logical interface. For example, Vortex can model a UTF8 +`ChunkedArray` where the first chunk is run-length encoded and the second chunk is dictionary encoded. +In Arrow, `RunLengthArray` and `DictionaryArray` are separate incompatible types, and so cannot be combined in this way. + +## Contributing + +While we hope to turn Vortex into a community project, its current rapid rate of change makes taking contributions +without prior discussion infeasible. If you are interested in contributing, please open an issue to discuss your ideas. + +## License + +Licensed under the Apache License, Version 2.0 (the "License"). \ No newline at end of file diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index 99a8bfc2c5..602cb2b9ba 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -1,6 +1,6 @@ use arrow_array::RecordBatchReader; use itertools::Itertools; -use log::{info, warn}; +use log::info; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet::arrow::ProjectionMask; use std::collections::HashSet; @@ -11,11 +11,11 @@ use vortex::array::bool::BoolEncoding; use vortex::array::chunked::{ChunkedArray, ChunkedEncoding}; use vortex::array::constant::ConstantEncoding; +use vortex::array::composite::CompositeEncoding; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveEncoding; use vortex::array::sparse::SparseEncoding; use vortex::array::struct_::StructEncoding; -use vortex::array::typed::TypedEncoding; use vortex::array::varbin::VarBinEncoding; use vortex::array::varbinview::VarBinViewEncoding; use vortex::array::{Array, ArrayRef, Encoding}; @@ -34,11 +34,11 @@ pub fn enumerate_arrays() -> Vec<&'static dyn Encoding> { // Builtins &BoolEncoding, &ChunkedEncoding, + &CompositeEncoding, &ConstantEncoding, &PrimitiveEncoding, &SparseEncoding, &StructEncoding, - &TypedEncoding, &VarBinEncoding, &VarBinViewEncoding, // Encodings @@ -55,6 +55,15 @@ pub fn enumerate_arrays() -> Vec<&'static dyn Encoding> { ] } +pub fn compress_ctx() -> CompressCtx { + let cfg = CompressConfig::new( + HashSet::from_iter(enumerate_arrays().iter().map(|e| (*e).id())), + HashSet::default(), + ); + info!("Compression config {cfg:?}"); + CompressCtx::new(Arc::new(cfg)) +} + pub fn download_taxi_data() -> PathBuf { let download_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("data/yellow-tripdata-2023-11.parquet"); @@ -91,14 +100,7 @@ pub fn compress_taxi_data() -> ArrayRef { .build() .unwrap(); - // let array = ArrayRef::try_from((&mut reader) as &mut dyn RecordBatchReader).unwrap(); - let cfg = CompressConfig::new( - HashSet::from_iter(enumerate_arrays().iter().map(|e| (*e).id())), - HashSet::default(), - ); - info!("Compression config {cfg:?}"); - let ctx = CompressCtx::new(Arc::new(cfg)); - + let ctx = compress_ctx(); let schema = reader.schema(); let mut uncompressed_size = 0; let chunks = reader @@ -116,7 +118,7 @@ pub fn compress_taxi_data() -> ArrayRef { let dtype: DType = schema.clone().try_into().unwrap(); let compressed = ChunkedArray::new(chunks.clone(), dtype).boxed(); - warn!("Compressed array {}\n", display_tree(compressed.as_ref())); + info!("Compressed array {}", display_tree(compressed.as_ref())); let mut field_bytes = vec![0; schema.fields().len()]; for chunk in chunks { @@ -139,10 +141,18 @@ pub fn compress_taxi_data() -> ArrayRef { #[cfg(test)] mod test { + use arrow_array::{ArrayRef as ArrowArrayRef, StructArray as ArrowStructArray}; use log::LevelFilter; + use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use simplelog::{ColorChoice, Config, TermLogger, TerminalMode}; + use std::fs::File; + use std::ops::Deref; + use std::sync::Arc; + use vortex::array::ArrayRef; + use vortex::compute::as_arrow::as_arrow; + use vortex::encode::FromArrow; - use crate::compress_taxi_data; + use crate::{compress_ctx, compress_taxi_data, download_taxi_data}; #[allow(dead_code)] fn setup_logger(level: LevelFilter) { @@ -155,10 +165,44 @@ mod test { .unwrap(); } - // #[ignore] #[test] fn compression_ratio() { setup_logger(LevelFilter::Debug); _ = compress_taxi_data(); } + + #[ignore] + #[test] + fn round_trip_arrow() { + let file = File::open(download_taxi_data()).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let reader = builder.with_limit(1).build().unwrap(); + + for record_batch in reader.map(|batch_result| batch_result.unwrap()) { + let struct_arrow: ArrowStructArray = record_batch.into(); + let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); + let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); + let vortex_as_arrow = as_arrow(vortex_array.as_ref()).unwrap(); + assert_eq!(vortex_as_arrow.deref(), arrow_array.deref()); + } + } + + #[ignore] + #[test] + fn round_trip_arrow_compressed() { + let file = File::open(download_taxi_data()).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let reader = builder.with_limit(1).build().unwrap(); + + let ctx = compress_ctx(); + for record_batch in reader.map(|batch_result| batch_result.unwrap()) { + let struct_arrow: ArrowStructArray = record_batch.into(); + let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); + let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); + + let compressed = ctx.clone().compress(vortex_array.as_ref(), None).unwrap(); + let compressed_as_arrow = as_arrow(compressed.as_ref()).unwrap(); + assert_eq!(compressed_as_arrow.deref(), arrow_array.deref()); + } + } } diff --git a/deps/fastlanez b/deps/fastlanez index b4dfae3cc0..851765c29e 160000 --- a/deps/fastlanez +++ b/deps/fastlanez @@ -1 +1 @@ -Subproject commit b4dfae3cc006eec80139a02eabd996fcb79c6512 +Subproject commit 851765c29e45d26b4aaee29863ac56d9f31df80c diff --git a/fastlanez-sys/src/lib.rs b/fastlanez-sys/src/lib.rs index 948321bea3..2bcab75d06 100644 --- a/fastlanez-sys/src/lib.rs +++ b/fastlanez-sys/src/lib.rs @@ -8,7 +8,7 @@ use std::mem::{size_of, transmute, MaybeUninit}; -use arrayref::array_mut_ref; +use arrayref::{array_mut_ref, array_ref}; use seq_macro::seq; use uninit::prelude::VecCapacity; @@ -37,6 +37,11 @@ where input: &[Self; 1024], output: &'a mut [MaybeUninit; 128 * W], ) -> &'a [u8; 128 * W]; + + fn bitunpack<'a>( + input: &[u8; 128 * W], + output: &'a mut [MaybeUninit; 1024], + ) -> &'a [Self; 1024]; } #[derive(Debug)] @@ -62,6 +67,26 @@ where unsafe { output.set_len(output.len() + (width * 128)) } Ok(()) } + + fn try_bitunpack<'a>( + input: &[u8], + width: usize, + output: &'a mut [MaybeUninit; 1024], + ) -> Result<&'a [Self; 1024], UnsupportedBitWidth>; + + fn try_bitunpack_into( + input: &[u8], + width: usize, + output: &mut Vec, + ) -> Result<(), UnsupportedBitWidth> { + Self::try_bitunpack( + input, + width, + array_mut_ref![output.reserve_uninit(1024), 0, 1024], + )?; + unsafe { output.set_len(output.len() + 1024) } + Ok(()) + } } macro_rules! bitpack_impl { @@ -80,6 +105,18 @@ macro_rules! bitpack_impl { output_array } } + + #[inline] + fn bitunpack<'a>( + input: &[u8; 128 * N], + output: &'a mut [MaybeUninit; 1024], + ) -> &'a [Self; 1024] { + unsafe { + let output_array: &mut [Self; 1024] = std::mem::transmute(output); + []~N(input, output_array); + output_array + } + } } }); } @@ -97,6 +134,19 @@ macro_rules! bitpack_impl { } }) } + + fn try_bitunpack<'a>( + input: &[u8], + width: usize, + output: &'a mut [MaybeUninit; 1024], + ) -> Result<&'a [Self; 1024], UnsupportedBitWidth> { + seq!(N in 1..$W { + match width { + #(N => Ok(BitPack::::bitunpack(array_ref![input, 0, N * 128], output)),)* + _ => Err(UnsupportedBitWidth), + } + }) + } } }; } diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 72e9864db9..b0ae53c42a 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -19,7 +19,7 @@ name = "pyvortex" crate-type = ["rlib", "cdylib"] [dependencies] -arrow = { version = "50.0.0", features = ["ffi"] } +arrow = { version = "50.0.0", features = ["pyarrow"] } vortex-array = { path = "../vortex-array" } vortex-alp = { path = "../vortex-alp" } vortex-dict = { path = "../vortex-dict" } diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index e6db24a5a4..caf327eeac 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -4,11 +4,11 @@ use pyo3::prelude::*; use vortex::array::bool::BoolArray; use vortex::array::chunked::ChunkedArray; +use vortex::array::composite::CompositeArray; use vortex::array::constant::ConstantArray; use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; use vortex::array::struct_::StructArray; -use vortex::array::typed::TypedArray; use vortex::array::varbin::VarBinArray; use vortex::array::varbinview::VarBinViewArray; use vortex::array::{Array, ArrayKind, ArrayRef}; @@ -53,11 +53,11 @@ macro_rules! pyarray { pyarray!(BoolArray, "BoolArray"); pyarray!(ChunkedArray, "ChunkedArray"); +pyarray!(CompositeArray, "CompositeArray"); pyarray!(ConstantArray, "ConstantArray"); pyarray!(PrimitiveArray, "PrimitiveArray"); pyarray!(SparseArray, "SparseArray"); pyarray!(StructArray, "StructArray"); -pyarray!(TypedArray, "TypedArray"); pyarray!(VarBinArray, "VarBinArray"); pyarray!(VarBinViewArray, "VarBinViewArray"); @@ -82,6 +82,10 @@ impl PyArray { PyChunkedArray::wrap(py, inner.into_any().downcast::().unwrap())? .extract(py) } + ArrayKind::Composite(_) => { + PyCompositeArray::wrap(py, inner.into_any().downcast::().unwrap())? + .extract(py) + } ArrayKind::Constant(_) => { PyConstantArray::wrap(py, inner.into_any().downcast::().unwrap())? .extract(py) @@ -98,10 +102,6 @@ impl PyArray { PyStructArray::wrap(py, inner.into_any().downcast::().unwrap())? .extract(py) } - ArrayKind::Typed(_) => { - PyTypedArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } ArrayKind::VarBin(_) => { PyVarBinArray::wrap(py, inner.into_any().downcast::().unwrap())? .extract(py) diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs index 14f76243fc..bbe3b4a377 100644 --- a/pyvortex/src/dtype.rs +++ b/pyvortex/src/dtype.rs @@ -1,13 +1,10 @@ -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Field}; use arrow::pyarrow::FromPyArrow; use pyo3::types::PyType; use pyo3::{pyclass, pymethods, Py, PyAny, PyResult, Python}; -use vortex::arrow::convert::TryIntoDType; use vortex::dtype::DType; -use crate::error::PyVortexError; - #[pyclass(name = "DType", module = "vortex", subclass)] pub struct PyDType { inner: DType, @@ -35,12 +32,7 @@ impl PyDType { #[pyo3(from_py_with = "import_arrow_dtype")] arrow_dtype: DataType, nullable: bool, ) -> PyResult> { - PyDType::wrap( - cls.py(), - arrow_dtype - .try_into_dtype(nullable) - .map_err(PyVortexError::new)?, - ) + PyDType::wrap(cls.py(), (&Field::new("_", arrow_dtype, nullable)).into()) } } diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs index 2604cec8dd..53d8168b26 100644 --- a/pyvortex/src/encode.rs +++ b/pyvortex/src/encode.rs @@ -1,5 +1,5 @@ use arrow::array::{make_array, ArrayData}; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Field}; use arrow::ffi_stream::ArrowArrayStreamReader; use arrow::pyarrow::FromPyArrow; use arrow::record_batch::RecordBatchReader; @@ -8,8 +8,8 @@ use pyo3::prelude::*; use vortex::array::chunked::ChunkedArray; use vortex::array::{Array, ArrayRef}; -use vortex::arrow::convert::TryIntoDType; use vortex::dtype::DType; +use vortex::encode::FromArrow; use crate::array::PyArray; use crate::error::PyVortexError; @@ -26,7 +26,7 @@ pub fn encode(obj: &PyAny) -> PyResult> { if obj.is_instance(pa_array)? { let arrow_array = ArrayData::from_pyarrow(obj).map(make_array)?; - let enc_array: vortex::array::ArrayRef = arrow_array.into(); + let enc_array = ArrayRef::from_arrow(arrow_array, false); PyArray::wrap(obj.py(), enc_array) } else if obj.is_instance(chunked_array)? { let chunks: Vec<&PyAny> = obj.getattr("chunks")?.extract()?; @@ -35,15 +35,13 @@ pub fn encode(obj: &PyAny) -> PyResult> { .map(|a| { ArrayData::from_pyarrow(a) .map(make_array) - .map(ArrayRef::from) + .map(|a| ArrayRef::from_arrow(a, false)) }) .collect::>>()?; - let null_count: usize = obj.getattr("null_count")?.extract()?; let dtype: DType = obj .getattr("type") - .and_then(DataType::from_pyarrow)? - .try_into_dtype(null_count > 0) - .map_err(PyVortexError::map_err)?; + .and_then(DataType::from_pyarrow) + .map(|dt| (&Field::new("_", dt, false)).into())?; PyArray::wrap(obj.py(), ChunkedArray::new(encoded_chunks, dtype).boxed()) } else if obj.is_instance(table)? { let array_stream = ArrowArrayStreamReader::from_pyarrow(obj)?; diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index 062575586a..3c1a314ce7 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -37,13 +37,13 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/pyvortex/src/vortex_arrow.rs b/pyvortex/src/vortex_arrow.rs index 4f9fb44031..3cef6dcfad 100644 --- a/pyvortex/src/vortex_arrow.rs +++ b/pyvortex/src/vortex_arrow.rs @@ -1,5 +1,4 @@ use arrow::array::Array as ArrowArray; -use arrow::datatypes::DataType; use arrow::error::ArrowError; use arrow::pyarrow::ToPyArrow; use pyo3::exceptions::PyValueError; @@ -7,6 +6,7 @@ use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyList}; use vortex::array::Array; +use vortex::compute::as_arrow::as_arrow_chunks; pub fn map_arrow_err(error: ArrowError) -> PyErr { PyValueError::new_err(error.to_string()) @@ -15,21 +15,24 @@ pub fn map_arrow_err(error: ArrowError) -> PyErr { pub fn export_array<'py, T: AsRef>(py: Python<'py>, array: &T) -> PyResult<&'py PyAny> { // NOTE(ngates): for struct arrays, we could also return a RecordBatchStreamReader. // NOTE(robert): Return RecordBatchStreamReader always? + let chunks = as_arrow_chunks(array.as_ref()).unwrap(); + if chunks.is_empty() { + return Err(PyValueError::new_err("No chunks in array")); + } // Export the schema once - let data_type: DataType = array.as_ref().dtype().into(); + let data_type = chunks[0].data_type().clone(); let pa_data_type = data_type.to_pyarrow(py)?; - // Import pyarrow and its Array class - let mod_pyarrow = PyModule::import(py, "pyarrow")?; - // Iterate each chunk, export it to Arrow FFI, then import as a pyarrow array - let chunks: PyResult> = array - .as_ref() - .iter_arrow() + let chunks: PyResult> = chunks + .iter() .map(|arrow_array| arrow_array.into_data().to_pyarrow(py)) .collect(); + // Import pyarrow and its Array class + let mod_pyarrow = PyModule::import(py, "pyarrow")?; + // Combine into a chunked array mod_pyarrow.call_method( "chunked_array", diff --git a/pyvortex/test/test_array.py b/pyvortex/test/test_array.py index 6734586e24..5d383c9402 100644 --- a/pyvortex/test/test_array.py +++ b/pyvortex/test/test_array.py @@ -1,5 +1,4 @@ import pyarrow as pa -import pytest import vortex @@ -17,14 +16,6 @@ def test_varbin_array_round_trip(): assert arr.to_pyarrow().combine_chunks() == a -@pytest.mark.xfail(strict=True) -def test_varbin_array_doesnt_round_trip(): - a = pa.array(["a", "b", "c"], type=pa.large_utf8()) - arr = vortex.encode(a) - assert isinstance(arr, vortex.VarBinArray) - assert arr.to_pyarrow().combine_chunks() == a - - def test_empty_array(): a = pa.array([], type=pa.uint8()) primitive = vortex.encode(a) diff --git a/pyvortex/test/test_compress.py b/pyvortex/test/test_compress.py index 2c161e2334..2f89ef175d 100644 --- a/pyvortex/test/test_compress.py +++ b/pyvortex/test/test_compress.py @@ -1,5 +1,10 @@ +import os.path +from pathlib import Path + import numpy as np import pyarrow as pa +import pyarrow.parquet as pq +import pytest import vortex @@ -63,3 +68,12 @@ def test_table_encode(): assert encoded.to_pyarrow().combine_chunks() == pa.StructArray.from_arrays( [pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"])], names=["number", "string"] ) + + +@pytest.mark.xfail(reason="Not yet implemented") +def test_taxi(): + curdir = Path(os.path.dirname(__file__)).parent.parent + table = pq.read_table(curdir / "bench-vortex/data/yellow-tripdata-2023-11.parquet") + compressed = vortex.compress(vortex.encode(table[:100])) + decompressed = compressed.to_pyarrow() + assert not decompressed diff --git a/vortex-alp/src/array.rs b/vortex-alp/src/array.rs index 7ebfc5a001..e80703a7e9 100644 --- a/vortex-alp/src/array.rs +++ b/vortex-alp/src/array.rs @@ -2,7 +2,7 @@ use std::any::Any; use std::sync::{Arc, RwLock}; use crate::alp::Exponents; -use vortex::array::{Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; use vortex::dtype::{DType, IntWidth, Signedness}; use vortex::error::{VortexError, VortexResult}; @@ -104,10 +104,6 @@ impl Array for ALPArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new( self.encoded().slice(start, stop)?, @@ -127,8 +123,8 @@ impl Array for ALPArray { self.encoded().nbytes() + self.patches().map(|p| p.nbytes()).unwrap_or(0) } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index fe2881a800..e58336b268 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -107,7 +107,6 @@ mod tests { fn test_compress() { let array = PrimitiveArray::from(vec![1.234f32; 1025]); let encoded = alp_encode(&array).unwrap(); - println!("Encoded {:?}", encoded); assert!(encoded.patches().is_none()); assert_eq!( encoded.encoded().as_primitive().typed_data::(), diff --git a/vortex-alp/src/compute.rs b/vortex-alp/src/compute.rs index 86426ab162..68867d26ca 100644 --- a/vortex-alp/src/compute.rs +++ b/vortex-alp/src/compute.rs @@ -6,7 +6,7 @@ use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::dtype::{DType, FloatWidth}; use vortex::error::{VortexError, VortexResult}; -use vortex::scalar::{NullableScalar, Scalar, ScalarRef}; +use vortex::scalar::Scalar; impl ArrayCompute for ALPArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -15,30 +15,24 @@ impl ArrayCompute for ALPArray { } impl ScalarAtFn for ALPArray { - fn scalar_at(&self, index: usize) -> VortexResult { - if let Some(patch) = self - .patches() - .and_then(|p| scalar_at(p, index).ok()) - .and_then(|p| p.into_nonnull()) - { + fn scalar_at(&self, index: usize) -> VortexResult { + if let Some(patch) = self.patches().and_then(|p| scalar_at(p, index).ok()) { return Ok(patch); } - let Some(encoded_val) = scalar_at(self.encoded(), index)?.into_nonnull() else { - return Ok(NullableScalar::none(self.dtype().clone()).boxed()); - }; + let encoded_val = scalar_at(self.encoded(), index)?; match self.dtype() { DType::Float(FloatWidth::_32, _) => { let encoded_val: i32 = encoded_val.try_into().unwrap(); - Ok(ScalarRef::from(::decode_single( + Ok(Scalar::from(::decode_single( encoded_val, self.exponents(), ))) } DType::Float(FloatWidth::_64, _) => { let encoded_val: i64 = encoded_val.try_into().unwrap(); - Ok(ScalarRef::from(::decode_single( + Ok(Scalar::from(::decode_single( encoded_val, self.exponents(), ))) diff --git a/vortex-alp/src/serde.rs b/vortex-alp/src/serde.rs index a18d82de84..5d2836bb44 100644 --- a/vortex-alp/src/serde.rs +++ b/vortex-alp/src/serde.rs @@ -1,16 +1,14 @@ -use std::io; -use std::io::ErrorKind; - use crate::alp::Exponents; use vortex::array::{Array, ArrayRef}; use vortex::dtype::{DType, FloatWidth, Signedness}; +use vortex::error::{VortexError, VortexResult}; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::ALPArray; use crate::ALPEncoding; impl ArraySerde for ALPArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_option_tag(self.patches().is_some())?; if let Some(p) = self.patches() { ctx.write(p.as_ref())?; @@ -21,7 +19,7 @@ impl ArraySerde for ALPArray { } impl EncodingSerde for ALPEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let patches_tag = ctx.read_nbytes::<1>()?[0]; let patches = if patches_tag == 0x01 { Some(ctx.read()?) @@ -33,9 +31,9 @@ impl EncodingSerde for ALPEncoding { DType::Float(width, nullability) => match width { FloatWidth::_32 => DType::Int(32.into(), Signedness::Signed, *nullability), FloatWidth::_64 => DType::Int(64.into(), Signedness::Signed, *nullability), - _ => return Err(io::Error::new(ErrorKind::InvalidData, "invalid dtype")), + _ => return Err(VortexError::InvalidDType(ctx.schema().clone())), }, - _ => return Err(io::Error::new(ErrorKind::InvalidData, "invalid dtype")), + _ => return Err(VortexError::InvalidDType(ctx.schema().clone())), }; let encoded = ctx.with_schema(&encoded_dtype).read()?; Ok(ALPArray::new( @@ -52,17 +50,16 @@ impl EncodingSerde for ALPEncoding { #[cfg(test)] mod test { - use std::io; - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::{Array, ArrayRef}; + use vortex::error::VortexResult; use vortex::serde::{ReadCtx, WriteCtx}; use crate::compress::alp_encode; use crate::downcast::DowncastALP; - fn roundtrip_array(array: &dyn Array) -> io::Result { + fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 65b9537a5a..8d6a412874 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -20,7 +20,9 @@ workspace = true [dependencies] allocator-api2 = "0.2.16" -arrow = { version = "50.0.0", features = ["pyarrow"] } +arrow-array = { version = "50.0.0" } +arrow-buffer = { version = "50.0.0" } +arrow-schema = { version = "50.0.0" } dyn-clone = "1.0.16" half = "2.3.1" humansize = "2.1.3" @@ -31,9 +33,6 @@ log = "0.4.20" num-traits = "0.2.18" num_enum = "0.7.2" once_cell = "1.19.0" -polars-arrow = { version = "0.37.0", features = ["arrow_rs"] } -polars-core = "0.37.0" -polars-ops = { version = "0.37.0", features = ["search_sorted"] } rand = { version = "0.8.5", features = [] } rayon = "1.8.1" roaring = "0.10.3" diff --git a/vortex-array/src/accessor.rs b/vortex-array/src/accessor.rs new file mode 100644 index 0000000000..ec36393bce --- /dev/null +++ b/vortex-array/src/accessor.rs @@ -0,0 +1,5 @@ +use crate::array::Array; + +pub trait ArrayAccessor: Array { + fn value(&self, index: usize) -> Option; +} diff --git a/vortex-array/src/array/bool/compute.rs b/vortex-array/src/array/bool/compute.rs index 44dc7417c1..953284f704 100644 --- a/vortex-array/src/array/bool/compute.rs +++ b/vortex-array/src/array/bool/compute.rs @@ -1,22 +1,23 @@ +use arrow_buffer::buffer::BooleanBuffer; +use itertools::Itertools; + use crate::array::bool::BoolArray; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::{Array, ArrayRef, CloneOptionalArray}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::cast::{cast_bool, CastBoolFn}; use crate::compute::fill::FillForwardFn; +use crate::compute::flatten::{flatten_bool, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::error::VortexResult; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; -use arrow::buffer::BooleanBuffer; -use itertools::Itertools; +use crate::scalar::{BoolScalar, Scalar}; impl ArrayCompute for BoolArray { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } - fn cast_bool(&self) -> Option<&dyn CastBoolFn> { + fn flatten(&self) -> Option<&dyn FlattenFn> { Some(self) } @@ -61,18 +62,18 @@ impl AsContiguousFn for BoolArray { } } -impl CastBoolFn for BoolArray { - fn cast_bool(&self) -> VortexResult { - Ok(self.clone()) +impl FlattenFn for BoolArray { + fn flatten(&self) -> VortexResult { + Ok(FlattenedArray::Bool(self.clone())) } } impl ScalarAtFn for BoolArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { Ok(self.buffer.value(index).into()) } else { - Ok(NullableScalar::none(self.dtype().clone()).boxed()) + Ok(BoolScalar::new(None).into()) } } } @@ -82,7 +83,7 @@ impl FillForwardFn for BoolArray { if self.validity().is_none() { Ok(dyn_clone::clone_box(self)) } else { - let validity = cast_bool(self.validity().unwrap())?; + let validity = flatten_bool(self.validity().unwrap())?; let bools = self.buffer(); let mut last_value = false; let filled = bools diff --git a/vortex-array/src/array/bool/flatten.rs b/vortex-array/src/array/bool/flatten.rs new file mode 100644 index 0000000000..6a723882f9 --- /dev/null +++ b/vortex-array/src/array/bool/flatten.rs @@ -0,0 +1,21 @@ +use crate::array::bool::BoolArray; +use crate::compute::as_arrow::AsArrowArray; +use crate::compute::flatten::flatten_bool; +use crate::error::VortexResult; +use arrow_array::{ArrayRef as ArrowArrayRef, BooleanArray as ArrowBoolArray}; +use arrow_buffer::NullBuffer; +use std::sync::Arc; + +impl AsArrowArray for BoolArray { + fn as_arrow(&self) -> VortexResult { + let validity = self + .validity() + .map(flatten_bool) + .transpose()? + .map(|b| NullBuffer::new(b.buffer)); + Ok(Arc::new(ArrowBoolArray::new( + self.buffer().clone(), + validity, + ))) + } +} diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs index 33fad93f54..43c33a72ac 100644 --- a/vortex-array/src/array/bool/mod.rs +++ b/vortex-array/src/array/bool/mod.rs @@ -1,12 +1,9 @@ use std::any::Any; -use std::iter; use std::sync::{Arc, RwLock}; -use arrow::array::{ArrayRef as ArrowArrayRef, AsArray, BooleanArray}; -use arrow::buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::buffer::BooleanBuffer; use linkme::distributed_slice; -use crate::arrow::CombineChunks; use crate::compute::scalar_at::scalar_at; use crate::dtype::{DType, Nullability}; use crate::error::VortexResult; @@ -15,11 +12,12 @@ use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stat, Stats, StatsSet}; use super::{ - check_slice_bounds, check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding, - EncodingId, EncodingRef, ENCODINGS, + check_slice_bounds, check_validity_buffer, Array, ArrayRef, Encoding, EncodingId, EncodingRef, + ENCODINGS, }; mod compute; +mod flatten; mod serde; mod stats; @@ -53,6 +51,13 @@ impl BoolArray { .unwrap_or(true) } + pub fn null(n: usize) -> Self { + BoolArray::new( + BooleanBuffer::from(vec![false; n]), + Some(BoolArray::from(vec![false; n]).boxed()), + ) + } + #[inline] pub fn buffer(&self) -> &BooleanBuffer { &self.buffer @@ -104,21 +109,6 @@ impl Array for BoolArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - Box::new(iter::once(Arc::new(BooleanArray::new( - self.buffer.clone(), - self.validity().map(|v| { - NullBuffer::new( - v.iter_arrow() - .combine_chunks() - .as_boolean() - .values() - .clone(), - ) - }), - )) as ArrowArrayRef)) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -144,8 +134,8 @@ impl Array for BoolArray { (self.len() + 7) / 8 } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-array/src/array/bool/serde.rs b/vortex-array/src/array/bool/serde.rs index 7bf1262db9..4a6370db0e 100644 --- a/vortex-array/src/array/bool/serde.rs +++ b/vortex-array/src/array/bool/serde.rs @@ -1,13 +1,12 @@ -use std::io; - -use arrow::buffer::BooleanBuffer; +use arrow_buffer::buffer::BooleanBuffer; use crate::array::bool::{BoolArray, BoolEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for BoolArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { if let Some(v) = self.validity() { ctx.write(v.as_ref())?; } @@ -16,7 +15,7 @@ impl ArraySerde for BoolArray { } impl EncodingSerde for BoolEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let validity = if ctx.schema().is_nullable() { Some(ctx.validity().read()?) } else { diff --git a/vortex-array/src/array/chunked/compute.rs b/vortex-array/src/array/chunked/compute.rs index 2caa7d543d..6e15e31a2a 100644 --- a/vortex-array/src/array/chunked/compute.rs +++ b/vortex-array/src/array/chunked/compute.rs @@ -1,18 +1,24 @@ +use itertools::Itertools; + use crate::array::chunked::ChunkedArray; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::ArrayRef; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; +use crate::compute::flatten::{FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::ArrayCompute; use crate::error::VortexResult; -use crate::scalar::ScalarRef; -use itertools::Itertools; +use crate::scalar::Scalar; impl ArrayCompute for ChunkedArray { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -30,8 +36,14 @@ impl AsContiguousFn for ChunkedArray { } } +impl FlattenFn for ChunkedArray { + fn flatten(&self) -> VortexResult { + Ok(FlattenedArray::Chunked(self.clone())) + } +} + impl ScalarAtFn for ChunkedArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { let (chunk_index, chunk_offset) = self.find_physical_location(index); scalar_at(self.chunks[chunk_index].as_ref(), chunk_offset) } diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index 500d12e08f..4d40a1aa7b 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -1,14 +1,11 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use std::vec::IntoIter; -use arrow::array::ArrayRef as ArrowArrayRef; use itertools::Itertools; use linkme::distributed_slice; use crate::array::{ - check_slice_bounds, Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, - ENCODINGS, + check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, }; use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; @@ -124,10 +121,6 @@ impl Array for ChunkedArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - Box::new(ChunkedArrowIterator::new(self)) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -167,8 +160,8 @@ impl Array for ChunkedArray { self.chunks().iter().map(|arr| arr.nbytes()).sum() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -218,49 +211,14 @@ impl Encoding for ChunkedEncoding { } } -struct ChunkedArrowIterator { - chunks_iter: IntoIter, - arrow_iter: Option>, -} - -impl ChunkedArrowIterator { - fn new(array: &ChunkedArray) -> Self { - let mut chunks_iter = array.chunks.clone().into_iter(); - let arrow_iter = chunks_iter.next().map(|c| c.iter_arrow()); - Self { - chunks_iter, - arrow_iter, - } - } -} - -impl Iterator for ChunkedArrowIterator { - type Item = ArrowArrayRef; - - fn next(&mut self) -> Option { - self.arrow_iter - .as_mut() - .and_then(|iter| iter.next()) - .or_else(|| { - self.chunks_iter.next().and_then(|next_chunk| { - self.arrow_iter = Some(next_chunk.iter_arrow()); - self.next() - }) - }) - } -} - #[cfg(test)] mod test { - use arrow::array::cast::AsArray; - use arrow::array::types::UInt64Type; - use arrow::array::ArrayRef as ArrowArrayRef; - use arrow::array::ArrowPrimitiveType; - use itertools::Itertools; + use crate::array::{Array, ArrayRef}; use crate::array::chunked::ChunkedArray; - use crate::array::Array; + use crate::compute::flatten::{flatten, flatten_primitive, FlattenedArray}; use crate::dtype::{DType, IntWidth, Nullability, Signedness}; + use crate::ptype::NativePType; fn chunked_array() -> ChunkedArray { ChunkedArray::new( @@ -277,74 +235,41 @@ mod test { ) } - fn assert_equal_slices(arr: ArrowArrayRef, slice: &[T::Native]) { - assert_eq!(*arr.as_primitive::().values(), slice); - } - - #[test] - pub fn iter() { - let chunked = ChunkedArray::new( - vec![vec![1u64, 2, 3].into(), vec![4u64, 5, 6].into()], - DType::Int( - IntWidth::_64, - Signedness::Unsigned, - Nullability::NonNullable, - ), - ); - + fn assert_equal_slices(arr: ArrayRef, slice: &[T]) { + let FlattenedArray::Chunked(chunked) = flatten(arr.as_ref()).unwrap() else { + unreachable!() + }; + let mut values = Vec::with_capacity(arr.len()); chunked - .iter_arrow() - .zip_eq([[1u64, 2, 3], [4, 5, 6]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + .chunks() + .iter() + .map(|a| flatten_primitive(a.as_ref()).unwrap()) + .for_each(|a| values.extend_from_slice(a.typed_data::())); + assert_eq!(values, slice); } #[test] pub fn slice_middle() { - chunked_array() - .slice(2, 5) - .unwrap() - .iter_arrow() - .zip_eq([vec![3u64], vec![4, 5]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + assert_equal_slices(chunked_array().slice(2, 5).unwrap(), &[3u64, 4, 5]) } #[test] pub fn slice_begin() { - chunked_array() - .slice(1, 3) - .unwrap() - .iter_arrow() - .zip_eq([[2u64, 3]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + assert_equal_slices(chunked_array().slice(1, 3).unwrap(), &[2u64, 3]); } #[test] pub fn slice_aligned() { - chunked_array() - .slice(3, 6) - .unwrap() - .iter_arrow() - .zip_eq([[4u64, 5, 6]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + assert_equal_slices(chunked_array().slice(3, 6).unwrap(), &[4u64, 5, 6]); } #[test] pub fn slice_many_aligned() { - chunked_array() - .slice(0, 6) - .unwrap() - .iter_arrow() - .zip_eq([[1u64, 2, 3], [4, 5, 6]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + assert_equal_slices(chunked_array().slice(0, 6).unwrap(), &[1u64, 2, 3, 4, 5, 6]); } #[test] pub fn slice_end() { - chunked_array() - .slice(7, 8) - .unwrap() - .iter_arrow() - .zip_eq([[8u64]]) - .for_each(|(arr, slice)| assert_equal_slices::(arr, &slice)); + assert_equal_slices(chunked_array().slice(7, 8).unwrap(), &[8u64]); } } diff --git a/vortex-array/src/array/chunked/serde.rs b/vortex-array/src/array/chunked/serde.rs index 34b95a8b05..ad1bad2ccd 100644 --- a/vortex-array/src/array/chunked/serde.rs +++ b/vortex-array/src/array/chunked/serde.rs @@ -1,11 +1,10 @@ -use std::io; - use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for ChunkedArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.chunks().len())?; for c in self.chunks() { ctx.write(c.as_ref())?; @@ -15,7 +14,7 @@ impl ArraySerde for ChunkedArray { } impl EncodingSerde for ChunkedEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let chunk_len = ctx.read_usize()?; let mut chunks = Vec::::with_capacity(chunk_len); // TODO(robert): Use read_vectored diff --git a/vortex-array/src/array/composite/as_arrow.rs b/vortex-array/src/array/composite/as_arrow.rs new file mode 100644 index 0000000000..825c73bbe3 --- /dev/null +++ b/vortex-array/src/array/composite/as_arrow.rs @@ -0,0 +1,96 @@ +use crate::array::composite::CompositeArray; +use crate::array::Array; +use crate::arrow::wrappers::as_nulls; +use crate::composite_dtypes::{TimeUnit, TimeUnitSerializer}; +use crate::compute::as_arrow::AsArrowArray; +use crate::compute::cast::cast; +use crate::compute::flatten::{flatten_bool, flatten_primitive, flatten_struct}; +use crate::compute::scalar_at::scalar_at; +use crate::dtype::DType; +use crate::error::{VortexError, VortexResult}; +use crate::ptype::PType; +use crate::stats::Stat; +use arrow_array::{ + ArrayRef as ArrowArrayRef, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow_buffer::NullBuffer; +use std::sync::Arc; + +impl AsArrowArray for CompositeArray { + fn as_arrow(&self) -> VortexResult { + // Decide based on the DType if we know how to do this or not... + match self.dtype() { + DType::Composite(id, _dtype, metadata) => match id.as_str() { + "zoneddatetime" => hacky_zoneddatetime_as_arrow(self.underlying(), metadata), + &_ => Err(VortexError::InvalidArgument( + format!("Cannot convert composite DType {} to arrow", id).into(), + )), + }, + _ => Err(VortexError::InvalidArgument( + format!("Cannot convert {} into Arrow array", self.dtype().clone()).into(), + )), + } + } +} + +fn hacky_zoneddatetime_as_arrow(array: &dyn Array, metadata: &[u8]) -> VortexResult { + // A ZonedDateTime is currently just a primitive that ignores the timezone... + let array = flatten_primitive(cast(array, &PType::I64.into())?.as_ref())?; + + let values = array.scalar_buffer::(); + let validity = as_nulls(array.validity())?; + + let time_unit = TimeUnitSerializer::deserialize(metadata); + Ok(match time_unit { + TimeUnit::Ns => Arc::new(TimestampNanosecondArray::new(values, validity)), + TimeUnit::Us => Arc::new(TimestampMicrosecondArray::new(values, validity)), + TimeUnit::Ms => Arc::new(TimestampMillisecondArray::new(values, validity)), + TimeUnit::S => Arc::new(TimestampSecondArray::new(values, validity)), + }) +} + +// FIXME(ngates): this is what ZonedDateTime should look like, but it's not implemented yet. +#[allow(dead_code)] +fn zoneddatetime_as_arrow(array: &dyn Array, metadata: &[u8]) -> VortexResult { + // A ZonedDateTime is a composite of {instant, timezone}. + // TODO(ngates): make this actually a composite of Instant, instead of directly a primitive. + let array = flatten_struct(array)?; + assert_eq!(array.names()[0].as_str(), "instant"); + assert_eq!(array.names()[1].as_str(), "timezone"); + + // Map the instant into an i64 primitive + let instant = array.fields().first().unwrap(); + let instant = flatten_primitive(cast(instant.as_ref(), &PType::I64.into())?.as_ref())?; + + // Extract the values and validity buffer + let values = instant.scalar_buffer::(); + let validity = instant + .validity() + .map(flatten_bool) + .transpose()? + .map(|b| NullBuffer::new(b.buffer().clone())); + + // Unwrap the constant timezone + let timezone = array.fields().get(1).unwrap(); + if !timezone + .stats() + .get_or_compute_as::(&Stat::IsConstant) + .unwrap_or(false) + { + return Err(VortexError::InvalidArgument( + "Timezone must be constant to convert into Arrow".into(), + )); + } + let _timezone = scalar_at(timezone.as_ref(), 0)?; + + // Extract the instant unit + let time_unit = TimeUnitSerializer::deserialize(metadata); + + Ok(match time_unit { + TimeUnit::Ns => Arc::new(TimestampNanosecondArray::new(values, validity)), + TimeUnit::Us => Arc::new(TimestampMicrosecondArray::new(values, validity)), + TimeUnit::Ms => Arc::new(TimestampMillisecondArray::new(values, validity)), + TimeUnit::S => Arc::new(TimestampSecondArray::new(values, validity)), + }) +} diff --git a/vortex-array/src/array/typed/compress.rs b/vortex-array/src/array/composite/compress.rs similarity index 59% rename from vortex-array/src/array/typed/compress.rs rename to vortex-array/src/array/composite/compress.rs index 917c7ea00c..e9b60e0df0 100644 --- a/vortex-array/src/array/typed/compress.rs +++ b/vortex-array/src/array/composite/compress.rs @@ -1,10 +1,9 @@ +use crate::array::composite::{CompositeArray, CompositeEncoding}; use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::typed::{TypedArray, TypedEncoding}; use crate::array::{Array, ArrayRef}; use crate::compress::{CompressConfig, CompressCtx, EncodingCompression}; use crate::error::VortexResult; - -impl EncodingCompression for TypedEncoding { +impl EncodingCompression for CompositeEncoding { fn cost(&self) -> u8 { 0 } @@ -23,15 +22,16 @@ impl EncodingCompression for TypedEncoding { like: Option<&dyn Array>, ctx: CompressCtx, ) -> VortexResult { - let typed_array = array.as_typed(); - let typed_like = like.map(|like_array| like_array.as_typed()); + let composite_array = array.as_composite(); + let composite_like = like.map(|like_array| like_array.as_composite()); - Ok(TypedArray::new( + Ok(CompositeArray::new( + composite_array.id(), + composite_array.metadata().clone(), ctx.compress( - typed_array.untyped_array(), - typed_like.map(|typed_arr| typed_arr.untyped_array()), + composite_array.underlying(), + composite_like.map(|c| c.underlying()), )?, - array.dtype().clone(), ) .boxed()) } diff --git a/vortex-array/src/array/composite/compute.rs b/vortex-array/src/array/composite/compute.rs new file mode 100644 index 0000000000..19a4523401 --- /dev/null +++ b/vortex-array/src/array/composite/compute.rs @@ -0,0 +1,59 @@ +use itertools::Itertools; + +use crate::array::composite::CompositeArray; +use crate::array::downcast::DowncastArrayBuiltin; +use crate::array::{Array, ArrayRef}; +use crate::compute::as_arrow::AsArrowArray; +use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; +use crate::compute::flatten::{FlattenFn, FlattenedArray}; +use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; +use crate::compute::ArrayCompute; +use crate::error::VortexResult; +use crate::scalar::Scalar; + +impl ArrayCompute for CompositeArray { + fn as_arrow(&self) -> Option<&dyn AsArrowArray> { + Some(self) + } + + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { + Some(self) + } + + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } +} + +impl AsContiguousFn for CompositeArray { + fn as_contiguous(&self, arrays: Vec) -> VortexResult { + Ok(CompositeArray::new( + self.id(), + self.metadata().clone(), + as_contiguous( + arrays + .into_iter() + .map(|array| dyn_clone::clone_box(array.as_composite().underlying())) + .collect_vec(), + )?, + ) + .boxed()) + } +} + +impl FlattenFn for CompositeArray { + fn flatten(&self) -> VortexResult { + Ok(FlattenedArray::Composite(self.clone())) + } +} + +impl ScalarAtFn for CompositeArray { + fn scalar_at(&self, index: usize) -> VortexResult { + let underlying = scalar_at(self.underlying(), index)?; + underlying.cast(self.dtype()) + } +} diff --git a/vortex-array/src/array/composite/mod.rs b/vortex-array/src/array/composite/mod.rs new file mode 100644 index 0000000000..327b0668f9 --- /dev/null +++ b/vortex-array/src/array/composite/mod.rs @@ -0,0 +1,187 @@ +use std::any::Any; +use std::sync::{Arc, RwLock}; + +use linkme::distributed_slice; + +use crate::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS}; +use crate::compress::EncodingCompression; +use crate::dtype::{DType, Metadata}; +use crate::error::VortexResult; +use crate::formatter::{ArrayDisplay, ArrayFormatter}; +use crate::serde::{ArraySerde, EncodingSerde}; +use crate::stats::{Stats, StatsCompute, StatsSet}; + +mod as_arrow; +mod compress; +mod compute; +mod serde; + +#[derive(Debug, Clone)] +pub struct CompositeArray { + underlying: ArrayRef, + dtype: DType, + stats: Arc>, +} + +impl CompositeArray { + pub fn new(id: Arc, metadata: Metadata, underlying: ArrayRef) -> Self { + let dtype = DType::Composite(id, Box::new(underlying.dtype().clone()), metadata); + Self { + underlying, + dtype, + stats: Arc::new(RwLock::new(StatsSet::new())), + } + } + + pub fn id(&self) -> Arc { + let DType::Composite(id, _, _) = &self.dtype else { + unreachable!() + }; + id.clone() + } + + pub fn metadata(&self) -> &Metadata { + let DType::Composite(_, _, metadata) = &self.dtype else { + unreachable!() + }; + metadata + } + + #[inline] + pub fn underlying(&self) -> &dyn Array { + self.underlying.as_ref() + } +} + +impl Array for CompositeArray { + #[inline] + fn as_any(&self) -> &dyn Any { + self + } + + #[inline] + fn boxed(self) -> ArrayRef { + Box::new(self) + } + + #[inline] + fn into_any(self: Box) -> Box { + self + } + + #[inline] + fn len(&self) -> usize { + self.underlying.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.underlying.is_empty() + } + + #[inline] + fn dtype(&self) -> &DType { + &self.dtype + } + + #[inline] + fn stats(&self) -> Stats { + Stats::new(&self.stats, self) + } + + fn slice(&self, start: usize, stop: usize) -> VortexResult { + Ok(Self::new( + self.id().clone(), + self.metadata().clone(), + self.underlying.slice(start, stop)?, + ) + .boxed()) + } + + #[inline] + fn encoding(&self) -> EncodingRef { + &CompositeEncoding + } + + #[inline] + fn nbytes(&self) -> usize { + self.underlying.nbytes() + } + + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) + } +} + +impl StatsCompute for CompositeArray {} + +impl<'arr> AsRef<(dyn Array + 'arr)> for CompositeArray { + fn as_ref(&self) -> &(dyn Array + 'arr) { + self + } +} + +#[derive(Debug)] +pub struct CompositeEncoding; + +impl CompositeEncoding { + pub const ID: EncodingId = EncodingId::new("vortex.composite"); +} + +#[distributed_slice(ENCODINGS)] +static ENCODINGS_COMPOSITE: EncodingRef = &CompositeEncoding; + +impl Encoding for CompositeEncoding { + fn id(&self) -> &EncodingId { + &Self::ID + } + + fn compression(&self) -> Option<&dyn EncodingCompression> { + Some(self) + } + + fn serde(&self) -> Option<&dyn EncodingSerde> { + Some(self) + } +} + +impl ArrayDisplay for CompositeArray { + fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { + f.child("composite", self.underlying()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::composite_dtypes::{localtime, TimeUnit, TimeUnitSerializer}; + use crate::compute::scalar_at::scalar_at; + use crate::dtype::{IntWidth, Nullability}; + use crate::scalar::{CompositeScalar, PScalar, PrimitiveScalar}; + + #[test] + pub fn scalar() { + let dtype = localtime(TimeUnit::Us, IntWidth::_64, Nullability::NonNullable); + let arr = CompositeArray::new( + Arc::new("localtime".into()), + TimeUnitSerializer::serialize(TimeUnit::Us), + vec![64_799_000_000_i64, 43_000_000_000].into(), + ); + assert_eq!( + scalar_at(arr.as_ref(), 0).unwrap(), + CompositeScalar::new( + dtype.clone(), + Box::new(PrimitiveScalar::some(PScalar::I64(64_799_000_000)).into()), + ) + .into() + ); + assert_eq!( + scalar_at(arr.as_ref(), 1).unwrap(), + CompositeScalar::new( + dtype.clone(), + Box::new(PrimitiveScalar::some(PScalar::I64(43_000_000_000)).into()), + ) + .into() + ); + } +} diff --git a/vortex-array/src/array/composite/serde.rs b/vortex-array/src/array/composite/serde.rs new file mode 100644 index 0000000000..ea07bb369a --- /dev/null +++ b/vortex-array/src/array/composite/serde.rs @@ -0,0 +1,56 @@ +use crate::array::composite::{CompositeArray, CompositeEncoding}; +use crate::array::{Array, ArrayRef}; +use crate::dtype::DType; +use crate::error::VortexResult; +use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; + +impl ArraySerde for CompositeArray { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { + // TODO(ngates): just write the ID and metadata? + ctx.dtype(self.dtype())?; + ctx.write(self.underlying()) + } +} + +impl EncodingSerde for CompositeEncoding { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { + let DType::Composite(id, underlying, metadata) = ctx.dtype()? else { + panic!("Invalid DType") + }; + Ok(CompositeArray::new(id, metadata, ctx.with_schema(&underlying).read()?).boxed()) + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use crate::array::composite::CompositeArray; + use crate::array::downcast::DowncastArrayBuiltin; + use crate::array::Array; + use crate::dtype::Metadata; + use crate::serde::test::roundtrip_array; + + #[test] + fn roundtrip() { + let arr = CompositeArray::new( + Arc::new("test".into()), + Metadata::default(), + vec![7u8, 37, 71, 97].into(), + ); + + let read_arr = roundtrip_array(arr.as_ref()).unwrap(); + + assert_eq!( + arr.underlying().as_primitive().buffer().typed_data::(), + read_arr + .as_composite() + .underlying() + .as_primitive() + .buffer() + .typed_data::() + ); + + assert_eq!(arr.dtype(), read_arr.dtype()); + } +} diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs new file mode 100644 index 0000000000..4052fd22f5 --- /dev/null +++ b/vortex-array/src/array/constant/compute.rs @@ -0,0 +1,89 @@ +use crate::array::bool::BoolArray; +use crate::array::constant::ConstantArray; +use crate::array::downcast::DowncastArrayBuiltin; +use crate::array::primitive::PrimitiveArray; +use crate::array::{Array, ArrayRef}; +use crate::compute::as_contiguous::AsContiguousFn; +use crate::compute::flatten::{FlattenFn, FlattenedArray}; +use crate::compute::scalar_at::ScalarAtFn; +use crate::compute::take::TakeFn; +use crate::compute::ArrayCompute; +use crate::error::VortexResult; +use crate::match_each_native_ptype; +use crate::scalar::Scalar; +use itertools::Itertools; + +impl ArrayCompute for ConstantArray { + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { + Some(self) + } + + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } + + fn take(&self) -> Option<&dyn TakeFn> { + Some(self) + } +} + +impl AsContiguousFn for ConstantArray { + fn as_contiguous(&self, arrays: Vec) -> VortexResult { + let chunks = arrays.iter().map(|a| a.as_constant().clone()).collect_vec(); + if chunks.iter().map(|c| c.scalar()).all_equal() { + Ok(ConstantArray::new( + chunks.first().unwrap().scalar().clone(), + chunks.iter().map(|c| c.len()).sum(), + ) + .boxed()) + } else { + // TODO(ngates): we need to flatten the constant arrays and then concatenate them + Err("Cannot concatenate constant arrays with differing scalars".into()) + } + } +} + +impl FlattenFn for ConstantArray { + fn flatten(&self) -> VortexResult { + Ok(match self.scalar() { + Scalar::Bool(b) => { + if let Some(bv) = b.value() { + FlattenedArray::Bool(BoolArray::from(vec![bv; self.len()])) + } else { + FlattenedArray::Bool(BoolArray::null(self.len())) + } + } + Scalar::Primitive(p) => { + if let Some(ps) = p.value() { + match_each_native_ptype!(ps.ptype(), |$P| { + FlattenedArray::Primitive(PrimitiveArray::from_value::<$P>( + $P::try_from(self.scalar())?, + self.len(), + )) + }) + } else { + match_each_native_ptype!(p.ptype(), |$P| { + FlattenedArray::Primitive(PrimitiveArray::null::<$P>(self.len())) + }) + } + } + _ => panic!("Unsupported scalar type {}", self.dtype()), + }) + } +} + +impl ScalarAtFn for ConstantArray { + fn scalar_at(&self, _index: usize) -> VortexResult { + Ok(self.scalar().clone()) + } +} + +impl TakeFn for ConstantArray { + fn take(&self, indices: &dyn Array) -> VortexResult { + Ok(ConstantArray::new(self.scalar().clone(), indices.len()).boxed()) + } +} diff --git a/vortex-array/src/array/constant/compute/mod.rs b/vortex-array/src/array/constant/compute/mod.rs deleted file mode 100644 index 7cf87841f4..0000000000 --- a/vortex-array/src/array/constant/compute/mod.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::array::constant::ConstantArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; -use crate::error::VortexResult; -use crate::scalar::ScalarRef; - -impl ArrayCompute for ConstantArray { - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl ScalarAtFn for ConstantArray { - fn scalar_at(&self, _index: usize) -> VortexResult { - Ok(dyn_clone::clone_box(self.scalar())) - } -} - -impl TakeFn for ConstantArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - Ok(ConstantArray::new(dyn_clone::clone_box(self.scalar()), indices.len()).boxed()) - } -} diff --git a/vortex-array/src/array/constant/mod.rs b/vortex-array/src/array/constant/mod.rs index 093a93cd96..d3065f5531 100644 --- a/vortex-array/src/array/constant/mod.rs +++ b/vortex-array/src/array/constant/mod.rs @@ -1,20 +1,17 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use arrow::array::Datum; use linkme::distributed_slice; use crate::array::{ - check_slice_bounds, Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, - ENCODINGS, + check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, }; -use crate::arrow::compute::repeat; use crate::dtype::DType; use crate::error::VortexResult; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::scalar::{Scalar, ScalarRef}; +use crate::scalar::Scalar; use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsSet}; +use crate::stats::{Stat, Stats, StatsSet}; mod compute; mod serde; @@ -22,22 +19,32 @@ mod stats; #[derive(Debug, Clone)] pub struct ConstantArray { - scalar: ScalarRef, + scalar: Scalar, length: usize, stats: Arc>, } impl ConstantArray { - pub fn new(scalar: ScalarRef, length: usize) -> Self { + pub fn new(scalar: Scalar, length: usize) -> Self { + let stats = StatsSet::from( + [ + (Stat::Max, scalar.clone()), + (Stat::Min, scalar.clone()), + (Stat::IsConstant, true.into()), + (Stat::IsSorted, true.into()), + (Stat::RunCount, 1.into()), + ] + .into(), + ); Self { scalar, length, - stats: Arc::new(RwLock::new(StatsSet::new())), + stats: Arc::new(RwLock::new(stats)), } } - pub fn scalar(&self) -> &dyn Scalar { - self.scalar.as_ref() + pub fn scalar(&self) -> &Scalar { + &self.scalar } } @@ -77,11 +84,6 @@ impl Array for ConstantArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - let arrow_scalar: Box = self.scalar.as_ref().into(); - Box::new(std::iter::once(repeat(arrow_scalar.as_ref(), self.length))) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -98,8 +100,8 @@ impl Array for ConstantArray { self.scalar.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-array/src/array/constant/serde.rs b/vortex-array/src/array/constant/serde.rs index 47c7a46d32..54dd3cb14e 100644 --- a/vortex-array/src/array/constant/serde.rs +++ b/vortex-array/src/array/constant/serde.rs @@ -1,18 +1,17 @@ -use std::io; - use crate::array::constant::{ConstantArray, ConstantEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for ConstantArray { - fn write(&self, ctx: &mut WriteCtx<'_>) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx<'_>) -> VortexResult<()> { ctx.write_usize(self.len())?; ctx.scalar(self.scalar()) } } impl EncodingSerde for ConstantEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let len = ctx.read_usize()?; let scalar = ctx.scalar()?; Ok(ConstantArray::new(scalar, len).boxed()) @@ -24,12 +23,12 @@ mod test { use crate::array::constant::ConstantArray; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::Array; - use crate::scalar::NullableScalarOption; + use crate::scalar::{PScalar, PrimitiveScalar}; use crate::serde::test::roundtrip_array; #[test] fn roundtrip() { - let arr = ConstantArray::new(NullableScalarOption(Some(42)).into(), 100); + let arr = ConstantArray::new(PrimitiveScalar::some(PScalar::I32(42)).into(), 100); let read_arr = roundtrip_array(arr.as_ref()).unwrap(); assert_eq!(arr.scalar(), read_arr.as_constant().scalar()); diff --git a/vortex-array/src/array/constant/stats.rs b/vortex-array/src/array/constant/stats.rs index 14eeefad12..a4130ff801 100644 --- a/vortex-array/src/array/constant/stats.rs +++ b/vortex-array/src/array/constant/stats.rs @@ -1,38 +1,25 @@ -use std::collections::HashMap; - use crate::array::constant::ConstantArray; use crate::array::Array; -use crate::dtype::{DType, Nullability}; +use crate::dtype::DType; use crate::error::VortexResult; -use crate::scalar::{BoolScalar, PScalar, Scalar}; +use crate::scalar::Scalar; use crate::stats::{Stat, StatsCompute, StatsSet}; impl StatsCompute for ConstantArray { fn compute(&self, _stat: &Stat) -> VortexResult { - let mut m = HashMap::from([ - (Stat::Max, dyn_clone::clone_box(self.scalar())), - (Stat::Min, dyn_clone::clone_box(self.scalar())), - (Stat::IsConstant, true.into()), - (Stat::IsSorted, true.into()), - (Stat::RunCount, 1.into()), - ]); - - if matches!(self.dtype(), &DType::Bool(Nullability::NonNullable)) { - m.insert( - Stat::TrueCount, - PScalar::U64( - self.len() as u64 - * self - .scalar() - .as_any() - .downcast_ref::() - .unwrap() - .value() as u64, - ) - .boxed(), - ); + if matches!(self.dtype(), &DType::Bool(_)) { + let Scalar::Bool(b) = self.scalar() else { + unreachable!("Got bool dtype without bool scalar") + }; + return Ok(StatsSet::from( + [( + Stat::TrueCount, + (self.len() as u64 * b.value().map(|v| v as u64).unwrap_or(0)).into(), + )] + .into(), + )); } - Ok(StatsSet::from(m)) + Ok(StatsSet::default()) } } diff --git a/vortex-array/src/array/downcast.rs b/vortex-array/src/array/downcast.rs index d6c67f8d6e..93c58d16a8 100644 --- a/vortex-array/src/array/downcast.rs +++ b/vortex-array/src/array/downcast.rs @@ -1,10 +1,10 @@ use crate::array::bool::BoolArray; use crate::array::chunked::ChunkedArray; +use crate::array::composite::CompositeArray; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; use crate::array::struct_::StructArray; -use crate::array::typed::TypedArray; use crate::array::varbin::VarBinArray; use crate::array::varbinview::VarBinViewArray; use crate::array::{Array, ArrayRef}; @@ -38,10 +38,10 @@ pub trait DowncastArrayBuiltin: private::Sealed { self.maybe_varbinview().unwrap() } - fn maybe_typed(&self) -> Option<&TypedArray>; + fn maybe_composite(&self) -> Option<&CompositeArray>; - fn as_typed(&self) -> &TypedArray { - self.maybe_typed().unwrap() + fn as_composite(&self) -> &CompositeArray { + self.maybe_composite().unwrap() } fn maybe_struct(&self) -> Option<&StructArray>; @@ -88,7 +88,7 @@ impl DowncastArrayBuiltin for dyn Array { self.as_any().downcast_ref() } - fn maybe_typed(&self) -> Option<&TypedArray> { + fn maybe_composite(&self) -> Option<&CompositeArray> { self.as_any().downcast_ref() } @@ -128,7 +128,7 @@ impl DowncastArrayBuiltin for ArrayRef { self.as_any().downcast_ref() } - fn maybe_typed(&self) -> Option<&TypedArray> { + fn maybe_composite(&self) -> Option<&CompositeArray> { self.as_any().downcast_ref() } diff --git a/vortex-array/src/array/mod.rs b/vortex-array/src/array/mod.rs index dcd87c712a..83892653c0 100644 --- a/vortex-array/src/array/mod.rs +++ b/vortex-array/src/array/mod.rs @@ -1,17 +1,16 @@ use std::any::Any; use std::fmt::{Debug, Display, Formatter}; -use arrow::array::ArrayRef as ArrowArrayRef; use linkme::distributed_slice; use crate::array::bool::{BoolArray, BoolEncoding}; use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; +use crate::array::composite::{CompositeArray, CompositeEncoding}; use crate::array::constant::{ConstantArray, ConstantEncoding}; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding}; use crate::array::sparse::{SparseArray, SparseEncoding}; use crate::array::struct_::{StructArray, StructEncoding}; -use crate::array::typed::{TypedArray, TypedEncoding}; use crate::array::varbin::{VarBinArray, VarBinEncoding}; use crate::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; use crate::compress::EncodingCompression; @@ -24,16 +23,15 @@ use crate::stats::Stats; pub mod bool; pub mod chunked; +pub mod composite; pub mod constant; pub mod downcast; pub mod primitive; pub mod sparse; pub mod struct_; -pub mod typed; pub mod varbin; pub mod varbinview; -pub type ArrowIterator = dyn Iterator; pub type ArrayRef = Box; /// An Enc Array is the base object representing all arrays in enc. @@ -61,8 +59,6 @@ pub trait Array: fn dtype(&self) -> &DType; /// Get statistics for the array fn stats(&self) -> Stats; - /// Produce arrow batches from the encoding - fn iter_arrow(&self) -> Box; /// Limit array to start..stop range fn slice(&self, start: usize, stop: usize) -> VortexResult; /// Encoding kind of the array @@ -70,7 +66,9 @@ pub trait Array: /// Approximate size in bytes of the array. Only takes into account variable size portion of the array fn nbytes(&self) -> usize; - fn serde(&self) -> &dyn ArraySerde; + fn serde(&self) -> Option<&dyn ArraySerde> { + None + } } dyn_clone::clone_trait_object!(Array); @@ -177,11 +175,11 @@ pub static ENCODINGS: [EncodingRef] = [..]; pub enum ArrayKind<'a> { Bool(&'a BoolArray), Chunked(&'a ChunkedArray), + Composite(&'a CompositeArray), Constant(&'a ConstantArray), Primitive(&'a PrimitiveArray), Sparse(&'a SparseArray), Struct(&'a StructArray), - Typed(&'a TypedArray), VarBin(&'a VarBinArray), VarBinView(&'a VarBinViewArray), Other(&'a dyn Array), @@ -192,11 +190,11 @@ impl<'a> From<&'a dyn Array> for ArrayKind<'a> { match *value.encoding().id() { BoolEncoding::ID => ArrayKind::Bool(value.as_bool()), ChunkedEncoding::ID => ArrayKind::Chunked(value.as_chunked()), + CompositeEncoding::ID => ArrayKind::Composite(value.as_composite()), ConstantEncoding::ID => ArrayKind::Constant(value.as_constant()), PrimitiveEncoding::ID => ArrayKind::Primitive(value.as_primitive()), SparseEncoding::ID => ArrayKind::Sparse(value.as_sparse()), StructEncoding::ID => ArrayKind::Struct(value.as_struct()), - TypedEncoding::ID => ArrayKind::Typed(value.as_typed()), VarBinEncoding::ID => ArrayKind::VarBin(value.as_varbin()), VarBinViewEncoding::ID => ArrayKind::VarBinView(value.as_varbinview()), _ => ArrayKind::Other(value), diff --git a/vortex-array/src/array/primitive/compute/as_arrow.rs b/vortex-array/src/array/primitive/compute/as_arrow.rs new file mode 100644 index 0000000000..07bc063618 --- /dev/null +++ b/vortex-array/src/array/primitive/compute/as_arrow.rs @@ -0,0 +1,41 @@ +use std::sync::Arc; + +use arrow_array::{ + ArrayRef as ArrowArrayRef, ArrowPrimitiveType, PrimitiveArray as ArrowPrimitiveArray, +}; +use arrow_buffer::ScalarBuffer; + +use crate::array::primitive::PrimitiveArray; +use crate::array::Array; +use crate::arrow::wrappers::as_nulls; +use crate::compute::as_arrow::AsArrowArray; +use crate::error::VortexResult; +use crate::ptype::PType; + +impl AsArrowArray for PrimitiveArray { + fn as_arrow(&self) -> VortexResult { + use arrow_array::types::*; + Ok(match self.ptype() { + PType::U8 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U64 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I8 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I64 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F64 => Arc::new(as_arrow_array_primitive::(self)?), + }) + } +} + +fn as_arrow_array_primitive( + array: &PrimitiveArray, +) -> VortexResult> { + Ok(ArrowPrimitiveArray::new( + ScalarBuffer::::new(array.buffer().clone(), 0, array.len()), + as_nulls(array.validity())?, + )) +} diff --git a/vortex-array/src/array/primitive/compute/cast.rs b/vortex-array/src/array/primitive/compute/cast.rs index 2f7931e386..be7ca6c147 100644 --- a/vortex-array/src/array/primitive/compute/cast.rs +++ b/vortex-array/src/array/primitive/compute/cast.rs @@ -1,20 +1,24 @@ use crate::array::primitive::PrimitiveArray; use crate::array::CloneOptionalArray; -use crate::compute::cast::CastPrimitiveFn; +use crate::array::{Array, ArrayRef}; +use crate::compute::cast::CastFn; +use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; use crate::match_each_native_ptype; use crate::ptype::{NativePType, PType}; -impl CastPrimitiveFn for PrimitiveArray { - fn cast_primitive(&self, ptype: &PType) -> VortexResult { - if self.ptype() == ptype { - Ok(self.clone()) +impl CastFn for PrimitiveArray { + fn cast(&self, dtype: &DType) -> VortexResult { + // TODO(ngates): check validity + let ptype = PType::try_from(dtype)?; + if ptype == self.ptype { + Ok(self.clone().boxed()) } else { match_each_native_ptype!(ptype, |$T| { Ok(PrimitiveArray::from_nullable( cast::<$T>(self)?, self.validity().clone_optional(), - )) + ).boxed()) }) } } @@ -37,6 +41,7 @@ fn cast(array: &PrimitiveArray) -> VortexResult> { #[cfg(test)] mod test { + use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::compute; use crate::error::VortexError; @@ -45,24 +50,25 @@ mod test { #[test] fn cast_u32_u8() { let arr = PrimitiveArray::from(vec![0u32, 10, 200]); - let u8arr = compute::cast::cast_primitive(&arr, &PType::U8).unwrap(); - assert_eq!(u8arr.typed_data::(), vec![0u8, 10, 200]); + let u8arr = compute::cast::cast(&arr, &PType::U8.into()).unwrap(); + assert_eq!(u8arr.as_primitive().typed_data::(), vec![0u8, 10, 200]); } #[test] fn cast_u32_f32() { let arr = PrimitiveArray::from(vec![0u32, 10, 200]); - let u8arr = compute::cast::cast_primitive(&arr, &PType::F32).unwrap(); - assert_eq!(u8arr.typed_data::(), vec![0.0f32, 10., 200.]); + let u8arr = compute::cast::cast(&arr, &PType::F32.into()).unwrap(); + assert_eq!( + u8arr.as_primitive().typed_data::(), + vec![0.0f32, 10., 200.] + ); } #[test] fn cast_i32_u32() { let arr = PrimitiveArray::from(vec![-1i32]); assert_eq!( - compute::cast::cast_primitive(&arr, &PType::U32) - .err() - .unwrap(), + compute::cast::cast(&arr, &PType::U32.into()).err().unwrap(), VortexError::ComputeError("Failed to cast -1 to U32".into()) ) } diff --git a/vortex-array/src/array/primitive/compute/fill.rs b/vortex-array/src/array/primitive/compute/fill.rs index f2dd4c98d7..af0c03c0e3 100644 --- a/vortex-array/src/array/primitive/compute/fill.rs +++ b/vortex-array/src/array/primitive/compute/fill.rs @@ -2,8 +2,8 @@ use num_traits::Zero; use crate::array::primitive::PrimitiveArray; use crate::array::{Array, ArrayRef}; -use crate::compute::cast::cast_bool; use crate::compute::fill::FillForwardFn; +use crate::compute::flatten::flatten_bool; use crate::error::VortexResult; use crate::match_each_native_ptype; use crate::stats::Stat; @@ -21,7 +21,7 @@ impl FillForwardFn for PrimitiveArray { return Ok(PrimitiveArray::new(*self.ptype(), self.buffer().clone(), None).boxed()); } else { match_each_native_ptype!(self.ptype(), |$P| { - let validity = cast_bool(self.validity().unwrap())?; + let validity = flatten_bool(self.validity().unwrap())?; let typed_data = self.typed_data::<$P>(); let mut last_value = $P::zero(); let filled = typed_data diff --git a/vortex-array/src/array/primitive/compute/flatten.rs b/vortex-array/src/array/primitive/compute/flatten.rs new file mode 100644 index 0000000000..865c344e85 --- /dev/null +++ b/vortex-array/src/array/primitive/compute/flatten.rs @@ -0,0 +1,9 @@ +use crate::array::primitive::PrimitiveArray; +use crate::compute::flatten::{FlattenFn, FlattenedArray}; +use crate::error::VortexResult; + +impl FlattenFn for PrimitiveArray { + fn flatten(&self) -> VortexResult { + Ok(FlattenedArray::Primitive(self.clone())) + } +} diff --git a/vortex-array/src/array/primitive/compute/mod.rs b/vortex-array/src/array/primitive/compute/mod.rs index e364de4bd3..17f4936c17 100644 --- a/vortex-array/src/array/primitive/compute/mod.rs +++ b/vortex-array/src/array/primitive/compute/mod.rs @@ -1,23 +1,37 @@ use crate::array::primitive::PrimitiveArray; +use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::cast::CastPrimitiveFn; +use crate::compute::cast::CastFn; use crate::compute::fill::FillForwardFn; +use crate::compute::flatten::FlattenFn; use crate::compute::patch::PatchFn; use crate::compute::scalar_at::ScalarAtFn; +use crate::compute::search_sorted::SearchSortedFn; use crate::compute::ArrayCompute; +mod as_arrow; mod as_contiguous; mod cast; mod fill; +mod flatten; mod patch; mod scalar_at; +mod search_sorted; impl ArrayCompute for PrimitiveArray { + fn as_arrow(&self) -> Option<&dyn AsArrowArray> { + Some(self) + } + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } - fn cast_primitive(&self) -> Option<&dyn CastPrimitiveFn> { + fn cast(&self) -> Option<&dyn CastFn> { + Some(self) + } + + fn flatten(&self) -> Option<&dyn FlattenFn> { Some(self) } @@ -32,4 +46,8 @@ impl ArrayCompute for PrimitiveArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } + + fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { + Some(self) + } } diff --git a/vortex-array/src/array/primitive/compute/patch.rs b/vortex-array/src/array/primitive/compute/patch.rs index 67c16fb28a..428ef3588c 100644 --- a/vortex-array/src/array/primitive/compute/patch.rs +++ b/vortex-array/src/array/primitive/compute/patch.rs @@ -26,7 +26,10 @@ fn patch_with_sparse(array: &PrimitiveArray, patch: &SparseArray) -> VortexResul let patch_indices = patch.resolved_indices(); match_each_native_ptype!(array.ptype(), |$T| { let mut values = Vec::from(array.typed_data::<$T>()); - let patch_values = compute::cast::cast_primitive(patch.values(), array.ptype())?; + let patch_values = compute::flatten::flatten_primitive(patch.values())?; + if (array.ptype() != patch_values.ptype()) { + return Err(VortexError::InvalidDType(patch_values.dtype().clone())) + } for (idx, value) in patch_indices.iter().zip_eq(patch_values.typed_data::<$T>().iter()) { values[*idx] = *value; } diff --git a/vortex-array/src/array/primitive/compute/scalar_at.rs b/vortex-array/src/array/primitive/compute/scalar_at.rs index bf550ef10c..8e0e3d4c78 100644 --- a/vortex-array/src/array/primitive/compute/scalar_at.rs +++ b/vortex-array/src/array/primitive/compute/scalar_at.rs @@ -1,16 +1,15 @@ use crate::array::primitive::PrimitiveArray; -use crate::array::Array; use crate::compute::scalar_at::ScalarAtFn; use crate::error::VortexResult; use crate::match_each_native_ptype; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; +use crate::scalar::{PrimitiveScalar, Scalar}; impl ScalarAtFn for PrimitiveArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { Ok(match_each_native_ptype!(self.ptype, |$T| self.typed_data::<$T>()[index].into())) } else { - Ok(NullableScalar::none(self.dtype().clone()).boxed()) + Ok(PrimitiveScalar::none(self.ptype).into()) } } } diff --git a/vortex-array/src/array/primitive/compute/search_sorted.rs b/vortex-array/src/array/primitive/compute/search_sorted.rs new file mode 100644 index 0000000000..4e0993b883 --- /dev/null +++ b/vortex-array/src/array/primitive/compute/search_sorted.rs @@ -0,0 +1,57 @@ +use crate::array::primitive::PrimitiveArray; +use crate::compute::search_sorted::{SearchSortedFn, SearchSortedSide}; +use crate::error::VortexResult; +use crate::match_each_native_ptype; +use crate::ptype::NativePType; +use crate::scalar::Scalar; + +impl SearchSortedFn for PrimitiveArray { + fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult { + match_each_native_ptype!(self.ptype(), |$T| { + let pvalue: $T = value.try_into()?; + Ok(search_sorted(self.typed_data::<$T>(), pvalue, side)) + }) + } +} + +fn search_sorted(arr: &[T], target: T, side: SearchSortedSide) -> usize { + match side { + SearchSortedSide::Left => search_sorted_cmp(arr, target, |a, b| a < b), + SearchSortedSide::Right => search_sorted_cmp(arr, target, |a, b| a <= b), + } +} + +fn search_sorted_cmp(arr: &[T], target: T, cmp: Cmp) -> usize +where + Cmp: Fn(T, T) -> bool + 'static, +{ + let mut low = 0; + let mut high = arr.len(); + + while low < high { + let mid = low + (high - low) / 2; + + if cmp(arr[mid], target) { + low = mid + 1; + } else { + high = mid; + } + } + + low +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_searchsorted_primitive() { + let values = vec![1u16, 2, 3]; + + assert_eq!(search_sorted(&values, 0, SearchSortedSide::Left), 0); + assert_eq!(search_sorted(&values, 1, SearchSortedSide::Left), 0); + assert_eq!(search_sorted(&values, 1, SearchSortedSide::Right), 1); + assert_eq!(search_sorted(&values, 4, SearchSortedSide::Left), 3); + } +} diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index 4be48ed899..ae122cf40e 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -6,20 +6,21 @@ use std::panic::RefUnwindSafe; use std::ptr::NonNull; use std::sync::{Arc, RwLock}; +use crate::accessor::ArrayAccessor; use allocator_api2::alloc::Allocator; -use arrow::array::{make_array, ArrayData, AsArray}; -use arrow::buffer::{Buffer, NullBuffer, ScalarBuffer}; +use arrow_buffer::buffer::{Buffer, ScalarBuffer}; use linkme::distributed_slice; +use crate::array::bool::BoolArray; use crate::array::{ - check_slice_bounds, check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding, - EncodingId, EncodingRef, ENCODINGS, + check_slice_bounds, check_validity_buffer, Array, ArrayRef, Encoding, EncodingId, EncodingRef, + ENCODINGS, }; -use crate::arrow::CombineChunks; use crate::compute::scalar_at::scalar_at; use crate::dtype::DType; use crate::error::VortexResult; use crate::formatter::{ArrayDisplay, ArrayFormatter}; +use crate::iterator::ArrayIter; use crate::ptype::{match_each_native_ptype, NativePType, PType}; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; @@ -98,6 +99,17 @@ impl PrimitiveArray { .unwrap_or(true) } + pub fn from_value(value: T, n: usize) -> Self { + PrimitiveArray::from(iter::repeat(value).take(n).collect::>()) + } + + pub fn null(n: usize) -> Self { + PrimitiveArray::from_nullable( + iter::repeat(T::zero()).take(n).collect::>(), + Some(BoolArray::from(vec![false; n]).boxed()), + ) + } + #[inline] pub fn ptype(&self) -> &PType { &self.ptype @@ -161,25 +173,6 @@ impl Array for PrimitiveArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - Box::new(iter::once(make_array( - ArrayData::builder(self.dtype().into()) - .len(self.len()) - .nulls(self.validity().map(|v| { - NullBuffer::new( - v.iter_arrow() - .combine_chunks() - .as_boolean() - .values() - .clone(), - ) - })) - .add_buffer(self.buffer.clone()) - .build() - .unwrap(), - ))) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -210,8 +203,8 @@ impl Array for PrimitiveArray { self.buffer.len() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -221,6 +214,24 @@ impl<'arr> AsRef<(dyn Array + 'arr)> for PrimitiveArray { } } +impl ArrayAccessor for PrimitiveArray { + fn value(&self, index: usize) -> Option { + if self.is_valid(index) { + Some(self.typed_data::()[index]) + } else { + None + } + } +} + +impl PrimitiveArray { + pub fn iter(&self) -> ArrayIter { + ArrayIter::new(self.clone()) + } +} + +pub type PrimitiveIter<'a, T> = ArrayIter, T>; + #[derive(Debug)] pub struct PrimitiveEncoding; diff --git a/vortex-array/src/array/primitive/serde.rs b/vortex-array/src/array/primitive/serde.rs index af2c7d7ee8..f670fd78d1 100644 --- a/vortex-array/src/array/primitive/serde.rs +++ b/vortex-array/src/array/primitive/serde.rs @@ -3,11 +3,12 @@ use std::io::ErrorKind; use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::ptype::PType; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for PrimitiveArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { if let Some(v) = self.validity() { ctx.write(v.as_ref())?; } @@ -16,7 +17,7 @@ impl ArraySerde for PrimitiveArray { } impl EncodingSerde for PrimitiveEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let validity = if ctx.schema().is_nullable() { Some(ctx.validity().read()?) } else { diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs index 80ee311f83..d231eebef6 100644 --- a/vortex-array/src/array/primitive/stats.rs +++ b/vortex-array/src/array/primitive/stats.rs @@ -1,13 +1,14 @@ -use arrow::buffer::BooleanBuffer; use std::collections::HashMap; use std::mem::size_of; +use arrow_buffer::buffer::BooleanBuffer; + use crate::array::primitive::PrimitiveArray; -use crate::compute::cast::cast_bool; +use crate::compute::flatten::flatten_bool; use crate::error::VortexResult; use crate::match_each_native_ptype; use crate::ptype::NativePType; -use crate::scalar::{ListScalarVec, NullableScalar, PScalar, Scalar}; +use crate::scalar::{ListScalarVec, PScalar}; use crate::stats::{Stat, StatsCompute, StatsSet}; impl StatsCompute for PrimitiveArray { @@ -16,7 +17,7 @@ impl StatsCompute for PrimitiveArray { match self.validity() { None => self.typed_data::<$P>().compute(stat), Some(validity_array) => { - let validity = cast_bool(validity_array)?; + let validity = flatten_bool(validity_array)?; NullableValues(self.typed_data::<$P>(), validity.buffer()).compute(stat) } } @@ -54,8 +55,8 @@ impl<'a, T: NativePType> StatsCompute for NullableValues<'a, T> { if first_non_null.is_none() { return Ok(StatsSet::from(HashMap::from([ - (Stat::Min, NullableScalar::none(T::PTYPE.into()).boxed()), - (Stat::Max, NullableScalar::none(T::PTYPE.into()).boxed()), + (Stat::Min, Option::::None.into()), + (Stat::Max, Option::::None.into()), (Stat::IsConstant, true.into()), (Stat::IsSorted, true.into()), (Stat::IsStrictSorted, true.into()), @@ -65,6 +66,10 @@ impl<'a, T: NativePType> StatsCompute for NullableValues<'a, T> { Stat::BitWidthFreq, ListScalarVec(vec![0; size_of::() * 8 + 1]).into(), ), + ( + Stat::TrailingZeroFreq, + ListScalarVec(vec![size_of::() * 8; size_of::() * 8 + 1]).into(), + ), ]))); } @@ -81,6 +86,7 @@ impl<'a, T: NativePType> StatsCompute for NullableValues<'a, T> { trait BitWidth { fn bit_width(self) -> usize; + fn trailing_zeros(self) -> usize; } impl> BitWidth for T { @@ -101,6 +107,23 @@ impl> BitWidth for T { PScalar::F64(_) => bit_width, } } + + fn trailing_zeros(self) -> usize { + let scalar: PScalar = self.into(); + match scalar { + PScalar::U8(i) => i.trailing_zeros() as usize, + PScalar::U16(i) => i.trailing_zeros() as usize, + PScalar::U32(i) => i.trailing_zeros() as usize, + PScalar::U64(i) => i.trailing_zeros() as usize, + PScalar::I8(i) => i.trailing_zeros() as usize, + PScalar::I16(i) => i.trailing_zeros() as usize, + PScalar::I32(i) => i.trailing_zeros() as usize, + PScalar::I64(i) => i.trailing_zeros() as usize, + PScalar::F16(_) => 0, + PScalar::F32(_) => 0, + PScalar::F64(_) => 0, + } + } } struct StatsAccumulator { @@ -112,6 +135,7 @@ struct StatsAccumulator { run_count: usize, null_count: usize, bit_widths: Vec, + trailing_zeros: Vec, } impl StatsAccumulator { @@ -125,8 +149,10 @@ impl StatsAccumulator { run_count: 1, null_count: 0, bit_widths: vec![0; size_of::() * 8 + 1], + trailing_zeros: vec![0; size_of::() * 8 + 1], }; stats.bit_widths[first_value.bit_width()] += 1; + stats.trailing_zeros[first_value.trailing_zeros()] += 1; stats } @@ -135,6 +161,7 @@ impl StatsAccumulator { Some(n) => self.next(n), None => { self.bit_widths[0] += 1; + self.trailing_zeros[T::PTYPE.bit_width()] += 1; self.null_count += 1; } } @@ -142,6 +169,7 @@ impl StatsAccumulator { pub fn next(&mut self, next: T) { self.bit_widths[next.bit_width()] += 1; + self.trailing_zeros[next.trailing_zeros()] += 1; if self.prev == next { self.is_strict_sorted = false; @@ -166,6 +194,10 @@ impl StatsAccumulator { (Stat::NullCount, self.null_count.into()), (Stat::IsConstant, (self.min == self.max).into()), (Stat::BitWidthFreq, ListScalarVec(self.bit_widths).into()), + ( + Stat::TrailingZeroFreq, + ListScalarVec(self.trailing_zeros).into(), + ), (Stat::IsSorted, self.is_sorted.into()), ( Stat::IsStrictSorted, @@ -199,6 +231,11 @@ mod test { .get_or_compute_as::>(&Stat::BitWidthFreq) .unwrap() .0; + let trailing_zeros_freq: Vec = arr + .stats() + .get_or_compute_as::>(&Stat::TrailingZeroFreq) + .unwrap() + .0; let run_count: u64 = arr.stats().get_or_compute_as(&Stat::RunCount).unwrap(); assert_eq!(min, 1); assert_eq!(max, 5); @@ -212,6 +249,15 @@ mod test { 0, 0, 0, 0, 0, 0, ] ); + assert_eq!( + trailing_zeros_freq, + vec![ + // 1, 3, 5 have 0 trailing zeros + // 2 has 1 trailing zero, 4 has 2 trailing zeros + 3u64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ] + ); assert_eq!(run_count, 5); } diff --git a/vortex-array/src/array/sparse/compute.rs b/vortex-array/src/array/sparse/compute.rs index 3d3e4216cb..07f65a2969 100644 --- a/vortex-array/src/array/sparse/compute.rs +++ b/vortex-array/src/array/sparse/compute.rs @@ -1,19 +1,29 @@ +use arrow_buffer::BooleanBufferBuilder; +use itertools::Itertools; + +use crate::array::bool::BoolArray; use crate::array::downcast::DowncastArrayBuiltin; +use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; use crate::array::{Array, ArrayRef}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; +use crate::compute::flatten::{flatten, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::search_sorted::{search_sorted_usize, SearchSortedSide}; +use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; use crate::compute::ArrayCompute; -use crate::error::VortexResult; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; -use itertools::Itertools; +use crate::error::{VortexError, VortexResult}; +use crate::match_each_native_ptype; +use crate::scalar::Scalar; impl ArrayCompute for SparseArray { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -42,27 +52,61 @@ impl AsContiguousFn for SparseArray { } } +impl FlattenFn for SparseArray { + fn flatten(&self) -> VortexResult { + // Resolve our indices into a vector of usize applying the offset + let indices = self.resolved_indices(); + + let mut validity = BooleanBufferBuilder::new(self.len()); + validity.append_n(self.len(), false); + + let values = flatten(self.values())?; + if let FlattenedArray::Primitive(parray) = values { + match_each_native_ptype!(parray.ptype(), |$P| { + let mut values = vec![$P::default(); self.len()]; + let mut offset = 0; + + for v in parray.typed_data::<$P>() { + let idx = indices[offset]; + values[idx] = *v; + validity.set_bit(idx, true); + offset += 1; + } + + let validity = BoolArray::new(validity.finish(), None); + + Ok(FlattenedArray::Primitive(PrimitiveArray::from_nullable( + values, + Some(validity.boxed()), + ))) + }) + } else { + Err(VortexError::InvalidArgument( + "Cannot flatten SparseArray with non-primitive values".into(), + )) + } + } +} + impl ScalarAtFn for SparseArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { // Check whether `true_patch_index` exists in the patch index array // First, get the index of the patch index array that is the first index // greater than or equal to the true index let true_patch_index = index + self.indices_offset; - search_sorted_usize(self.indices(), true_patch_index, SearchSortedSide::Left).and_then( - |idx| { - // If the value at this index is equal to the true index, then it exists in the patch index array - // and we should return the value at the corresponding index in the patch values array - scalar_at(self.indices(), idx) - .or_else(|_| Ok(NullableScalar::none(self.values().dtype().clone()).boxed())) - .and_then(usize::try_from) - .and_then(|patch_index| { - if patch_index == true_patch_index { - scalar_at(self.values(), idx) - } else { - Ok(NullableScalar::none(self.values().dtype().clone()).boxed()) - } - }) - }, - ) + search_sorted(self.indices(), true_patch_index, SearchSortedSide::Left).and_then(|idx| { + // If the value at this index is equal to the true index, then it exists in the patch index array, + // and we should return the value at the corresponding index in the patch values array + scalar_at(self.indices(), idx) + .or_else(|_| Ok(Scalar::null(self.values().dtype()))) + .and_then(usize::try_from) + .and_then(|patch_index| { + if patch_index == true_patch_index { + scalar_at(self.values(), idx) + } else { + Ok(Scalar::null(self.values().dtype())) + } + }) + }) } } diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index 6a0e7ad943..9410a4abce 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -1,25 +1,19 @@ use std::any::Any; -use std::iter; use std::sync::{Arc, RwLock}; -use arrow::array::AsArray; -use arrow::array::{ - ArrayRef as ArrowArrayRef, BooleanBufferBuilder, PrimitiveArray as ArrowPrimitiveArray, -}; -use arrow::buffer::{NullBuffer, ScalarBuffer}; -use arrow::datatypes::UInt64Type; +use itertools::Itertools; use linkme::distributed_slice; use crate::array::ENCODINGS; -use crate::array::{ - check_slice_bounds, Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef}; use crate::compress::EncodingCompression; -use crate::compute::search_sorted::{search_sorted_usize, SearchSortedSide}; +use crate::compute::cast::cast; +use crate::compute::flatten::flatten_primitive; +use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::match_arrow_numeric_type; +use crate::ptype::PType; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsCompute, StatsSet}; @@ -53,7 +47,9 @@ impl SparseArray { indices_offset: usize, ) -> VortexResult { if !matches!(indices.dtype(), &DType::IDX) { - return Err(VortexError::InvalidDType(indices.dtype().clone())); + return Err(VortexError::InvalidArgument( + format!("Cannot use {} as indices", indices.dtype().clone()).into(), + )); } Ok(Self { @@ -82,18 +78,12 @@ impl SparseArray { /// Return indices as a vector of usize with the indices_offset applied. pub fn resolved_indices(&self) -> Vec { - let mut indices = Vec::with_capacity(self.len()); - self.indices().iter_arrow().for_each(|c| { - indices.extend( - arrow::compute::cast(c.as_ref(), &arrow::datatypes::DataType::UInt64) - .unwrap() - .as_primitive::() - .values() - .into_iter() - .map(|v| (*v as usize) - self.indices_offset), - ) - }); - indices + flatten_primitive(cast(self.indices(), &PType::U64.into()).unwrap().as_ref()) + .unwrap() + .typed_data::() + .iter() + .map(|v| (*v as usize) - self.indices_offset) + .collect_vec() } } @@ -133,36 +123,12 @@ impl Array for SparseArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - // Resolve our indices into a vector of usize applying the offset - let indices = self.resolved_indices(); - let array: ArrowArrayRef = match_arrow_numeric_type!(self.values().dtype(), |$E| { - let mut validity = BooleanBufferBuilder::new(self.len()); - validity.append_n(self.len(), false); - let mut values = vec![<$E as ArrowPrimitiveType>::Native::default(); self.len()]; - let mut offset = 0; - for values_array in self.values().iter_arrow() { - for v in values_array.as_primitive::<$E>().values() { - let idx = indices[offset]; - values[idx] = *v; - validity.set_bit(idx, true); - offset += 1; - } - } - Arc::new(ArrowPrimitiveArray::<$E>::new( - ScalarBuffer::from(values), - Some(NullBuffer::from(validity.finish())), - )) - }); - Box::new(iter::once(array)) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; // Find the index of the first patch index that is greater than or equal to the offset of this array - let index_start_index = search_sorted_usize(self.indices(), start, SearchSortedSide::Left)?; - let index_end_index = search_sorted_usize(self.indices(), stop, SearchSortedSide::Left)?; + let index_start_index = search_sorted(self.indices(), start, SearchSortedSide::Left)?; + let index_end_index = search_sorted(self.indices(), stop, SearchSortedSide::Left)?; Ok(SparseArray { indices_offset: self.indices_offset + start, @@ -184,8 +150,8 @@ impl Array for SparseArray { self.indices.nbytes() + self.values.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -231,12 +197,11 @@ impl Encoding for SparseEncoding { #[cfg(test)] mod test { - use arrow::array::AsArray; - use arrow::datatypes::Int32Type; use itertools::Itertools; use crate::array::sparse::SparseArray; use crate::array::Array; + use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; use crate::error::VortexError; @@ -246,13 +211,9 @@ mod test { } fn assert_sparse_array(sparse: &dyn Array, values: &[Option]) { - let sparse_arrow = sparse - .as_ref() - .iter_arrow() - .next() + let sparse_arrow = flatten_primitive(sparse) .unwrap() - .as_primitive::() - .into_iter() + .iter::() .collect_vec(); assert_eq!(sparse_arrow, values); } diff --git a/vortex-array/src/array/sparse/serde.rs b/vortex-array/src/array/sparse/serde.rs index 85fe6f715b..7275d25d81 100644 --- a/vortex-array/src/array/sparse/serde.rs +++ b/vortex-array/src/array/sparse/serde.rs @@ -4,10 +4,11 @@ use std::io::ErrorKind; use crate::array::sparse::{SparseArray, SparseEncoding}; use crate::array::{Array, ArrayRef}; use crate::dtype::DType; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for SparseArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.len())?; // TODO(robert): Rewrite indices and don't store offset ctx.write_usize(self.indices_offset())?; @@ -17,7 +18,7 @@ impl ArraySerde for SparseArray { } impl EncodingSerde for SparseEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let len = ctx.read_usize()?; let offset = ctx.read_usize()?; let indices = ctx.with_schema(&DType::IDX).read()?; diff --git a/vortex-array/src/array/struct_/compute.rs b/vortex-array/src/array/struct_/compute.rs index a7281ef1c3..13c16d0515 100644 --- a/vortex-array/src/array/struct_/compute.rs +++ b/vortex-array/src/array/struct_/compute.rs @@ -1,22 +1,71 @@ +use std::sync::Arc; + +use arrow_array::{ + Array as ArrowArray, ArrayRef as ArrowArrayRef, StructArray as ArrowStructArray, +}; +use arrow_schema::{Field, Fields}; +use itertools::Itertools; + use crate::array::downcast::DowncastArrayBuiltin; use crate::array::struct_::StructArray; use crate::array::{Array, ArrayRef}; +use crate::compute::as_arrow::{as_arrow, AsArrowArray}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; +use crate::compute::flatten::{flatten, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::ArrayCompute; use crate::error::VortexResult; -use crate::scalar::{Scalar, ScalarRef, StructScalar}; -use itertools::Itertools; +use crate::scalar::{Scalar, StructScalar}; impl ArrayCompute for StructArray { + fn as_arrow(&self) -> Option<&dyn AsArrowArray> { + Some(self) + } + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } + + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } } +impl AsArrowArray for StructArray { + fn as_arrow(&self) -> VortexResult { + let field_arrays: Vec = self + .fields() + .iter() + .map(|f| as_arrow(f.as_ref())) + .try_collect()?; + + let arrow_fields: Fields = self + .names() + .iter() + .zip(field_arrays.iter()) + .zip(self.fields().iter()) + .map(|((name, arrow_field), vortex_field)| { + Field::new( + name.as_str(), + arrow_field.data_type().clone(), + vortex_field.dtype().is_nullable(), + ) + }) + .map(Arc::new) + .collect(); + + Ok(Arc::new(ArrowStructArray::new( + arrow_fields, + field_arrays, + None, + ))) + } +} + impl AsContiguousFn for StructArray { fn as_contiguous(&self, arrays: Vec) -> VortexResult { let mut fields = vec![Vec::new(); self.fields().len()]; @@ -37,8 +86,20 @@ impl AsContiguousFn for StructArray { } } +impl FlattenFn for StructArray { + fn flatten(&self) -> VortexResult { + Ok(FlattenedArray::Struct(StructArray::new( + self.names().clone(), + self.fields() + .iter() + .map(|field| flatten(field.as_ref()).map(FlattenedArray::into_array)) + .try_collect()?, + ))) + } +} + impl ScalarAtFn for StructArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { Ok(StructScalar::new( self.dtype.clone(), self.fields @@ -46,6 +107,6 @@ impl ScalarAtFn for StructArray { .map(|field| scalar_at(field.as_ref(), index)) .try_collect()?, ) - .boxed()) + .into()) } } diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs index 386f883f85..a68bea1e51 100644 --- a/vortex-array/src/array/struct_/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -1,13 +1,9 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use arrow::array::StructArray as ArrowStructArray; -use arrow::array::{Array as ArrowArray, ArrayRef as ArrowArrayRef}; -use arrow::datatypes::{Field, Fields}; use itertools::Itertools; use linkme::distributed_slice; -use crate::arrow::aligned_iter::AlignedArrowArrayIterator; use crate::compress::EncodingCompression; use crate::dtype::{DType, FieldNames}; use crate::error::VortexResult; @@ -15,10 +11,7 @@ use crate::formatter::{ArrayDisplay, ArrayFormatter}; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsCompute, StatsSet}; -use super::{ - check_slice_bounds, Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, - ENCODINGS, -}; +use super::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS}; mod compress; mod compute; @@ -65,15 +58,6 @@ impl StructArray { panic!("dtype is not a struct") } } - - fn arrow_fields(&self) -> Fields { - self.names() - .iter() - .zip(self.field_dtypes()) - .map(|(name, dtype)| Field::new(name.as_str(), dtype.into(), dtype.is_nullable())) - .map(Arc::new) - .collect() - } } impl Array for StructArray { @@ -111,25 +95,6 @@ impl Array for StructArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - let fields = self.arrow_fields(); - Box::new( - AlignedArrowArrayIterator::new( - self.fields - .iter() - .map(|f| f.iter_arrow()) - .collect::>(), - ) - .map(move |items| { - Arc::new(ArrowStructArray::new( - fields.clone(), - items.into_iter().map(ArrowArrayRef::from).collect(), - None, - )) as Arc - }), - ) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -155,8 +120,8 @@ impl Array for StructArray { self.fields.iter().map(|arr| arr.nbytes()).sum() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -203,43 +168,3 @@ impl ArrayDisplay for StructArray { Ok(()) } } - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use arrow::array::types::UInt64Type; - use arrow::array::PrimitiveArray as ArrowPrimitiveArray; - use arrow::array::StructArray as ArrowStructArray; - use arrow::array::{Array as ArrowArray, GenericStringArray as ArrowStringArray}; - - use crate::array::struct_::StructArray; - use crate::array::Array; - - #[test] - pub fn iter() { - let arrow_aas = ArrowPrimitiveArray::::from(vec![1, 2, 3]); - let arrow_bbs = ArrowStringArray::::from(vec!["a", "b", "c"]); - - let array = StructArray::new( - vec![Arc::new("a".to_string()), Arc::new("b".to_string())], - vec![(&arrow_aas).into(), (&arrow_bbs).into()], - ); - let arrow_struct = ArrowStructArray::new( - array.arrow_fields(), - vec![Arc::new(arrow_aas), Arc::new(arrow_bbs)], - None, - ); - - assert_eq!( - array - .iter_arrow() - .next() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(), - &arrow_struct - ); - } -} diff --git a/vortex-array/src/array/struct_/serde.rs b/vortex-array/src/array/struct_/serde.rs index 17e0307ad3..79737d5264 100644 --- a/vortex-array/src/array/struct_/serde.rs +++ b/vortex-array/src/array/struct_/serde.rs @@ -1,13 +1,11 @@ -use std::io; -use std::io::ErrorKind; - use crate::array::struct_::{StructArray, StructEncoding}; use crate::array::{Array, ArrayRef}; use crate::dtype::DType; +use crate::error::{VortexError, VortexResult}; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for StructArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.fields().len())?; for f in self.fields() { ctx.write(f.as_ref())?; @@ -17,20 +15,17 @@ impl ArraySerde for StructArray { } impl EncodingSerde for StructEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let num_fields = ctx.read_usize()?; let mut fields = Vec::::with_capacity(num_fields); // TODO(robert): use read_vectored for i in 0..num_fields { fields.push(ctx.subfield(i).read()?); } - let DType::Struct(ns, _) = ctx.schema() else { - return Err(io::Error::new( - ErrorKind::InvalidData, - "invalid schema type", - )); + let DType::Struct(names, _) = ctx.schema() else { + return Err(VortexError::InvalidDType(ctx.schema().clone())); }; - Ok(StructArray::new(ns.clone(), fields).boxed()) + Ok(StructArray::new(names.clone(), fields).boxed()) } } diff --git a/vortex-array/src/array/typed/compute.rs b/vortex-array/src/array/typed/compute.rs deleted file mode 100644 index e82d8364f1..0000000000 --- a/vortex-array/src/array/typed/compute.rs +++ /dev/null @@ -1,41 +0,0 @@ -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::typed::TypedArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::ArrayCompute; -use crate::error::VortexResult; -use crate::scalar::ScalarRef; -use itertools::Itertools; - -impl ArrayCompute for TypedArray { - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } -} - -impl AsContiguousFn for TypedArray { - fn as_contiguous(&self, arrays: Vec) -> VortexResult { - Ok(TypedArray::new( - as_contiguous( - arrays - .into_iter() - .map(|array| dyn_clone::clone_box(array.as_typed().untyped_array())) - .collect_vec(), - )?, - self.dtype().clone(), - ) - .boxed()) - } -} - -impl ScalarAtFn for TypedArray { - fn scalar_at(&self, index: usize) -> VortexResult { - let underlying = scalar_at(self.array.as_ref(), index)?; - underlying.as_ref().cast(self.dtype()) - } -} diff --git a/vortex-array/src/array/typed/mod.rs b/vortex-array/src/array/typed/mod.rs deleted file mode 100644 index 5965bc46bf..0000000000 --- a/vortex-array/src/array/typed/mod.rs +++ /dev/null @@ -1,203 +0,0 @@ -use std::any::Any; -use std::sync::{Arc, RwLock}; - -use arrow::datatypes::DataType; -use linkme::distributed_slice; - -use crate::array::{Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::compress::EncodingCompression; -use crate::dtype::DType; -use crate::error::VortexResult; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsCompute, StatsSet}; - -mod compress; -mod compute; -mod serde; - -#[derive(Debug, Clone)] -pub struct TypedArray { - array: ArrayRef, - dtype: DType, - stats: Arc>, -} - -impl TypedArray { - pub fn new(array: ArrayRef, dtype: DType) -> Self { - Self { - array, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - } - } - - /// Possibly wrap an array in a TypedArray if the dtype is different - pub fn maybe_wrap(array: ArrayRef, dtype: &DType) -> ArrayRef { - if array.dtype() == dtype { - array - } else { - // Should we check the DType is compatible...? - Self::new(array, dtype.clone()).boxed() - } - } - - #[inline] - pub fn untyped_array(&self) -> &dyn Array { - self.array.as_ref() - } -} - -impl Array for TypedArray { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn boxed(self) -> ArrayRef { - Box::new(self) - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn len(&self) -> usize { - self.array.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.array.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - // TODO(robert): Have cast happen in enc space and not in arrow space - fn iter_arrow(&self) -> Box { - let datatype: DataType = self.dtype().into(); - Box::new( - self.array.iter_arrow().map(move |arr| { - arrow::compute::kernels::cast::cast(arr.as_ref(), &datatype).unwrap() - }), - ) - } - - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(Self::new(self.array.slice(start, stop)?, self.dtype.clone()).boxed()) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &TypedEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.array.nbytes() - } - - fn serde(&self) -> &dyn ArraySerde { - self - } -} - -impl StatsCompute for TypedArray {} - -impl<'arr> AsRef<(dyn Array + 'arr)> for TypedArray { - fn as_ref(&self) -> &(dyn Array + 'arr) { - self - } -} - -#[derive(Debug)] -pub struct TypedEncoding; - -impl TypedEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.typed"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_TYPED: EncodingRef = &TypedEncoding; - -impl Encoding for TypedEncoding { - fn id(&self) -> &EncodingId { - &Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} - -impl ArrayDisplay for TypedArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("untyped", self.untyped_array()) - } -} - -#[cfg(test)] -mod test { - use std::iter; - - use arrow::array::cast::AsArray; - use arrow::array::types::Time64MicrosecondType; - use arrow::array::Time64MicrosecondArray; - use itertools::Itertools; - - use crate::array::typed::TypedArray; - use crate::array::Array; - use crate::compute::scalar_at::scalar_at; - use crate::dtype::{DType, Nullability, TimeUnit}; - use crate::scalar::{LocalTimeScalar, PScalar, Scalar}; - - #[test] - pub fn scalar() { - let arr = TypedArray::new( - vec![64_799_000_000_u64, 43_000_000_000].into(), - DType::LocalTime(TimeUnit::Us, Nullability::NonNullable), - ); - assert_eq!( - scalar_at(arr.as_ref(), 0).unwrap().as_ref(), - &LocalTimeScalar::new(PScalar::U64(64_799_000_000), TimeUnit::Us) as &dyn Scalar - ); - assert_eq!( - scalar_at(arr.as_ref(), 1).unwrap().as_ref(), - &LocalTimeScalar::new(PScalar::U64(43_000_000_000), TimeUnit::Us) as &dyn Scalar - ); - } - - #[test] - pub fn iter() { - let arr = TypedArray::new( - vec![64_799_000_000_i64, 43_000_000_000].into(), - DType::LocalTime(TimeUnit::Us, Nullability::NonNullable), - ); - arr.iter_arrow() - .zip_eq(iter::once(Box::new(Time64MicrosecondArray::from(vec![ - 64_799_000_000i64, - 43_000_000_000, - ])))) - .for_each(|(enc, arrow)| { - assert_eq!( - *enc.as_primitive::().values(), - *arrow.values() - ) - }); - } -} diff --git a/vortex-array/src/array/typed/serde.rs b/vortex-array/src/array/typed/serde.rs deleted file mode 100644 index 6643925961..0000000000 --- a/vortex-array/src/array/typed/serde.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::io; - -use crate::array::typed::{TypedArray, TypedEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for TypedArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { - ctx.dtype(self.untyped_array().dtype())?; - ctx.write(self.untyped_array()) - } -} - -impl EncodingSerde for TypedEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { - let inner_dtype = ctx.dtype()?; - Ok(TypedArray::new(ctx.with_schema(&inner_dtype).read()?, ctx.schema().clone()).boxed()) - } -} - -#[cfg(test)] -mod test { - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::typed::TypedArray; - use crate::array::Array; - use crate::dtype::{DType, IntWidth, Nullability, Signedness}; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = TypedArray::new( - vec![7u8, 37, 71, 97].into(), - DType::Int(IntWidth::_64, Signedness::Signed, Nullability::NonNullable), - ); - - let read_arr = roundtrip_array(arr.as_ref()).unwrap(); - - assert_eq!( - arr.untyped_array() - .as_primitive() - .buffer() - .typed_data::(), - read_arr - .as_typed() - .untyped_array() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!(arr.dtype(), read_arr.dtype()); - } -} diff --git a/vortex-array/src/array/typed/stats.rs b/vortex-array/src/array/typed/stats.rs deleted file mode 100644 index 355834f051..0000000000 --- a/vortex-array/src/array/typed/stats.rs +++ /dev/null @@ -1,3 +0,0 @@ -use crate::array::typed::TypedArray; -use crate::error::VortexResult; -use crate::stats::{Stat, StatsCompute, StatsSet}; diff --git a/vortex-array/src/array/varbin/compute.rs b/vortex-array/src/array/varbin/compute.rs index 22ae1b6ca5..b00bbf8542 100644 --- a/vortex-array/src/array/varbin/compute.rs +++ b/vortex-array/src/array/varbin/compute.rs @@ -1,23 +1,40 @@ +use std::sync::Arc; + +use arrow_array::{ + ArrayRef as ArrowArrayRef, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, +}; +use itertools::Itertools; + use crate::array::bool::BoolArray; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; use crate::array::{Array, ArrayRef, CloneOptionalArray}; +use crate::arrow::wrappers::{as_nulls, as_offset_buffer}; +use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::cast::cast_primitive; +use crate::compute::cast::cast; +use crate::compute::flatten::{flatten, flatten_primitive, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::dtype::DType; -use crate::error::VortexResult; +use crate::error::{VortexError, VortexResult}; use crate::ptype::PType; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; -use itertools::Itertools; +use crate::scalar::{BinaryScalar, Scalar, Utf8Scalar}; impl ArrayCompute for VarBinArray { + fn as_arrow(&self) -> Option<&dyn AsArrowArray> { + Some(self) + } + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -48,7 +65,7 @@ impl AsContiguousFn for VarBinArray { offsets.push(0); for a in arrays.iter().map(|a| a.as_varbin()) { let first_offset: u64 = a.first_offset()?; - let offsets_array = cast_primitive(a.offsets(), &PType::U64)?; + let offsets_array = flatten_primitive(cast(a.offsets(), &PType::U64.into())?.as_ref())?; let shift = offsets.last().copied().unwrap_or(0); offsets.extend( offsets_array @@ -65,8 +82,74 @@ impl AsContiguousFn for VarBinArray { } } +impl AsArrowArray for VarBinArray { + fn as_arrow(&self) -> VortexResult { + // Ensure the offsets are either i32 or i64 + let offsets = flatten_primitive(self.offsets())?; + let offsets = match offsets.ptype() { + &PType::I32 | &PType::I64 => offsets, + // Unless it's u64, everything else can be converted into an i32. + &PType::U64 => flatten_primitive(cast(offsets.as_ref(), &PType::I64.into())?.as_ref())?, + _ => flatten_primitive(cast(offsets.as_ref(), &PType::I32.into())?.as_ref())?, + }; + let nulls = as_nulls(offsets.validity())?; + + let data = flatten_primitive(self.bytes())?; + assert_eq!(data.ptype(), &PType::U8); + let data = data.buffer().clone(); + + // Switch on Arrow DType. + Ok(match self.dtype() { + DType::Binary(_) => match offsets.ptype() { + PType::I32 => Arc::new(BinaryArray::new( + as_offset_buffer::(offsets), + data, + nulls, + )), + PType::I64 => Arc::new(LargeBinaryArray::new( + as_offset_buffer::(offsets), + data, + nulls, + )), + _ => panic!("Invalid offsets type"), + }, + DType::Utf8(_) => match offsets.ptype() { + PType::I32 => Arc::new(StringArray::new( + as_offset_buffer::(offsets), + data, + nulls, + )), + PType::I64 => Arc::new(LargeStringArray::new( + as_offset_buffer::(offsets), + data, + nulls, + )), + _ => panic!("Invalid offsets type"), + }, + _ => return Err(VortexError::InvalidDType(self.dtype().clone())), + }) + } +} + +impl FlattenFn for VarBinArray { + fn flatten(&self) -> VortexResult { + let bytes = flatten(self.bytes())?.into_array(); + let offsets = flatten(self.offsets())?.into_array(); + let validity = self + .validity() + .map(|v| flatten(v).map(FlattenedArray::into_array)) + .transpose()?; + Ok(FlattenedArray::VarBin(VarBinArray::new( + offsets, + bytes, + self.dtype.clone(), + validity, + ))) + } +} + impl ScalarAtFn for VarBinArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { self.bytes_at(index).map(|bytes| { if matches!(self.dtype, DType::Utf8(_)) { @@ -75,8 +158,10 @@ impl ScalarAtFn for VarBinArray { bytes.into() } }) + } else if matches!(self.dtype, DType::Utf8(_)) { + Ok(Utf8Scalar::new(None).into()) } else { - Ok(NullableScalar::none(self.dtype.clone()).boxed()) + Ok(BinaryScalar::new(None).into()) } } } diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index 6c5aa3ac65..35d543eb3e 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -1,10 +1,6 @@ use std::any::Any; -use std::iter; use std::sync::{Arc, RwLock}; -use arrow::array::{make_array, Array as ArrowArray, ArrayData, AsArray}; -use arrow::buffer::NullBuffer; -use arrow::datatypes::UInt8Type; use linkme::distributed_slice; use num_traits::{FromPrimitive, Unsigned}; @@ -13,11 +9,11 @@ use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::values_iter::{VarBinIter, VarBinPrimitiveIter}; use crate::array::{ - check_slice_bounds, check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding, - EncodingId, EncodingRef, ENCODINGS, + check_slice_bounds, check_validity_buffer, Array, ArrayRef, Encoding, EncodingId, EncodingRef, + ENCODINGS, }; -use crate::arrow::CombineChunks; use crate::compress::EncodingCompression; +use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; use crate::dtype::{DType, IntWidth, Nullability, Signedness}; use crate::error::{VortexError, VortexResult}; @@ -210,8 +206,9 @@ impl VarBinArray { let start = scalar_at(self.offsets(), index)?.try_into()?; let end = scalar_at(self.offsets(), index + 1)?.try_into()?; let sliced = self.bytes().slice(start, end)?; - let arr_ref = sliced.iter_arrow().combine_chunks(); - Ok(arr_ref.as_primitive::().values().to_vec()) + Ok(flatten_primitive(sliced.as_ref())? + .typed_data::() + .to_vec()) } } @@ -251,29 +248,6 @@ impl Array for VarBinArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - let offsets_data = self.offsets.iter_arrow().combine_chunks().into_data(); - let bytes_data = self.bytes.iter_arrow().combine_chunks().into_data(); - - let data = ArrayData::builder(self.dtype.clone().into()) - .len(self.len()) - .nulls(self.validity().map(|v| { - NullBuffer::new( - v.iter_arrow() - .combine_chunks() - .as_boolean() - .values() - .clone(), - ) - })) - .add_buffer(offsets_data.buffers()[0].to_owned()) - .add_buffer(bytes_data.buffers()[0].to_owned()) - .build() - .unwrap(); - - Box::new(iter::once(make_array(data))) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -299,8 +273,8 @@ impl Array for VarBinArray { self.bytes.nbytes() + self.offsets.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -392,12 +366,9 @@ impl<'a> FromIterator> for VarBinArray { #[cfg(test)] mod test { - use arrow::array::{AsArray, GenericStringArray as ArrowStringArray}; - use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; use crate::array::Array; - use crate::arrow::CombineChunks; use crate::compute::scalar_at::scalar_at; use crate::dtype::{DType, Nullability}; @@ -436,19 +407,4 @@ mod test { Ok("hello world this is a long string".into()) ); } - - #[test] - pub fn iter() { - let binary_array = binary_array(); - assert_eq!( - binary_array - .iter_arrow() - .combine_chunks() - .as_string::(), - &ArrowStringArray::::from(vec![ - "hello world", - "hello world this is a long string", - ]) - ); - } } diff --git a/vortex-array/src/array/varbin/serde.rs b/vortex-array/src/array/varbin/serde.rs index f6966effa5..f5f24adc66 100644 --- a/vortex-array/src/array/varbin/serde.rs +++ b/vortex-array/src/array/varbin/serde.rs @@ -1,11 +1,10 @@ -use std::io; - use crate::array::varbin::{VarBinArray, VarBinEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for VarBinArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { if let Some(v) = self.validity() { ctx.write(v.as_ref())?; } @@ -16,7 +15,7 @@ impl ArraySerde for VarBinArray { } impl EncodingSerde for VarBinEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let validity = if ctx.schema().is_nullable() { Some(ctx.validity().read()?) } else { diff --git a/vortex-array/src/array/varbin/values_iter.rs b/vortex-array/src/array/varbin/values_iter.rs index 26723f97ee..6c8595a7b5 100644 --- a/vortex-array/src/array/varbin/values_iter.rs +++ b/vortex-array/src/array/varbin/values_iter.rs @@ -1,9 +1,6 @@ -use arrow::array::AsArray; -use arrow::datatypes::UInt8Type; - use crate::array::primitive::PrimitiveArray; use crate::array::Array; -use crate::arrow::CombineChunks; +use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; use crate::match_each_native_ptype; use num_traits::AsPrimitive; @@ -85,16 +82,12 @@ impl<'a> Iterator for VarBinIter<'a> { .try_into() .unwrap(); let slice_bytes = self.bytes.slice(self.last_offset, next_offset).unwrap(); + let slice_bytes = flatten_primitive(slice_bytes.as_ref()) + .unwrap() + .typed_data::() + .to_vec(); self.last_offset = next_offset; self.idx += 1; - // TODO(robert): iter as primitive vs arrow - Some( - slice_bytes - .iter_arrow() - .combine_chunks() - .as_primitive::() - .values() - .to_vec(), - ) + Some(slice_bytes) } } diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index ae4b3949ee..a6f440b111 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -1,9 +1,10 @@ use crate::array::varbinview::VarBinViewArray; +use crate::array::Array; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::ArrayCompute; use crate::dtype::DType; use crate::error::VortexResult; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; +use crate::scalar::Scalar; impl ArrayCompute for VarBinViewArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -12,7 +13,7 @@ impl ArrayCompute for VarBinViewArray { } impl ScalarAtFn for VarBinViewArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { self.bytes_at(index).map(|bytes| { if matches!(self.dtype, DType::Utf8(_)) { @@ -22,7 +23,7 @@ impl ScalarAtFn for VarBinViewArray { } }) } else { - Ok(NullableScalar::none(self.dtype.clone()).boxed()) + Ok(Scalar::null(self.dtype())) } } } diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index bc00954828..4cdac25357 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -1,21 +1,14 @@ -mod compute; -mod serde; - use std::any::Any; -use std::str::from_utf8_unchecked; +use std::mem; use std::sync::{Arc, RwLock}; -use std::{iter, mem}; -use arrow::array::cast::AsArray; -use arrow::array::types::UInt8Type; -use arrow::array::{ArrayRef as ArrowArrayRef, BinaryBuilder, StringBuilder}; use linkme::distributed_slice; use crate::array::{ - check_slice_bounds, check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding, - EncodingId, EncodingRef, ENCODINGS, + check_slice_bounds, check_validity_buffer, Array, ArrayRef, Encoding, EncodingId, EncodingRef, + ENCODINGS, }; -use crate::arrow::CombineChunks; +use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; use crate::dtype::{DType, IntWidth, Nullability, Signedness}; use crate::error::{VortexError, VortexResult}; @@ -23,6 +16,9 @@ use crate::formatter::{ArrayDisplay, ArrayFormatter}; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; +mod compute; +mod serde; + #[derive(Clone, Copy)] #[repr(C, align(8))] struct Inlined { @@ -157,15 +153,14 @@ impl VarBinViewArray { } pub(self) fn view_at(&self, index: usize) -> BinaryView { - let view_slice = self - .views - .slice(index * VIEW_SIZE, (index + 1) * VIEW_SIZE) - .unwrap() - .iter_arrow() - .next() - .unwrap(); - let view_vec: &[u8] = view_slice.as_primitive::().values(); - BinaryView::from_le_bytes(view_vec.try_into().unwrap()) + let view_vec = flatten_primitive( + self.views + .slice(index * VIEW_SIZE, (index + 1) * VIEW_SIZE) + .unwrap() + .as_ref(), + ) + .unwrap(); + BinaryView::from_le_bytes(view_vec.typed_data::().try_into().unwrap()) } #[inline] @@ -187,21 +182,18 @@ impl VarBinViewArray { let view = self.view_at(index); unsafe { if view.inlined.size > 12 { - let arrow_data_buffer = self - .data - .get(view._ref.buffer_index as usize) - .unwrap() - .slice( - view._ref.offset as usize, - (view._ref.size + view._ref.offset) as usize, - )? - .iter_arrow() - .combine_chunks(); - - Ok(arrow_data_buffer - .as_primitive::() - .values() - .to_vec()) + let arrow_data_buffer = flatten_primitive( + self.data + .get(view._ref.buffer_index as usize) + .unwrap() + .slice( + view._ref.offset as usize, + (view._ref.size + view._ref.offset) as usize, + )? + .as_ref(), + )?; + // TODO(ngates): can we avoid returning a copy? + Ok(arrow_data_buffer.typed_data::().to_vec()) } else { Ok(view.inlined.data[..view.inlined.size as usize].to_vec()) } @@ -245,35 +237,6 @@ impl Array for VarBinViewArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - let data_arr: ArrowArrayRef = if matches!(self.dtype, DType::Utf8(_)) { - let mut data_buf = StringBuilder::with_capacity(self.len(), self.plain_size()); - for i in 0..self.views.len() / VIEW_SIZE { - if !self.is_valid(i) { - data_buf.append_null() - } else { - unsafe { - data_buf.append_value(from_utf8_unchecked( - self.bytes_at(i).unwrap().as_slice(), - )); - } - } - } - Arc::new(data_buf.finish()) - } else { - let mut data_buf = BinaryBuilder::with_capacity(self.len(), self.plain_size()); - for i in 0..self.views.len() / VIEW_SIZE { - if !self.is_valid(i) { - data_buf.append_null() - } else { - data_buf.append_value(self.bytes_at(i).unwrap()) - } - } - Arc::new(data_buf.finish()) - }; - Box::new(iter::once(data_arr)) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -300,8 +263,8 @@ impl Array for VarBinViewArray { self.views.nbytes() + self.data.iter().map(|arr| arr.nbytes()).sum::() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -343,8 +306,6 @@ impl ArrayDisplay for VarBinViewArray { #[cfg(test)] mod test { - use arrow::array::GenericStringArray as ArrowStringArray; - use crate::array::primitive::PrimitiveArray; use super::*; @@ -396,19 +357,4 @@ mod test { Ok("hello world this is a long string".into()) ); } - - #[test] - pub fn iter() { - let binary_array = binary_array(); - assert_eq!( - binary_array - .iter_arrow() - .combine_chunks() - .as_string::(), - &ArrowStringArray::::from(vec![ - "hello world", - "hello world this is a long string", - ]) - ); - } } diff --git a/vortex-array/src/array/varbinview/serde.rs b/vortex-array/src/array/varbinview/serde.rs index 6950787de8..a476077a78 100644 --- a/vortex-array/src/array/varbinview/serde.rs +++ b/vortex-array/src/array/varbinview/serde.rs @@ -1,11 +1,10 @@ -use std::io; - use crate::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; use crate::array::{Array, ArrayRef}; +use crate::error::VortexResult; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for VarBinViewArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { if let Some(v) = self.validity() { ctx.write(v.as_ref())?; } @@ -19,7 +18,7 @@ impl ArraySerde for VarBinViewArray { } impl EncodingSerde for VarBinViewEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let validity = if ctx.schema().is_nullable() { Some(ctx.validity().read()?) } else { diff --git a/vortex-array/src/arrow/aligned_iter.rs b/vortex-array/src/arrow/aligned_iter.rs deleted file mode 100644 index d52fc44671..0000000000 --- a/vortex-array/src/arrow/aligned_iter.rs +++ /dev/null @@ -1,88 +0,0 @@ -use arrow::array::{Array as ArrowArray, ArrayRef}; - -pub struct AlignedArray { - iter: Box>, - current_chunk: Option, - offset: usize, -} - -impl AlignedArray { - pub fn new(mut iter: Box>) -> Self { - let current_chunk = iter.next(); - Self { - iter, - current_chunk, - offset: 0, - } - } - - pub fn length(&self) -> usize { - self.current_chunk.as_ref().unwrap().len() - self.offset - } -} - -pub struct AlignedArrowArrayIterator { - items: Vec, -} - -impl AlignedArrowArrayIterator { - pub fn new(iterators: Vec>>) -> Self { - let items = iterators.into_iter().map(AlignedArray::new).collect(); - Self { items } - } -} - -impl Iterator for AlignedArrowArrayIterator { - type Item = Vec; - - fn next(&mut self) -> Option { - let missing_chunks: usize = self - .items - .iter_mut() - .map(|v| { - if v.length() == 0 { - v.current_chunk = v.iter.next(); - v.offset = 0; - if v.current_chunk.is_none() { - 1 - } else { - 0 - } - } else { - 0 - } - }) - .sum(); - - if missing_chunks == self.items.len() { - return None; - } else if missing_chunks > 0 { - panic!( - "Misaligned arrays, {} arrays didn't return a next chunk", - missing_chunks - ); - } - - let smallest_chunk = self.items.iter().map(|v| v.length()).min().unwrap(); - - Some( - self.items - .iter_mut() - .map(|v| { - let len = v.length(); - let offset = v.offset; - v.offset += smallest_chunk; - - if len == smallest_chunk { - v.current_chunk.clone().unwrap() - } else { - v.current_chunk - .as_ref() - .unwrap() - .slice(offset, smallest_chunk) - } - }) - .collect::>(), - ) - } -} diff --git a/vortex-array/src/arrow/compute/mod.rs b/vortex-array/src/arrow/compute/mod.rs deleted file mode 100644 index 9ca7476da5..0000000000 --- a/vortex-array/src/arrow/compute/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub use repeat::*; - -mod repeat; diff --git a/vortex-array/src/arrow/compute/repeat.rs b/vortex-array/src/arrow/compute/repeat.rs deleted file mode 100644 index 662775e721..0000000000 --- a/vortex-array/src/arrow/compute/repeat.rs +++ /dev/null @@ -1,79 +0,0 @@ -use std::sync::Arc; - -use arrow::array::cast::AsArray; -use arrow::array::types::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; -use arrow::array::{ArrayRef, ArrowPrimitiveType, BooleanArray, Datum, NullArray, PrimitiveArray}; -use arrow::buffer::BooleanBuffer; -use arrow::datatypes::DataType; - -macro_rules! repeat_primitive { - ($arrow_type:ty, $arr:expr, $n:expr) => {{ - if $arr.is_null(0) { - return repeat_primitive::<$arrow_type>(None, $n) as ArrayRef; - } - - repeat_primitive::<$arrow_type>(Some($arr.as_primitive::<$arrow_type>().value(0)), $n) - as ArrayRef - }}; -} - -pub fn repeat(scalar: &dyn Datum, n: usize) -> ArrayRef { - let (arr, is_scalar) = scalar.get(); - assert!(is_scalar, "Datum was not a scalar"); - match arr.data_type() { - DataType::Null => Arc::new(NullArray::new(n)), - DataType::Boolean => { - if arr.is_valid(0) { - if arr.as_boolean().value(0) { - Arc::new(BooleanArray::from(BooleanBuffer::new_set(n))) - } else { - Arc::new(BooleanArray::from(BooleanBuffer::new_unset(n))) - } - } else { - Arc::new(BooleanArray::new_null(n)) - } - } - DataType::UInt8 => repeat_primitive!(UInt8Type, arr, n), - DataType::UInt16 => repeat_primitive!(UInt16Type, arr, n), - DataType::UInt32 => repeat_primitive!(UInt32Type, arr, n), - DataType::UInt64 => repeat_primitive!(UInt64Type, arr, n), - DataType::Int8 => repeat_primitive!(Int8Type, arr, n), - DataType::Int16 => repeat_primitive!(Int16Type, arr, n), - DataType::Int32 => repeat_primitive!(Int32Type, arr, n), - DataType::Int64 => repeat_primitive!(Int64Type, arr, n), - DataType::Float16 => repeat_primitive!(Float16Type, arr, n), - DataType::Float32 => repeat_primitive!(Float32Type, arr, n), - DataType::Float64 => repeat_primitive!(Float64Type, arr, n), - _ => todo!("Not implemented yet"), - } -} - -fn repeat_primitive( - value: Option, - n: usize, -) -> Arc> { - Arc::new( - value - .map(|v| PrimitiveArray::from_value(v, n)) - .unwrap_or_else(|| PrimitiveArray::new_null(n)), - ) -} - -#[cfg(test)] -mod test { - use crate::arrow::compute::repeat; - use arrow::array::cast::AsArray; - use arrow::array::types::UInt64Type; - use arrow::array::{Scalar, UInt64Array}; - - #[test] - fn test_repeat() { - let scalar = Scalar::new(UInt64Array::from(vec![47])); - let array = repeat(&scalar, 100); - assert_eq!(array.len(), 100); - assert_eq!(array.as_primitive::().value(50), 47); - } -} diff --git a/vortex-array/src/arrow/convert.rs b/vortex-array/src/arrow/convert.rs index 2892d6410c..f0e3b3f0b2 100644 --- a/vortex-array/src/arrow/convert.rs +++ b/vortex-array/src/arrow/convert.rs @@ -1,63 +1,19 @@ -use std::iter::zip; use std::sync::Arc; -use arrow::array::RecordBatch; -use arrow::array::RecordBatchReader; -use arrow::datatypes::{ - DataType, Field, FieldRef, Fields, Schema, SchemaRef, TimeUnit as ArrowTimeUnit, -}; +use arrow_array::RecordBatch; +use arrow_schema::{DataType, Field, SchemaRef, TimeUnit as ArrowTimeUnit}; use itertools::Itertools; -use crate::array::chunked::ChunkedArray; use crate::array::struct_::StructArray; -use crate::array::typed::TypedArray; use crate::array::{Array, ArrayRef}; +use crate::composite_dtypes::{localdate, localtime, zoneddatetime, TimeUnit}; +use crate::compute::cast::cast; use crate::dtype::DType::*; -use crate::dtype::{DType, FloatWidth, IntWidth, Nullability, TimeUnit}; +use crate::dtype::{DType, FloatWidth, IntWidth, Nullability}; +use crate::encode::FromArrow; use crate::error::{VortexError, VortexResult}; use crate::ptype::PType; -#[allow(dead_code)] -trait CollectRecordBatches: IntoIterator { - fn collect_record_batches(&self, schema: &Schema) -> ArrayRef; -} - -#[allow(dead_code)] -impl TryFrom<&mut dyn RecordBatchReader> for ArrayRef { - type Error = VortexError; - - fn try_from(reader: &mut dyn RecordBatchReader) -> Result { - let schema = reader.schema(); - let mut fields = vec![Vec::new(); schema.fields().len()]; - - for batch_result in reader { - let batch = batch_result?; - for f in 0..schema.fields().len() { - let col = batch.column(f).clone(); - fields[f].push(ArrayRef::from(col)); - } - } - - let names = schema - .fields() - .iter() - .map(|f| f.name()) - .cloned() - .map(Arc::new) - .collect_vec(); - - let chunks: VortexResult> = fields - .into_iter() - .zip(schema.fields()) - .map(|(field_chunks, arrow_type)| { - Ok(ChunkedArray::try_new(field_chunks, DType::try_from(arrow_type)?)?.boxed()) - }) - .try_collect(); - - Ok(StructArray::new(names, chunks?).boxed()) - } -} - impl From for ArrayRef { fn from(value: RecordBatch) -> Self { StructArray::new( @@ -75,8 +31,9 @@ impl From for ArrayRef { .zip(value.schema().fields()) .map(|(array, field)| { // The dtype of the child arrays infer their nullability from the array itself. - // In case the schema says something different, we wrap the array with the schema's dtype. - TypedArray::maybe_wrap(array.clone().into(), &field.try_into().unwrap()) + // In case the schema says something different, we cast into the schema's dtype. + let vortex_array = ArrayRef::from_arrow(array.clone(), field.is_nullable()); + cast(vortex_array.as_ref(), &field.as_ref().into()).unwrap() }) .collect(), ) @@ -97,8 +54,8 @@ impl TryFrom for DType { value .fields() .iter() - .map(|f| f.data_type().try_into_dtype(f.is_nullable())) - .collect::>>()?, + .map(|f| f.as_ref().into()) + .collect_vec(), )) } } @@ -130,76 +87,47 @@ impl TryFrom<&DataType> for PType { } } -pub trait TryIntoDType { - fn try_into_dtype(self, is_nullable: bool) -> VortexResult; -} - -impl TryIntoDType for &DataType { - fn try_into_dtype(self, is_nullable: bool) -> VortexResult { - use crate::dtype::Nullability::*; +impl From<&Field> for DType { + fn from(field: &Field) -> Self { use crate::dtype::Signedness::*; - let nullability: Nullability = is_nullable.into(); - - match self { - DataType::Null => Ok(Null), - DataType::Boolean => Ok(Bool(nullability)), - DataType::Int8 => Ok(Int(IntWidth::_8, Signed, nullability)), - DataType::Int16 => Ok(Int(IntWidth::_16, Signed, nullability)), - DataType::Int32 => Ok(Int(IntWidth::_32, Signed, nullability)), - DataType::Int64 => Ok(Int(IntWidth::_64, Signed, nullability)), - DataType::UInt8 => Ok(Int(IntWidth::_8, Unsigned, nullability)), - DataType::UInt16 => Ok(Int(IntWidth::_16, Unsigned, nullability)), - DataType::UInt32 => Ok(Int(IntWidth::_32, Unsigned, nullability)), - DataType::UInt64 => Ok(Int(IntWidth::_64, Unsigned, nullability)), - DataType::Float16 => Ok(Float(FloatWidth::_16, nullability)), - DataType::Float32 => Ok(Float(FloatWidth::_32, nullability)), - DataType::Float64 => Ok(Float(FloatWidth::_64, nullability)), - DataType::Utf8 | DataType::LargeUtf8 => Ok(Utf8(nullability)), - DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { - Ok(Binary(nullability)) - } + let nullability: Nullability = field.is_nullable().into(); + + match field.data_type() { + DataType::Null => Null, + DataType::Boolean => Bool(nullability), + DataType::Int8 => Int(IntWidth::_8, Signed, nullability), + DataType::Int16 => Int(IntWidth::_16, Signed, nullability), + DataType::Int32 => Int(IntWidth::_32, Signed, nullability), + DataType::Int64 => Int(IntWidth::_64, Signed, nullability), + DataType::UInt8 => Int(IntWidth::_8, Unsigned, nullability), + DataType::UInt16 => Int(IntWidth::_16, Unsigned, nullability), + DataType::UInt32 => Int(IntWidth::_32, Unsigned, nullability), + DataType::UInt64 => Int(IntWidth::_64, Unsigned, nullability), + DataType::Float16 => Float(FloatWidth::_16, nullability), + DataType::Float32 => Float(FloatWidth::_32, nullability), + DataType::Float64 => Float(FloatWidth::_64, nullability), + DataType::Utf8 | DataType::LargeUtf8 => Utf8(nullability), + DataType::Binary | DataType::LargeBinary => Binary(nullability), // TODO(robert): what to do about this timezone? - DataType::Timestamp(u, _) => Ok(ZonedDateTime(u.into(), nullability)), - DataType::Date32 | DataType::Date64 => Ok(LocalDate(nullability)), - DataType::Time32(u) | DataType::Time64(u) => Ok(LocalTime(u.into(), nullability)), - DataType::List(e) | DataType::FixedSizeList(e, _) | DataType::LargeList(e) => { - Ok(List(Box::new(e.try_into()?), nullability)) + DataType::Timestamp(u, _) => zoneddatetime(u.into(), nullability), + DataType::Date32 => localdate(IntWidth::_32, nullability), + DataType::Date64 => localdate(IntWidth::_64, nullability), + DataType::Time32(u) => localtime(u.into(), IntWidth::_32, nullability), + DataType::Time64(u) => localtime(u.into(), IntWidth::_64, nullability), + DataType::List(e) | DataType::LargeList(e) => { + List(Box::new(e.as_ref().into()), nullability) } - DataType::Struct(f) => Ok(Struct( + DataType::Struct(f) => Struct( f.iter().map(|f| Arc::new(f.name().clone())).collect(), - f.iter() - .map(|f| f.data_type().try_into_dtype(f.is_nullable())) - .collect::>>()?, - )), - DataType::Dictionary(_, v) => v.as_ref().try_into_dtype(is_nullable), - DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => { - Ok(Decimal(*p, *s, nullability)) - } - DataType::Map(e, _) => match e.data_type() { - DataType::Struct(f) => Ok(Map( - Box::new(f.first().unwrap().try_into()?), - Box::new(f.get(1).unwrap().try_into()?), - Nullable, - )), - _ => Err(VortexError::InvalidArrowDataType(e.data_type().clone())), - }, - DataType::RunEndEncoded(_, v) => v.try_into(), - DataType::Duration(_) | DataType::Interval(_) | DataType::Union(_, _) => { - Err(VortexError::InvalidArrowDataType(self.clone())) - } + f.iter().map(|f| f.as_ref().into()).collect_vec(), + ), + DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(*p, *s, nullability), + _ => unimplemented!("Arrow data type not yet supported: {:?}", field.data_type()), } } } -impl TryFrom<&FieldRef> for DType { - type Error = VortexError; - - fn try_from(value: &FieldRef) -> VortexResult { - value.data_type().try_into_dtype(value.is_nullable()) - } -} - impl From<&ArrowTimeUnit> for TimeUnit { fn from(value: &ArrowTimeUnit) -> Self { match value { @@ -211,109 +139,13 @@ impl From<&ArrowTimeUnit> for TimeUnit { } } -impl From for DataType { - fn from(value: DType) -> Self { - (&value).into() - } -} - -// TODO(ngates): we probably want to implement this for an arrow Field not a DataType? -impl From<&DType> for DataType { - fn from(value: &DType) -> Self { - use crate::dtype::Signedness::*; +impl From for ArrowTimeUnit { + fn from(value: TimeUnit) -> Self { match value { - Null => DataType::Null, - Bool(_) => DataType::Boolean, - Int(w, s, _) => match w { - IntWidth::Unknown => match s { - Unknown => DataType::Int64, - Unsigned => DataType::UInt64, - Signed => DataType::Int64, - }, - IntWidth::_8 => match s { - Unknown => DataType::Int8, - Unsigned => DataType::UInt8, - Signed => DataType::Int8, - }, - IntWidth::_16 => match s { - Unknown => DataType::Int16, - Unsigned => DataType::UInt16, - Signed => DataType::Int16, - }, - IntWidth::_32 => match s { - Unknown => DataType::Int32, - Unsigned => DataType::UInt32, - Signed => DataType::Int32, - }, - IntWidth::_64 => match s { - Unknown => DataType::Int64, - Unsigned => DataType::UInt64, - Signed => DataType::Int64, - }, - }, - Decimal(p, w, _) => DataType::Decimal128(*p, *w), - Float(w, _) => match w { - FloatWidth::Unknown => DataType::Float64, - FloatWidth::_16 => DataType::Float16, - FloatWidth::_32 => DataType::Float32, - FloatWidth::_64 => DataType::Float64, - }, - Utf8(_) => DataType::Utf8, - Binary(_) => DataType::Binary, - LocalTime(u, _) => DataType::Time64(match u { - TimeUnit::Ns => ArrowTimeUnit::Nanosecond, - TimeUnit::Us => ArrowTimeUnit::Microsecond, - TimeUnit::Ms => ArrowTimeUnit::Millisecond, - TimeUnit::S => ArrowTimeUnit::Second, - }), - LocalDate(_) => DataType::Date64, - Instant(u, _) => DataType::Timestamp( - match u { - TimeUnit::Ns => ArrowTimeUnit::Nanosecond, - TimeUnit::Us => ArrowTimeUnit::Microsecond, - TimeUnit::Ms => ArrowTimeUnit::Millisecond, - TimeUnit::S => ArrowTimeUnit::Second, - }, - None, - ), - ZonedDateTime(_, _) => { - unimplemented!("Converting ZoneDateTime to arrow datatype is not supported") - } - Struct(names, dtypes) => DataType::Struct( - zip(names, dtypes) - .map(|(n, dt)| Field::new((**n).clone(), dt.into(), dt.is_nullable())) - .collect(), - ), - List(c, _) => DataType::List(Arc::new(Field::new( - "element", - c.as_ref().into(), - c.is_nullable(), - ))), - Map(k, v, _) => DataType::Map( - Arc::new(Field::new( - "entries", - DataType::Struct(Fields::from(vec![ - Field::new("key", k.as_ref().into(), false), - Field::new("value", v.as_ref().into(), v.is_nullable()), - ])), - false, - )), - false, - ), + TimeUnit::S => ArrowTimeUnit::Second, + TimeUnit::Ms => ArrowTimeUnit::Millisecond, + TimeUnit::Us => ArrowTimeUnit::Microsecond, + TimeUnit::Ns => ArrowTimeUnit::Nanosecond, } } } - -#[cfg(test)] -mod tests { - use crate::dtype::*; - - use super::*; - - #[test] - fn test_dtype_to_datatype() { - let dtype = Int(IntWidth::_32, Signedness::Signed, Nullability::Nullable); - let data_type: DataType = dtype.into(); - assert_eq!(data_type, DataType::Int32); - } -} diff --git a/vortex-array/src/arrow/mod.rs b/vortex-array/src/arrow/mod.rs index e2c9df1c2f..ad0c08b3d8 100644 --- a/vortex-array/src/arrow/mod.rs +++ b/vortex-array/src/arrow/mod.rs @@ -1,48 +1,2 @@ -use arrow::array::ArrayRef; -use itertools::Itertools; - -use crate::array::ArrowIterator; - -pub mod aligned_iter; -pub mod compute; pub mod convert; - -pub trait CombineChunks { - fn combine_chunks(self) -> ArrayRef; -} - -impl CombineChunks for Box { - fn combine_chunks(self) -> ArrayRef { - let chunks = self.collect_vec(); - let chunk_refs = chunks.iter().map(|a| a.as_ref()).collect_vec(); - arrow::compute::concat(&chunk_refs).unwrap() - } -} - -#[macro_export] -macro_rules! match_arrow_numeric_type { - ($self:expr, | $_:tt $enc:ident | $($body:tt)*) => ({ - macro_rules! __with__ {( $_ $enc:ident ) => ( $($body)* )} - use $crate::dtype::DType::*; - use $crate::dtype::IntWidth::*; - use $crate::dtype::Signedness::*; - use $crate::dtype::FloatWidth; - use arrow::datatypes::*; - match $self { - Int(_8, Unsigned, _) => __with__! {UInt8Type}, - Int(_16, Unsigned, _) => __with__!{UInt16Type}, - Int(_32, Unsigned, _) => __with__!{UInt32Type}, - Int(_64, Unsigned, _) => __with__!{UInt64Type}, - Int(_8, Signed, _) => __with__! {Int8Type}, - Int(_16, Signed, _) => __with__!{Int16Type}, - Int(_32, Signed, _) => __with__!{Int32Type}, - Int(_64, Signed, _) => __with__!{Int64Type}, - Float(FloatWidth::_16, _) => __with__!{Float16Type}, - Float(FloatWidth::_32, _) => __with__!{Float32Type}, - Float(FloatWidth::_64, _) => __with__!{Float64Type}, - _ => unimplemented!("Convert this DType to ArrowPrimitiveType") - } - }) -} - -pub use match_arrow_numeric_type; +pub mod wrappers; diff --git a/vortex-array/src/arrow/wrappers.rs b/vortex-array/src/arrow/wrappers.rs new file mode 100644 index 0000000000..c84b2d5ca5 --- /dev/null +++ b/vortex-array/src/arrow/wrappers.rs @@ -0,0 +1,45 @@ +use crate::array::primitive::PrimitiveArray; +use crate::array::Array; +use crate::compute::flatten::flatten_bool; +use crate::compute::scalar_at::scalar_at; +use crate::error::VortexResult; +use crate::ptype::NativePType; +use crate::stats::Stat; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; + +pub fn as_scalar_buffer( + array: PrimitiveArray, +) -> ScalarBuffer { + assert_eq!(array.ptype(), &T::PTYPE); + ScalarBuffer::from(array.buffer().clone()) +} + +pub fn as_offset_buffer( + array: PrimitiveArray, +) -> OffsetBuffer { + OffsetBuffer::new(as_scalar_buffer(array)) +} + +pub fn as_nulls(validity: Option<&dyn Array>) -> VortexResult> { + if validity.is_none() { + return Ok(None); + } + + // Short-circuit if the validity is constant + let validity = validity.unwrap(); + if validity + .stats() + .get_as::(&Stat::IsConstant) + .unwrap_or_default() + { + if scalar_at(validity, 0)?.try_into().unwrap() { + return Ok(None); + } else { + return Ok(Some(NullBuffer::new_null(validity.len()))); + } + } + + Ok(Some(NullBuffer::new( + flatten_bool(validity)?.buffer().clone(), + ))) +} diff --git a/vortex-array/src/composite_dtypes.rs b/vortex-array/src/composite_dtypes.rs new file mode 100644 index 0000000000..08ab92de9f --- /dev/null +++ b/vortex-array/src/composite_dtypes.rs @@ -0,0 +1,90 @@ +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use crate::dtype::{DType, IntWidth, Nullability, Signedness}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub enum TimeUnit { + Ns, + Us, + Ms, + S, +} + +impl Display for TimeUnit { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + TimeUnit::Ns => write!(f, "ns"), + TimeUnit::Us => write!(f, "us"), + TimeUnit::Ms => write!(f, "ms"), + TimeUnit::S => write!(f, "s"), + } + } +} + +pub struct TimeUnitSerializer; + +impl TimeUnitSerializer { + pub fn serialize(unit: TimeUnit) -> Vec { + vec![unit as u8] + } + + pub fn deserialize(bytes: &[u8]) -> TimeUnit { + match bytes[0] { + 0x00 => TimeUnit::Ns, + 0x01 => TimeUnit::Us, + 0x02 => TimeUnit::Ms, + 0x03 => TimeUnit::S, + _ => panic!("Unknown timeunit variant"), + } + } +} + +const LOCALTIME_DTYPE: &str = "localtime"; + +pub fn localtime(unit: TimeUnit, width: IntWidth, nullability: Nullability) -> DType { + DType::Composite( + Arc::new(LOCALTIME_DTYPE.to_string()), + Box::new(DType::Int(width, Signedness::Signed, nullability)), + TimeUnitSerializer::serialize(unit), + ) +} + +const LOCALDATE_DTYPE: &str = "localdate"; + +pub fn localdate(width: IntWidth, nullability: Nullability) -> DType { + DType::Composite( + Arc::new(LOCALDATE_DTYPE.to_string()), + Box::new(DType::Int(width, Signedness::Signed, nullability)), + vec![], + ) +} + +const INSTANT_DTYPE: &str = "instant"; + +pub fn instant(unit: TimeUnit, nullability: Nullability) -> DType { + DType::Composite( + Arc::new(INSTANT_DTYPE.to_string()), + Box::new(DType::Int(IntWidth::_64, Signedness::Signed, nullability)), + TimeUnitSerializer::serialize(unit), + ) +} + +const ZONEDDATETIME_DTYPE: &str = "zoneddatetime"; + +pub fn zoneddatetime(unit: TimeUnit, nullability: Nullability) -> DType { + DType::Composite( + Arc::new(ZONEDDATETIME_DTYPE.to_string()), + Box::new(DType::Struct( + vec![ + Arc::new("instant".to_string()), + Arc::new("timezone".to_string()), + ], + vec![ + DType::Int(IntWidth::_64, Signedness::Signed, nullability), + DType::Utf8(nullability), + ], + )), + TimeUnitSerializer::serialize(unit), + ) +} diff --git a/vortex-array/src/compute/add.rs b/vortex-array/src/compute/add.rs index f1d4158a25..f020b75610 100644 --- a/vortex-array/src/compute/add.rs +++ b/vortex-array/src/compute/add.rs @@ -1,7 +1,7 @@ use crate::array::constant::ConstantArray; use crate::array::{Array, ArrayKind, ArrayRef}; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{Scalar, ScalarRef}; +use crate::scalar::Scalar; // TODO(ngates): convert this to arithmetic operations with macro over the kernel. pub fn add(lhs: &dyn Array, rhs: &dyn Array) -> VortexResult { @@ -21,7 +21,7 @@ pub fn add(lhs: &dyn Array, rhs: &dyn Array) -> VortexResult { } } -pub fn add_scalar(lhs: &dyn Array, rhs: &dyn Scalar) -> VortexResult { +pub fn add_scalar(lhs: &dyn Array, rhs: &Scalar) -> VortexResult { match ArrayKind::from(lhs) { ArrayKind::Constant(lhs) => { Ok(ConstantArray::new(add_scalars(lhs.scalar(), rhs)?, lhs.len()).boxed()) @@ -30,7 +30,7 @@ pub fn add_scalar(lhs: &dyn Array, rhs: &dyn Scalar) -> VortexResult { } } -pub fn add_scalars(_lhs: &dyn Scalar, _rhs: &dyn Scalar) -> VortexResult { +pub fn add_scalars(_lhs: &Scalar, _rhs: &Scalar) -> VortexResult { // Might need to improve this implementation... Ok(24.into()) } diff --git a/vortex-array/src/compute/as_arrow.rs b/vortex-array/src/compute/as_arrow.rs new file mode 100644 index 0000000000..6d4114bbef --- /dev/null +++ b/vortex-array/src/compute/as_arrow.rs @@ -0,0 +1,39 @@ +use crate::array::downcast::DowncastArrayBuiltin; +use crate::array::Array; +use crate::compute::flatten::flatten; +use crate::error::{VortexError, VortexResult}; +use arrow_array::ArrayRef as ArrowArrayRef; +use itertools::Itertools; + +pub trait AsArrowArray { + fn as_arrow(&self) -> VortexResult; +} + +pub fn as_arrow(array: &dyn Array) -> VortexResult { + // If as_arrow is implemented, then invoke that. + if let Some(a) = array.as_arrow() { + return a.as_arrow(); + } + + // Otherwise, flatten and try again. + let array = flatten(array)?.into_array(); + array.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { + Err(VortexError::NotImplemented( + "as_arrow", + array.encoding().id(), + )) + }) +} + +// TODO(ngates): return a RecordBatchReader instead? +pub fn as_arrow_chunks(array: &dyn Array) -> VortexResult> { + if let Some(chunked) = array.maybe_chunked() { + chunked + .chunks() + .iter() + .map(|a| as_arrow(a.as_ref())) + .try_collect() + } else { + as_arrow(array).map(|a| vec![a]) + } +} diff --git a/vortex-array/src/compute/cast.rs b/vortex-array/src/compute/cast.rs index d5657f2ae1..f8759ed3bd 100644 --- a/vortex-array/src/compute/cast.rs +++ b/vortex-array/src/compute/cast.rs @@ -1,35 +1,19 @@ -use crate::array::bool::BoolArray; -use crate::array::primitive::PrimitiveArray; -use crate::array::Array; +use crate::array::{Array, ArrayRef}; +use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; -use crate::ptype::PType; -pub trait CastPrimitiveFn { - fn cast_primitive(&self, ptype: &PType) -> VortexResult; +pub trait CastFn { + fn cast(&self, dtype: &DType) -> VortexResult; } -pub fn cast_primitive(array: &dyn Array, ptype: &PType) -> VortexResult { - PType::try_from(array.dtype()).map_err(|_| VortexError::InvalidDType(array.dtype().clone()))?; - array - .cast_primitive() - .map(|t| t.cast_primitive(ptype)) - .unwrap_or_else(|| { - Err(VortexError::NotImplemented( - "cast_primitive", - array.encoding().id(), - )) - }) -} +pub fn cast(array: &dyn Array, dtype: &DType) -> VortexResult { + if array.dtype() == dtype { + return Ok(dyn_clone::clone_box(array)); + } -pub trait CastBoolFn { - fn cast_bool(&self) -> VortexResult; -} - -pub fn cast_bool(array: &dyn Array) -> VortexResult { - array.cast_bool().map(|t| t.cast_bool()).unwrap_or_else(|| { - Err(VortexError::NotImplemented( - "cast_bool", - array.encoding().id(), - )) - }) + // TODO(ngates): check for null_count if dtype is non-nullable + array + .cast() + .map(|f| f.cast(dtype)) + .unwrap_or_else(|| Err(VortexError::NotImplemented("cast", array.encoding().id()))) } diff --git a/vortex-array/src/compute/flatten.rs b/vortex-array/src/compute/flatten.rs new file mode 100644 index 0000000000..15679f52f3 --- /dev/null +++ b/vortex-array/src/compute/flatten.rs @@ -0,0 +1,76 @@ +use crate::array::bool::BoolArray; +use crate::array::chunked::ChunkedArray; +use crate::array::composite::CompositeArray; +use crate::array::primitive::PrimitiveArray; +use crate::array::struct_::StructArray; +use crate::array::varbin::VarBinArray; +use crate::array::{Array, ArrayRef}; +use crate::error::{VortexError, VortexResult}; + +pub trait FlattenFn { + fn flatten(&self) -> VortexResult; +} + +/// The set of encodings that can be converted to Arrow with zero-copy. +pub enum FlattenedArray { + Bool(BoolArray), + Chunked(ChunkedArray), + Composite(CompositeArray), + Primitive(PrimitiveArray), + Struct(StructArray), + VarBin(VarBinArray), +} + +impl FlattenedArray { + pub fn into_array(self) -> ArrayRef { + match self { + FlattenedArray::Bool(array) => array.boxed(), + FlattenedArray::Chunked(array) => array.boxed(), + FlattenedArray::Composite(array) => array.boxed(), + FlattenedArray::Primitive(array) => array.boxed(), + FlattenedArray::Struct(array) => array.boxed(), + FlattenedArray::VarBin(array) => array.boxed(), + } + } +} + +/// Flatten an array into one of the flat encodings. +/// This does not guarantee that the array is recursively flattened. +pub fn flatten(array: &dyn Array) -> VortexResult { + array.flatten().map(|f| f.flatten()).unwrap_or_else(|| { + Err(VortexError::NotImplemented( + "flatten", + array.encoding().id(), + )) + }) +} + +pub fn flatten_bool(array: &dyn Array) -> VortexResult { + if let FlattenedArray::Bool(b) = flatten(array)? { + Ok(b) + } else { + Err(VortexError::InvalidArgument( + format!("Cannot flatten array {} into bool", array).into(), + )) + } +} + +pub fn flatten_primitive(array: &dyn Array) -> VortexResult { + if let FlattenedArray::Primitive(p) = flatten(array)? { + Ok(p) + } else { + Err(VortexError::InvalidArgument( + format!("Cannot flatten array {} into primitive", array).into(), + )) + } +} + +pub fn flatten_struct(array: &dyn Array) -> VortexResult { + if let FlattenedArray::Struct(s) = flatten(array)? { + Ok(s) + } else { + Err(VortexError::InvalidArgument( + format!("Cannot flatten array {} into struct", array).into(), + )) + } +} diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index 70cbdefa0f..7a962ba189 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -1,14 +1,19 @@ -use crate::compute::as_contiguous::AsContiguousFn; -use cast::{CastBoolFn, CastPrimitiveFn}; +use as_arrow::AsArrowArray; +use as_contiguous::AsContiguousFn; +use cast::CastFn; use fill::FillForwardFn; +use flatten::*; use patch::PatchFn; use scalar_at::ScalarAtFn; +use search_sorted::SearchSortedFn; use take::TakeFn; pub mod add; +pub mod as_arrow; pub mod as_contiguous; pub mod cast; pub mod fill; +pub mod flatten; pub mod patch; pub mod repeat; pub mod scalar_at; @@ -16,15 +21,19 @@ pub mod search_sorted; pub mod take; pub trait ArrayCompute { + fn as_arrow(&self) -> Option<&dyn AsArrowArray> { + None + } + fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { None } - fn cast_bool(&self) -> Option<&dyn CastBoolFn> { + fn cast(&self) -> Option<&dyn CastFn> { None } - fn cast_primitive(&self) -> Option<&dyn CastPrimitiveFn> { + fn flatten(&self) -> Option<&dyn FlattenFn> { None } @@ -40,6 +49,10 @@ pub trait ArrayCompute { None } + fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { + None + } + fn take(&self) -> Option<&dyn TakeFn> { None } diff --git a/vortex-array/src/compute/repeat.rs b/vortex-array/src/compute/repeat.rs index 9b94508a63..3d383fc1c7 100644 --- a/vortex-array/src/compute/repeat.rs +++ b/vortex-array/src/compute/repeat.rs @@ -2,19 +2,18 @@ use crate::array::constant::ConstantArray; use crate::array::{Array, ArrayRef}; use crate::scalar::Scalar; -pub fn repeat(scalar: &dyn Scalar, n: usize) -> ArrayRef { - ConstantArray::new(dyn_clone::clone_box(scalar), n).boxed() +pub fn repeat(scalar: &Scalar, n: usize) -> ArrayRef { + ConstantArray::new(scalar.clone(), n).boxed() } #[cfg(test)] mod test { use super::*; - use crate::scalar::ScalarRef; #[test] fn test_repeat() { - let scalar: ScalarRef = 47.into(); - let array = repeat(scalar.as_ref(), 100); + let scalar: Scalar = 47.into(); + let array = repeat(&scalar, 100); assert_eq!(array.len(), 100); } } diff --git a/vortex-array/src/compute/scalar_at.rs b/vortex-array/src/compute/scalar_at.rs index f579246cfd..c3d601dbd0 100644 --- a/vortex-array/src/compute/scalar_at.rs +++ b/vortex-array/src/compute/scalar_at.rs @@ -1,12 +1,12 @@ use crate::array::Array; use crate::error::{VortexError, VortexResult}; -use crate::scalar::ScalarRef; +use crate::scalar::Scalar; pub trait ScalarAtFn { - fn scalar_at(&self, index: usize) -> VortexResult; + fn scalar_at(&self, index: usize) -> VortexResult; } -pub fn scalar_at(array: &dyn Array, index: usize) -> VortexResult { +pub fn scalar_at(array: &dyn Array, index: usize) -> VortexResult { if index >= array.len() { return Err(VortexError::OutOfBounds(index, 0, array.len())); } diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 694548eb45..b002d8fb9d 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -1,69 +1,29 @@ use crate::array::Array; -use crate::error::VortexResult; -use crate::polars::IntoPolarsSeries; -use crate::polars::IntoPolarsValue; -use crate::scalar::ScalarRef; -use polars_core::prelude::*; -use polars_ops::prelude::*; +use crate::error::{VortexError, VortexResult}; +use crate::scalar::Scalar; pub enum SearchSortedSide { Left, Right, } -impl From for polars_ops::prelude::SearchSortedSide { - fn from(side: SearchSortedSide) -> Self { - match side { - SearchSortedSide::Left => polars_ops::prelude::SearchSortedSide::Left, - SearchSortedSide::Right => polars_ops::prelude::SearchSortedSide::Right, - } - } +pub trait SearchSortedFn { + fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult; } -pub fn search_sorted_usize( - indices: &dyn Array, - index: usize, +pub fn search_sorted>( + array: &dyn Array, + target: T, side: SearchSortedSide, ) -> VortexResult { - let enc_scalar: ScalarRef = index.into(); - // Convert index into correctly typed Arrow scalar. - let enc_scalar = enc_scalar.cast(indices.dtype())?; - - let series: Series = indices.iter_arrow().into_polars(); - Ok(search_sorted( - &series, - &Series::from_any_values("needle", &[enc_scalar.into_polars()], true)?, - side.into(), - false, - )? - .get(0) - .unwrap() as usize) -} - -#[cfg(test)] -mod test { - use super::*; - use crate::array::ArrayRef; - - #[test] - fn test_searchsorted_scalar() { - let haystack: ArrayRef = vec![1, 2, 3].into(); - - assert_eq!( - search_sorted_usize(haystack.as_ref(), 0, SearchSortedSide::Left).unwrap(), - 0 - ); - assert_eq!( - search_sorted_usize(haystack.as_ref(), 1, SearchSortedSide::Left).unwrap(), - 0 - ); - assert_eq!( - search_sorted_usize(haystack.as_ref(), 1, SearchSortedSide::Right).unwrap(), - 1 - ); - assert_eq!( - search_sorted_usize(haystack.as_ref(), 4, SearchSortedSide::Left).unwrap(), - 3 - ); - } + let scalar = target.into().cast(array.dtype())?; + array + .search_sorted() + .map(|f| f.search_sorted(&scalar, side)) + .unwrap_or_else(|| { + Err(VortexError::NotImplemented( + "search_sorted", + array.encoding().id(), + )) + }) } diff --git a/vortex-array/src/dtype.rs b/vortex-array/src/dtype.rs index 04228e215d..4a01869b2c 100644 --- a/vortex-array/src/dtype.rs +++ b/vortex-array/src/dtype.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Display, Formatter}; +use std::hash::Hash; use std::sync::Arc; use itertools::Itertools; @@ -122,28 +123,11 @@ impl Display for FloatWidth { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] -pub enum TimeUnit { - Ns, - Us, - Ms, - S, -} - -impl Display for TimeUnit { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - TimeUnit::Ns => write!(f, "ns"), - TimeUnit::Us => write!(f, "us"), - TimeUnit::Ms => write!(f, "ms"), - TimeUnit::S => write!(f, "s"), - } - } -} - pub type FieldNames = Vec>; -#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub type Metadata = Vec; + +#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)] pub enum DType { Null, Bool(Nullability), @@ -152,13 +136,9 @@ pub enum DType { Float(FloatWidth, Nullability), Utf8(Nullability), Binary(Nullability), - LocalTime(TimeUnit, Nullability), - LocalDate(Nullability), - Instant(TimeUnit, Nullability), - ZonedDateTime(TimeUnit, Nullability), Struct(FieldNames, Vec), List(Box, Nullability), - Map(Box, Box, Nullability), + Composite(Arc, Box, Metadata), } impl DType { @@ -180,13 +160,9 @@ impl DType { Float(_, n) => matches!(n, Nullable), Utf8(n) => matches!(n, Nullable), Binary(n) => matches!(n, Nullable), - LocalTime(_, n) => matches!(n, Nullable), - LocalDate(n) => matches!(n, Nullable), - Instant(_, n) => matches!(n, Nullable), - ZonedDateTime(_, n) => matches!(n, Nullable), Struct(_, fs) => fs.iter().all(|f| f.is_nullable()), List(_, n) => matches!(n, Nullable), - Map(_, _, n) => matches!(n, Nullable), + Composite(_, d, _) => d.is_nullable(), } } @@ -207,16 +183,16 @@ impl DType { Float(w, _) => Float(*w, nullability), Utf8(_) => Utf8(nullability), Binary(_) => Binary(nullability), - LocalTime(u, _) => LocalTime(*u, nullability), - LocalDate(_) => LocalDate(nullability), - Instant(u, _) => Instant(*u, nullability), - ZonedDateTime(u, _) => ZonedDateTime(*u, nullability), Struct(n, fs) => Struct( n.clone(), fs.iter().map(|f| f.with_nullability(nullability)).collect(), ), List(c, _) => List(c.clone(), nullability), - Map(k, v, _) => Map(k.clone(), v.clone(), nullability), + Composite(n, d, m) => Composite( + n.clone(), + Box::new(d.with_nullability(nullability)), + m.clone(), + ), } } @@ -240,10 +216,6 @@ impl Display for DType { Float(w, n) => write!(f, "float({}){}", w, n), Utf8(n) => write!(f, "utf8{}", n), Binary(n) => write!(f, "binary{}", n), - LocalTime(u, n) => write!(f, "localtime({}){}", u, n), - LocalDate(n) => write!(f, "localdate{}", n), - Instant(u, n) => write!(f, "instant({}){}", u, n), - ZonedDateTime(u, n) => write!(f, "zoned_date_time({}){}", u, n), Struct(n, dt) => write!( f, "{{{}}}", @@ -253,7 +225,8 @@ impl Display for DType { .join(", ") ), List(c, n) => write!(f, "list({}){}", c, n), - Map(k, v, n) => write!(f, "map({}, {}){}", k, v, n), + // TODO(robert): Print metadata + Composite(n, d, _) => write!(f, "composite({}, [{}])", n, d,), } } } @@ -299,3 +272,15 @@ impl From for DType { } } } + +#[cfg(test)] +mod test { + use std::mem; + + use crate::dtype::DType; + + #[test] + fn size_of() { + assert_eq!(mem::size_of::(), 56); + } +} diff --git a/vortex-array/src/encode.rs b/vortex-array/src/encode.rs index 020c2133d3..1d964037fe 100644 --- a/vortex-array/src/encode.rs +++ b/vortex-array/src/encode.rs @@ -1,35 +1,40 @@ use std::sync::Arc; -use arrow::array::cast::AsArray; -use arrow::array::types::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, +use arrow_array::array::{ + Array as ArrowArray, ArrayRef as ArrowArrayRef, BooleanArray as ArrowBooleanArray, + GenericByteArray, NullArray as ArrowNullArray, PrimitiveArray as ArrowPrimitiveArray, + StructArray as ArrowStructArray, }; -use arrow::array::{ - as_null_array, Array as ArrowArray, ArrayRef as ArrowArrayRef, - BooleanArray as ArrowBooleanArray, GenericByteArray, NullArray as ArrowNullArray, - PrimitiveArray as ArrowPrimitiveArray, StructArray as ArrowStructArray, +use arrow_array::array::{ArrowPrimitiveType, OffsetSizeTrait}; +use arrow_array::cast::{as_null_array, AsArray}; +use arrow_array::types::{ + ByteArrayType, Date32Type, Date64Type, DurationMicrosecondType, DurationMillisecondType, + DurationNanosecondType, DurationSecondType, Time32MillisecondType, Time32SecondType, + Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait}; -use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer}; -use arrow::datatypes::{ - ByteArrayType, DataType, Date32Type, Date64Type, DurationMicrosecondType, - DurationMillisecondType, DurationNanosecondType, DurationSecondType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, +use arrow_array::types::{ + Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; +use arrow_buffer::Buffer; +use arrow_schema::{DataType, Field, TimeUnit}; use crate::array::bool::BoolArray; +use crate::array::composite::CompositeArray; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; use crate::array::struct_::StructArray; -use crate::array::typed::TypedArray; use crate::array::varbin::VarBinArray; use crate::array::{Array, ArrayRef}; -use crate::arrow::convert::TryIntoDType; +use crate::dtype::DType; use crate::ptype::PType; -use crate::scalar::{NullScalar, Scalar}; +use crate::scalar::NullScalar; + +pub trait FromArrow { + fn from_arrow(array: A, nullable: bool) -> Self; +} impl From<&Buffer> for ArrayRef { fn from(value: &Buffer) -> Self { @@ -37,6 +42,12 @@ impl From<&Buffer> for ArrayRef { } } +impl From<&NullBuffer> for ArrayRef { + fn from(value: &NullBuffer) -> Self { + BoolArray::new(value.inner().to_owned(), None).boxed() + } +} + impl From<&OffsetBuffer> for ArrayRef { fn from(value: &OffsetBuffer) -> Self { let ptype = if O::IS_LARGE { PType::I64 } else { PType::I32 }; @@ -44,53 +55,58 @@ impl From<&OffsetBuffer> for ArrayRef { } } -impl From<&NullBuffer> for ArrayRef { - fn from(value: &NullBuffer) -> Self { - BoolArray::new(value.inner().to_owned(), None).boxed() - } -} - -impl From<&ArrowPrimitiveArray> for ArrayRef { - fn from(value: &ArrowPrimitiveArray) -> Self { +impl FromArrow<&ArrowPrimitiveArray> for ArrayRef { + fn from_arrow(value: &ArrowPrimitiveArray, nullable: bool) -> Self { let ptype: PType = (&T::DATA_TYPE).try_into().unwrap(); let arr = PrimitiveArray::new( ptype, value.values().inner().to_owned(), - value.nulls().map(|b| b.into()), + nulls(value.nulls(), nullable, value.len()), ) .boxed(); if T::DATA_TYPE.is_numeric() { arr } else { - TypedArray::new( - arr, - T::DATA_TYPE.try_into_dtype(value.is_nullable()).unwrap(), - ) - .boxed() + let DType::Composite(id, _, metadata) = (&Field::new("_", T::DATA_TYPE, false)).into() + else { + panic!("Expected composite DType") + }; + CompositeArray::new(id, metadata, arr).boxed() } } } -impl From<&GenericByteArray> for ArrayRef { - fn from(value: &GenericByteArray) -> Self { +impl FromArrow<&GenericByteArray> for ArrayRef { + fn from_arrow(value: &GenericByteArray, nullable: bool) -> Self { + let dtype = match T::DATA_TYPE { + DataType::Binary | DataType::LargeBinary => DType::Binary(nullable.into()), + DataType::Utf8 | DataType::LargeUtf8 => DType::Utf8(nullable.into()), + _ => panic!("Invalid data type for ByteArray"), + }; VarBinArray::new( value.offsets().into(), value.values().into(), - T::DATA_TYPE.try_into_dtype(value.is_nullable()).unwrap(), - value.nulls().map(|b| b.into()), + dtype, + nulls(value.nulls(), nullable, value.len()), ) .boxed() } } -impl From<&ArrowBooleanArray> for ArrayRef { - fn from(value: &ArrowBooleanArray) -> Self { - BoolArray::new(value.values().to_owned(), value.nulls().map(|b| b.into())).boxed() +impl FromArrow<&ArrowBooleanArray> for ArrayRef { + fn from_arrow(value: &ArrowBooleanArray, nullable: bool) -> Self { + BoolArray::new( + value.values().to_owned(), + nulls(value.nulls(), nullable, value.len()), + ) + .boxed() } } -impl From<&ArrowStructArray> for ArrayRef { - fn from(value: &ArrowStructArray) -> Self { +impl FromArrow<&ArrowStructArray> for ArrayRef { + fn from_arrow(value: &ArrowStructArray, nullable: bool) -> Self { + // TODO(ngates): how should we deal with Arrow "logical nulls"? + assert!(!nullable); StructArray::new( value .column_names() @@ -101,63 +117,108 @@ impl From<&ArrowStructArray> for ArrayRef { value .columns() .iter() - .map(|c| (*c).to_owned().into()) + .zip(value.fields()) + .map(|(c, field)| ArrayRef::from_arrow(c.clone(), field.is_nullable())) .collect(), ) .boxed() } } -impl From<&ArrowNullArray> for ArrayRef { - fn from(value: &ArrowNullArray) -> Self { - ConstantArray::new(NullScalar::new().boxed(), value.len()).boxed() +impl FromArrow<&ArrowNullArray> for ArrayRef { + fn from_arrow(value: &ArrowNullArray, nullable: bool) -> Self { + assert!(nullable); + ConstantArray::new(NullScalar::new().into(), value.len()).boxed() + } +} + +fn nulls(nulls: Option<&NullBuffer>, nullable: bool, len: usize) -> Option { + if nullable { + Some( + nulls + .map(|n| n.into()) + .unwrap_or_else(|| ConstantArray::new(true.into(), len).boxed()), + ) + } else { + assert!(nulls.is_none()); + None } } -impl From for ArrayRef { - fn from(array: ArrowArrayRef) -> Self { +impl FromArrow for ArrayRef { + fn from_arrow(array: ArrowArrayRef, nullable: bool) -> Self { match array.data_type() { - DataType::Boolean => array.as_boolean().into(), - DataType::UInt8 => array.as_primitive::().into(), - DataType::UInt16 => array.as_primitive::().into(), - DataType::UInt32 => array.as_primitive::().into(), - DataType::UInt64 => array.as_primitive::().into(), - DataType::Int8 => array.as_primitive::().into(), - DataType::Int16 => array.as_primitive::().into(), - DataType::Int32 => array.as_primitive::().into(), - DataType::Int64 => array.as_primitive::().into(), - DataType::Float16 => array.as_primitive::().into(), - DataType::Float32 => array.as_primitive::().into(), - DataType::Float64 => array.as_primitive::().into(), - DataType::Utf8 => array.as_string::().into(), - DataType::LargeUtf8 => array.as_string::().into(), - DataType::Binary => array.as_binary::().into(), - DataType::LargeBinary => array.as_binary::().into(), - DataType::Struct(_) => array.as_struct().into(), - DataType::Null => as_null_array(array.as_ref()).into(), + DataType::Boolean => ArrayRef::from_arrow(array.as_boolean(), nullable), + DataType::UInt8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Float16 => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + DataType::Float32 => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + DataType::Float64 => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + DataType::Utf8 => ArrayRef::from_arrow(array.as_string::(), nullable), + DataType::LargeUtf8 => ArrayRef::from_arrow(array.as_string::(), nullable), + DataType::Binary => ArrayRef::from_arrow(array.as_binary::(), nullable), + DataType::LargeBinary => ArrayRef::from_arrow(array.as_binary::(), nullable), + DataType::Struct(_) => ArrayRef::from_arrow(array.as_struct(), nullable), + DataType::Null => ArrayRef::from_arrow(as_null_array(array.as_ref()), nullable), DataType::Timestamp(u, _) => match u { - TimeUnit::Second => array.as_primitive::().into(), - TimeUnit::Millisecond => array.as_primitive::().into(), - TimeUnit::Microsecond => array.as_primitive::().into(), - TimeUnit::Nanosecond => array.as_primitive::().into(), + TimeUnit::Second => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } }, - DataType::Date32 => array.as_primitive::().into(), - DataType::Date64 => array.as_primitive::().into(), + DataType::Date32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Date64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), DataType::Time32(u) => match u { - TimeUnit::Second => array.as_primitive::().into(), - TimeUnit::Millisecond => array.as_primitive::().into(), + TimeUnit::Second => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } _ => unreachable!(), }, DataType::Time64(u) => match u { - TimeUnit::Microsecond => array.as_primitive::().into(), - TimeUnit::Nanosecond => array.as_primitive::().into(), + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } _ => unreachable!(), }, DataType::Duration(u) => match u { - TimeUnit::Second => array.as_primitive::().into(), - TimeUnit::Millisecond => array.as_primitive::().into(), - TimeUnit::Microsecond => array.as_primitive::().into(), - TimeUnit::Nanosecond => array.as_primitive::().into(), + TimeUnit::Second => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } }, _ => panic!( "TODO(robert): Missing array encoding for dtype {}", diff --git a/vortex-array/src/error.rs b/vortex-array/src/error.rs index c82575f124..391a7b35ff 100644 --- a/vortex-array/src/error.rs +++ b/vortex-array/src/error.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; -use std::env; use std::fmt::{self, Display, Formatter}; use std::ops::Deref; +use std::{env, io}; use crate::array::EncodingId; use crate::dtype::DType; @@ -70,11 +70,7 @@ pub enum VortexError { #[error("Expected type {0} but found type {1}")] MismatchedTypes(DType, DType), #[error("unexpected arrow data type: {0:?}")] - InvalidArrowDataType(arrow::datatypes::DataType), - #[error("polars error: {0:?}")] - PolarsError(PolarsError), - #[error("arrow error: {0:?}")] - ArrowError(ArrowError), + InvalidArrowDataType(arrow_schema::DataType), #[error("patch values may not be null for base dtype {0}")] NullPatchValuesNotAllowed(DType), #[error("unsupported DType {0} for data array")] @@ -83,38 +79,38 @@ pub enum VortexError { UnsupportedOffsetsArrayDType(DType), #[error("array containing indices or run ends must be strictly monotonically increasing")] IndexArrayMustBeStrictSorted, + #[error("arrow error: {0:?}")] + ArrowError(ArrowError), + #[error("io error: {0:?}")] + IOError(IOError), } pub type VortexResult = Result; -// Wrap up external errors so that we can implement a dumb PartialEq -#[derive(Debug)] -pub struct ArrowError(pub arrow::error::ArrowError); - -impl PartialEq for ArrowError { - fn eq(&self, _other: &Self) -> bool { - false +impl From<&str> for VortexError { + fn from(value: &str) -> Self { + VortexError::InvalidArgument(value.to_string().into()) } } -impl From for VortexError { - fn from(err: arrow::error::ArrowError) -> Self { - VortexError::ArrowError(ArrowError(err)) - } -} +macro_rules! wrapped_error { + ($E:ty, $e:ident) => { + #[derive(Debug)] + pub struct $e(pub $E); -#[derive(Debug)] -#[allow(dead_code)] -pub struct PolarsError(polars_core::error::PolarsError); + impl PartialEq for $e { + fn eq(&self, _other: &Self) -> bool { + false + } + } -impl PartialEq for PolarsError { - fn eq(&self, _other: &Self) -> bool { - false - } + impl From<$E> for VortexError { + fn from(err: $E) -> Self { + VortexError::$e($e(err)) + } + } + }; } -impl From for VortexError { - fn from(err: polars_core::error::PolarsError) -> Self { - VortexError::PolarsError(PolarsError(err)) - } -} +wrapped_error!(arrow_schema::ArrowError, ArrowError); +wrapped_error!(io::Error, IOError); diff --git a/vortex-array/src/iterator.rs b/vortex-array/src/iterator.rs new file mode 100644 index 0000000000..25d6469cd0 --- /dev/null +++ b/vortex-array/src/iterator.rs @@ -0,0 +1,56 @@ +use crate::accessor::ArrayAccessor; +use std::marker::PhantomData; + +pub struct ArrayIter, T> { + array: A, + current: usize, + end: usize, + phantom: PhantomData, +} + +impl, T> ArrayIter { + pub fn new(array: A) -> Self { + let len = array.len(); + ArrayIter { + array, + current: 0, + end: len, + phantom: PhantomData, + } + } +} + +impl, T> Iterator for ArrayIter { + type Item = Option; + + #[inline] + fn next(&mut self) -> Option { + if self.current == self.end { + None + } else { + let old = self.current; + self.current += 1; + Some(self.array.value(old)) + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + self.array.len() - self.current, + Some(self.array.len() - self.current), + ) + } +} + +impl, T> DoubleEndedIterator for ArrayIter { + fn next_back(&mut self) -> Option { + if self.end == self.current { + None + } else { + self.end -= 1; + Some(self.array.value(self.end)) + } + } +} + +impl, T> ExactSizeIterator for ArrayIter {} diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index fad5ab605f..fa84743448 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -2,13 +2,15 @@ pub mod array; pub mod arrow; pub mod scalar; +pub mod accessor; +pub mod composite_dtypes; pub mod compress; pub mod compute; pub mod dtype; pub mod encode; pub mod error; pub mod formatter; -mod polars; +pub mod iterator; pub mod ptype; mod sampling; pub mod serde; diff --git a/vortex-array/src/polars.rs b/vortex-array/src/polars.rs deleted file mode 100644 index 70b0845bcb..0000000000 --- a/vortex-array/src/polars.rs +++ /dev/null @@ -1,101 +0,0 @@ -use arrow::array::{Array as ArrowArray, ArrayRef as ArrowArrayRef}; -use polars_arrow::array::from_data; -use polars_core::prelude::{AnyValue, Series}; - -use crate::array::ArrowIterator; -use crate::dtype::DType; -use crate::scalar::{ - BinaryScalar, BoolScalar, NullableScalar, PScalar, Scalar, ScalarRef, Utf8Scalar, -}; - -pub trait IntoPolarsSeries { - fn into_polars(self) -> Series; -} - -impl IntoPolarsSeries for ArrowArrayRef { - fn into_polars(self) -> Series { - let polars_array = from_data(&self.to_data()); - ("array", polars_array).try_into().unwrap() - } -} - -impl IntoPolarsSeries for Vec { - fn into_polars(self) -> Series { - let chunks: Vec> = - self.iter().map(|a| from_data(&a.to_data())).collect(); - ("array", chunks).try_into().unwrap() - } -} - -impl IntoPolarsSeries for Box { - fn into_polars(self) -> Series { - let chunks: Vec> = - self.map(|a| from_data(&a.to_data())).collect(); - ("array", chunks).try_into().unwrap() - } -} - -pub trait IntoPolarsValue { - fn into_polars<'a>(self) -> AnyValue<'a>; -} - -impl IntoPolarsValue for ScalarRef { - fn into_polars<'a>(self) -> AnyValue<'a> { - self.as_ref().into_polars() - } -} - -impl IntoPolarsValue for &dyn Scalar { - fn into_polars<'a>(self) -> AnyValue<'a> { - if let Some(ns) = self.as_any().downcast_ref::() { - return match ns { - NullableScalar::Some(s, _) => s.as_ref().into_polars(), - NullableScalar::None(_) => AnyValue::Null, - }; - } - - match self.dtype() { - DType::Null => AnyValue::Null, - DType::Bool(_) => { - AnyValue::Boolean(self.as_any().downcast_ref::().unwrap().value()) - } - DType::Int(_, _, _) | DType::Float(_, _) => { - match self.as_any().downcast_ref::().unwrap() { - PScalar::U8(v) => AnyValue::UInt8(*v), - PScalar::U16(v) => AnyValue::UInt16(*v), - PScalar::U32(v) => AnyValue::UInt32(*v), - PScalar::U64(v) => AnyValue::UInt64(*v), - PScalar::I8(v) => AnyValue::Int8(*v), - PScalar::I16(v) => AnyValue::Int16(*v), - PScalar::I32(v) => AnyValue::Int32(*v), - PScalar::I64(v) => AnyValue::Int64(*v), - PScalar::F16(v) => AnyValue::Float32(v.to_f32()), - PScalar::F32(v) => AnyValue::Float32(*v), - PScalar::F64(v) => AnyValue::Float64(*v), - } - } - DType::Decimal(_, _, _) => todo!(), - DType::Utf8(_) => AnyValue::StringOwned( - self.as_any() - .downcast_ref::() - .unwrap() - .value() - .into(), - ), - DType::Binary(_) => AnyValue::BinaryOwned( - self.as_any() - .downcast_ref::() - .unwrap() - .value() - .clone(), - ), - DType::LocalTime(_, _) => todo!(), - DType::LocalDate(_) => todo!(), - DType::Instant(_, _) => todo!(), - DType::ZonedDateTime(_, _) => todo!(), - DType::Struct(_, _) => todo!(), - DType::List(_, _) => todo!(), - DType::Map(_, _, _) => todo!(), - } - } -} diff --git a/vortex-array/src/ptype.rs b/vortex-array/src/ptype.rs index 71f79d36e8..bcafbf236e 100644 --- a/vortex-array/src/ptype.rs +++ b/vortex-array/src/ptype.rs @@ -1,13 +1,13 @@ -use std::fmt::{Debug, Display}; +use arrow_buffer::ArrowNativeType; +use std::fmt::{Debug, Display, Formatter}; use std::panic::RefUnwindSafe; -use arrow::datatypes::ArrowNativeType; use half::f16; use num_traits::{Num, NumCast}; use crate::dtype::{DType, FloatWidth, IntWidth, Signedness}; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{PScalar, ScalarRef}; +use crate::scalar::{PScalar, Scalar}; #[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)] pub enum PType { @@ -37,8 +37,8 @@ pub trait NativePType: + RefUnwindSafe + Num + NumCast - + Into - + TryFrom + + Into + + TryFrom + Into { const PTYPE: PType; @@ -140,27 +140,27 @@ macro_rules! match_each_unsigned_integer_ptype { pub use match_each_unsigned_integer_ptype; impl PType { - pub fn is_unsigned_int(self) -> bool { + pub const fn is_unsigned_int(self) -> bool { matches!(self, PType::U8 | PType::U16 | PType::U32 | PType::U64) } - pub fn is_signed_int(self) -> bool { + pub const fn is_signed_int(self) -> bool { matches!(self, PType::I8 | PType::I16 | PType::I32 | PType::I64) } - pub fn is_int(self) -> bool { + pub const fn is_int(self) -> bool { self.is_unsigned_int() || self.is_signed_int() } - pub fn is_float(self) -> bool { + pub const fn is_float(self) -> bool { matches!(self, PType::F16 | PType::F32 | PType::F64) } - pub fn byte_width(&self) -> usize { + pub const fn byte_width(&self) -> usize { match_each_native_ptype!(self, |$T| std::mem::size_of::<$T>()) } - pub fn bit_width(&self) -> usize { + pub const fn bit_width(&self) -> usize { self.byte_width() * 8 } @@ -185,6 +185,24 @@ impl PType { } } +impl Display for PType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + PType::U8 => write!(f, "u8"), + PType::U16 => write!(f, "u16"), + PType::U32 => write!(f, "u32"), + PType::U64 => write!(f, "u64"), + PType::I8 => write!(f, "i8"), + PType::I16 => write!(f, "i16"), + PType::I32 => write!(f, "i32"), + PType::I64 => write!(f, "i64"), + PType::F16 => write!(f, "f16"), + PType::F32 => write!(f, "f32"), + PType::F64 => write!(f, "f64"), + } + } +} + impl TryFrom<&DType> for PType { type Error = VortexError; @@ -224,7 +242,9 @@ impl TryFrom<&DType> for PType { FloatWidth::_32 => Ok(PType::F32), FloatWidth::_64 => Ok(PType::F64), }, - _ => Err(VortexError::InvalidDType(value.clone())), + _ => Err(VortexError::InvalidArgument( + format!("Cannot convert DType {} into PType", value.clone()).into(), + )), } } } diff --git a/vortex-array/src/scalar/arrow.rs b/vortex-array/src/scalar/arrow.rs deleted file mode 100644 index 13ce1700c1..0000000000 --- a/vortex-array/src/scalar/arrow.rs +++ /dev/null @@ -1,72 +0,0 @@ -use arrow::array::types::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; -use arrow::array::Scalar as ArrowScalar; -use arrow::array::{Datum, PrimitiveArray}; - -use crate::scalar::{PScalar, Scalar}; - -impl From<&dyn Scalar> for Box { - fn from(value: &dyn Scalar) -> Self { - if let Some(pscalar) = value.as_any().downcast_ref::() { - return match pscalar { - PScalar::U8(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::U16(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::U32(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::U64(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::I8(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![*v]))) - } - PScalar::I16(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::I32(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::I64(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::F16(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::F32(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - PScalar::F64(v) => { - Box::new(ArrowScalar::new(PrimitiveArray::::from(vec![ - *v, - ]))) - } - }; - } - - todo!("implement other scalar types {:?}", value) - } -} diff --git a/vortex-array/src/scalar/binary.rs b/vortex-array/src/scalar/binary.rs index fd2f725087..3975ee1edd 100644 --- a/vortex-array/src/scalar/binary.rs +++ b/vortex-array/src/scalar/binary.rs @@ -1,97 +1,68 @@ +use std::fmt::{Display, Formatter}; + use crate::dtype::{DType, Nullability}; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{Scalar, ScalarRef}; -use std::any::Any; -use std::fmt::{Display, Formatter}; +use crate::scalar::Scalar; #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct BinaryScalar { - value: Vec, + value: Option>, } impl BinaryScalar { - pub fn new(value: Vec) -> Self { + pub fn new(value: Option>) -> Self { Self { value } } - pub fn value(&self) -> &Vec { - &self.value - } -} - -impl Scalar for BinaryScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) + pub fn none() -> Self { + Self { value: None } } - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) + pub fn some(value: Vec) -> Self { + Self { value: Some(value) } } - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) + pub fn value(&self) -> Option<&[u8]> { + self.value.as_deref() } #[inline] - fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &DType { &DType::Binary(Nullability::NonNullable) } - fn cast(&self, _dtype: &DType) -> VortexResult { + pub fn cast(&self, _dtype: &DType) -> VortexResult { todo!() } - fn nbytes(&self) -> usize { - self.value.len() + pub fn nbytes(&self) -> usize { + self.value().map(|s| s.len()).unwrap_or(1) } } -impl From> for ScalarRef { +impl From> for Scalar { fn from(value: Vec) -> Self { - BinaryScalar::new(value).boxed() + BinaryScalar::new(Some(value)).into() } } -impl TryFrom for Vec { +impl TryFrom for Vec { type Error = VortexError; - fn try_from(value: ScalarRef) -> Result { - let dtype = value.dtype().clone(); - let scalar = value - .into_any() - .downcast::() - .map_err(|_| VortexError::InvalidDType(dtype))?; - Ok(scalar.value) - } -} - -impl TryFrom<&dyn Scalar> for Vec { - type Error = VortexError; - - fn try_from(value: &dyn Scalar) -> Result { - if let Some(scalar) = value.as_any().downcast_ref::() { - Ok(scalar.value.clone()) - } else { - Err(VortexError::InvalidDType(value.dtype().clone())) - } + fn try_from(value: Scalar) -> VortexResult { + let Scalar::Binary(b) = value else { + return Err(VortexError::InvalidDType(value.dtype().clone())); + }; + let dtype = b.dtype().clone(); + b.value.ok_or_else(|| VortexError::InvalidDType(dtype)) } } impl Display for BinaryScalar { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "bytes[{}]", self.value.len()) + match self.value() { + None => write!(f, "bytes[none]"), + Some(b) => write!(f, "bytes[{}]", b.len()), + } } } diff --git a/vortex-array/src/scalar/bool.rs b/vortex-array/src/scalar/bool.rs index ea7c9f56bf..fb4093e727 100644 --- a/vortex-array/src/scalar/bool.rs +++ b/vortex-array/src/scalar/bool.rs @@ -1,105 +1,74 @@ -use std::any::Any; use std::fmt::{Display, Formatter}; use crate::dtype::{DType, Nullability}; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; +use crate::scalar::Scalar; #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct BoolScalar { - value: bool, + value: Option, } impl BoolScalar { - pub fn new(value: bool) -> Self { + pub fn new(value: Option) -> Self { Self { value } } - pub fn value(&self) -> bool { - self.value - } -} - -impl Scalar for BoolScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self + pub fn none() -> Self { + Self { value: None } } - #[inline] - fn into_any(self: Box) -> Box { - self + pub fn some(value: bool) -> Self { + Self { value: Some(value) } } - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) + pub fn value(&self) -> Option { + self.value } #[inline] - fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &DType { &DType::Bool(Nullability::NonNullable) } - fn cast(&self, dtype: &DType) -> VortexResult { + pub fn cast(&self, dtype: &DType) -> VortexResult { match dtype { - DType::Bool(Nullability::NonNullable) => Ok(self.clone().boxed()), - DType::Bool(Nullability::Nullable) => { - Ok(NullableScalar::some(self.clone().boxed()).boxed()) - } + DType::Bool(_) => Ok(self.clone().into()), _ => Err(VortexError::InvalidDType(dtype.clone())), } } - fn nbytes(&self) -> usize { + pub fn nbytes(&self) -> usize { 1 } } -impl From for ScalarRef { +impl From for Scalar { #[inline] fn from(value: bool) -> Self { - BoolScalar::new(value).boxed() + BoolScalar::new(Some(value)).into() } } -impl TryFrom for bool { +impl TryFrom for bool { type Error = VortexError; - #[inline] - fn try_from(value: ScalarRef) -> VortexResult { - value.as_ref().try_into() - } -} - -impl TryFrom<&dyn Scalar> for bool { - type Error = VortexError; + fn try_from(value: Scalar) -> VortexResult { + let Scalar::Bool(b) = value else { + return Err(VortexError::InvalidDType(value.dtype().clone())); + }; - fn try_from(value: &dyn Scalar) -> VortexResult { - if let Some(bool_scalar) = value - .as_nonnull() - .and_then(|v| v.as_any().downcast_ref::()) - { - Ok(bool_scalar.value()) - } else { - Err(VortexError::InvalidDType(value.dtype().clone())) - } + b.value() + .ok_or_else(|| VortexError::InvalidDType(b.dtype().clone())) } } impl Display for BoolScalar { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.value) + match self.value() { + None => write!(f, "null"), + Some(b) => Display::fmt(&b, f), + } } } @@ -109,7 +78,7 @@ mod test { #[test] fn into_from() { - let scalar: ScalarRef = false.into(); - assert_eq!(scalar.as_ref().try_into(), Ok(false)); + let scalar: Scalar = false.into(); + assert_eq!(scalar.try_into(), Ok(false)); } } diff --git a/vortex-array/src/scalar/composite.rs b/vortex-array/src/scalar/composite.rs new file mode 100644 index 0000000000..7563897f5f --- /dev/null +++ b/vortex-array/src/scalar/composite.rs @@ -0,0 +1,40 @@ +use std::fmt::{Display, Formatter}; + +use crate::dtype::DType; +use crate::error::VortexResult; +use crate::scalar::Scalar; + +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub struct CompositeScalar { + dtype: DType, + scalar: Box, +} + +impl CompositeScalar { + pub fn new(dtype: DType, scalar: Box) -> Self { + Self { dtype, scalar } + } + + #[inline] + pub fn dtype(&self) -> &DType { + &self.dtype + } + + pub fn scalar(&self) -> &Scalar { + self.scalar.as_ref() + } + + pub fn cast(&self, _dtype: &DType) -> VortexResult { + todo!() + } + + pub fn nbytes(&self) -> usize { + self.scalar.nbytes() + } +} + +impl Display for CompositeScalar { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.scalar, self.dtype) + } +} diff --git a/vortex-array/src/scalar/equal.rs b/vortex-array/src/scalar/equal.rs deleted file mode 100644 index 8ce8a7f3a2..0000000000 --- a/vortex-array/src/scalar/equal.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::sync::Arc; - -use crate::scalar::localtime::LocalTimeScalar; -use crate::scalar::{ - BinaryScalar, BoolScalar, NullableScalar, PScalar, Scalar, ScalarRef, StructScalar, Utf8Scalar, -}; - -impl PartialEq for dyn Scalar { - fn eq(&self, that: &dyn Scalar) -> bool { - equal(self, that) - } -} - -impl PartialEq for Arc { - fn eq(&self, that: &dyn Scalar) -> bool { - equal(&**self, that) - } -} - -impl PartialEq for ScalarRef { - fn eq(&self, that: &dyn Scalar) -> bool { - equal(self.as_ref(), that) - } -} - -impl Eq for dyn Scalar {} - -macro_rules! dyn_eq { - ($ty:ty, $lhs:expr, $rhs:expr) => {{ - let lhs = $lhs.as_any().downcast_ref::<$ty>().unwrap(); - let rhs = $rhs.as_any().downcast_ref::<$ty>().unwrap(); - lhs == rhs - }}; -} - -fn equal(lhs: &dyn Scalar, rhs: &dyn Scalar) -> bool { - if lhs.dtype() != rhs.dtype() { - return false; - } - - // If the dtypes are the same then both of the scalars are either nullable or plain scalar - if let Some(ls) = lhs.as_any().downcast_ref::() { - if let Some(rs) = rhs.as_any().downcast_ref::() { - return dyn_eq!(NullableScalar, ls, rs); - } else { - unreachable!("DTypes were equal, but only one was nullable") - } - } - - use crate::dtype::DType::*; - match lhs.dtype() { - Bool(_) => dyn_eq!(BoolScalar, lhs, rhs), - Int(_, _, _) => dyn_eq!(PScalar, lhs, rhs), - Float(_, _) => dyn_eq!(PScalar, lhs, rhs), - Struct(..) => dyn_eq!(StructScalar, lhs, rhs), - Utf8(_) => dyn_eq!(Utf8Scalar, lhs, rhs), - Binary(_) => dyn_eq!(BinaryScalar, lhs, rhs), - LocalTime(_, _) => dyn_eq!(LocalTimeScalar, lhs, rhs), - _ => todo!("Equal not yet implemented for {:?} {:?}", lhs, rhs), - } -} diff --git a/vortex-array/src/scalar/list.rs b/vortex-array/src/scalar/list.rs index 9c3b2435be..012d401e82 100644 --- a/vortex-array/src/scalar/list.rs +++ b/vortex-array/src/scalar/list.rs @@ -1,142 +1,117 @@ -use std::any::Any; use std::fmt::{Display, Formatter}; use itertools::Itertools; -use crate::dtype::{DType, Nullability}; +use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; +use crate::scalar::Scalar; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct ListScalar { dtype: DType, - values: Vec, + values: Option>, } impl ListScalar { #[inline] - pub fn new(dtype: DType, values: Vec) -> Self { + pub fn new(dtype: DType, values: Option>) -> Self { Self { dtype, values } } #[inline] - pub fn values(&self) -> &[ScalarRef] { - &self.values - } -} - -impl Scalar for ListScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) + pub fn values(&self) -> Option<&[Scalar]> { + self.values.as_deref() } #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } - #[inline] - fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &DType { &self.dtype } - fn cast(&self, dtype: &DType) -> VortexResult { + pub fn cast(&self, dtype: &DType) -> VortexResult { match dtype { DType::List(field_dtype, n) => { - let new_fields: Vec = self - .values - .iter() - .map(|field| field.cast(field_dtype)) - .try_collect()?; - - let new_type = if new_fields.is_empty() { - dtype.clone() + let new_fields: Option> = self + .values() + .map(|v| v.iter().map(|field| field.cast(field_dtype)).try_collect()) + .transpose()?; + + let new_type = if let Some(nf) = new_fields.as_ref() { + if nf.is_empty() { + dtype.clone() + } else { + DType::List(Box::new(nf[0].dtype().clone()), *n) + } } else { - DType::List(Box::new(new_fields[0].dtype().clone()), *n) + dtype.clone() }; - let list_scalar = ListScalar::new(new_type, new_fields).boxed(); - match n { - Nullability::NonNullable => Ok(list_scalar), - Nullability::Nullable => Ok(NullableScalar::some(list_scalar).boxed()), - } + Ok(ListScalar::new(new_type, new_fields).into()) } _ => Err(VortexError::InvalidDType(dtype.clone())), } } - fn nbytes(&self) -> usize { - self.values.iter().map(|s| s.nbytes()).sum() + pub fn nbytes(&self) -> usize { + self.values() + .map(|v| v.iter().map(|s| s.nbytes()).sum()) + .unwrap_or(0) } } #[derive(Debug, Clone, PartialEq)] pub struct ListScalarVec(pub Vec); -impl> From> for ScalarRef { +impl> From> for Scalar { fn from(value: ListScalarVec) -> Self { - let values: Vec = value.0.into_iter().map(|v| v.into()).collect(); + let values: Vec = value.0.into_iter().map(|v| v.into()).collect(); if values.is_empty() { panic!("Can't implicitly convert empty list into ListScalar"); } - ListScalar::new(values[0].dtype().clone(), values).boxed() + ListScalar::new(values[0].dtype().clone(), Some(values)).into() } } -impl> TryFrom<&dyn Scalar> for ListScalarVec { +impl> TryFrom for ListScalarVec { type Error = VortexError; - fn try_from(value: &dyn Scalar) -> Result { - if let Some(list_s) = value.as_any().downcast_ref::() { - Ok(ListScalarVec( - list_s - .values - .clone() - .into_iter() - .map(|v| v.try_into()) - .try_collect()?, - )) + fn try_from(value: Scalar) -> Result { + if let Scalar::List(ls) = value { + if let Some(vs) = ls.values { + Ok(ListScalarVec( + vs.into_iter().map(|v| v.try_into()).try_collect()?, + )) + } else { + Err(VortexError::InvalidDType(ls.dtype().clone())) + } } else { Err(VortexError::InvalidDType(value.dtype().clone())) } } } -impl> TryFrom for ListScalarVec { +impl<'a, T: TryFrom<&'a Scalar, Error = VortexError>> TryFrom<&'a Scalar> for ListScalarVec { type Error = VortexError; - fn try_from(value: ScalarRef) -> Result { - let value_dtype = value.dtype().clone(); - let list_s = value - .into_any() - .downcast::() - .map_err(|_| VortexError::InvalidDType(value_dtype))?; - - Ok(ListScalarVec( - list_s - .values - .into_iter() - .map(|v| v.try_into()) - .try_collect()?, - )) + fn try_from(value: &'a Scalar) -> Result { + if let Scalar::List(ls) = value { + if let Some(vs) = ls.values() { + Ok(ListScalarVec( + vs.iter().map(|v| v.try_into()).try_collect()?, + )) + } else { + Err(VortexError::InvalidDType(ls.dtype().clone())) + } + } else { + Err(VortexError::InvalidDType(value.dtype().clone())) + } } } impl Display for ListScalar { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.values.iter().format(", ")) + match self.values() { + None => write!(f, ""), + Some(vs) => write!(f, "{}", vs.iter().format(", ")), + } } } diff --git a/vortex-array/src/scalar/localtime.rs b/vortex-array/src/scalar/localtime.rs deleted file mode 100644 index 16a533bfdf..0000000000 --- a/vortex-array/src/scalar/localtime.rs +++ /dev/null @@ -1,91 +0,0 @@ -use crate::dtype::{DType, Nullability, TimeUnit}; -use crate::error::VortexResult; -use crate::scalar::{PScalar, Scalar, ScalarRef}; -use std::any::Any; -use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; - -#[derive(Debug, Clone, PartialEq)] -pub struct LocalTimeScalar { - value: PScalar, - dtype: DType, -} - -impl LocalTimeScalar { - pub fn new(value: PScalar, unit: TimeUnit) -> Self { - Self { - value, - dtype: DType::LocalTime(unit, Nullability::NonNullable), - } - } - - pub fn value(&self) -> &PScalar { - &self.value - } - - pub fn time_unit(&self) -> TimeUnit { - let DType::LocalTime(u, _) = self.dtype else { - unreachable!("unexpected dtype") - }; - u - } -} - -impl Scalar for LocalTimeScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - fn cast(&self, _dtype: &DType) -> VortexResult { - todo!() - } - - fn nbytes(&self) -> usize { - self.value.nbytes() - } -} - -impl PartialOrd for LocalTimeScalar { - fn partial_cmp(&self, other: &Self) -> Option { - if self.dtype() != other.dtype() { - None - } else { - self.value.partial_cmp(&other.value) - } - } -} - -impl Display for LocalTimeScalar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let DType::LocalTime(u, _) = self.dtype() else { - unreachable!() - }; - write!(f, "localtime[{}, unit={}]", self.value, u) - } -} diff --git a/vortex-array/src/scalar/mod.rs b/vortex-array/src/scalar/mod.rs index ecfb0dc771..9351941731 100644 --- a/vortex-array/src/scalar/mod.rs +++ b/vortex-array/src/scalar/mod.rs @@ -1,57 +1,138 @@ -use std::any::Any; -use std::fmt::{Debug, Display}; +use std::fmt::{Debug, Display, Formatter}; pub use binary::*; pub use bool::*; +pub use composite::*; pub use list::*; -pub use localtime::*; pub use null::*; -pub use nullable::*; pub use primitive::*; pub use serde::*; pub use struct_::*; pub use utf8::*; -use crate::dtype::DType; +use crate::dtype::{DType, FloatWidth, IntWidth, Signedness}; use crate::error::VortexResult; -use crate::ptype::NativePType; +use crate::ptype::{NativePType, PType}; -mod arrow; mod binary; mod bool; -mod equal; +mod composite; mod list; -mod localtime; mod null; -mod nullable; -mod ord; mod primitive; mod serde; mod struct_; mod utf8; -pub type ScalarRef = Box; - -pub trait Scalar: Display + Debug + dyn_clone::DynClone + Send + Sync + 'static { - fn as_any(&self) -> &dyn Any; - - fn into_any(self: Box) -> Box; +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum Scalar { + Binary(BinaryScalar), + Bool(BoolScalar), + List(ListScalar), + Null(NullScalar), + Primitive(PrimitiveScalar), + Struct(StructScalar), + Utf8(Utf8Scalar), + Composite(CompositeScalar), +} - fn as_nonnull(&self) -> Option<&dyn Scalar>; +macro_rules! impls_for_scalars { + ($variant:tt, $E:ty) => { + impl From<$E> for Scalar { + fn from(arr: $E) -> Self { + Self::$variant(arr) + } + } + }; +} - fn into_nonnull(self: Box) -> Option; +impls_for_scalars!(Binary, BinaryScalar); +impls_for_scalars!(Bool, BoolScalar); +impls_for_scalars!(List, ListScalar); +impls_for_scalars!(Null, NullScalar); +impls_for_scalars!(Primitive, PrimitiveScalar); +impls_for_scalars!(Struct, StructScalar); +impls_for_scalars!(Utf8, Utf8Scalar); +impls_for_scalars!(Composite, CompositeScalar); + +macro_rules! match_each_scalar { + ($self:expr, | $_:tt $scalar:ident | $($body:tt)*) => ({ + macro_rules! __with_scalar__ {( $_ $scalar:ident ) => ( $($body)* )} + match $self { + Scalar::Binary(s) => __with_scalar__! { s }, + Scalar::Bool(s) => __with_scalar__! { s }, + Scalar::List(s) => __with_scalar__! { s }, + Scalar::Null(s) => __with_scalar__! { s }, + Scalar::Primitive(s) => __with_scalar__! { s }, + Scalar::Struct(s) => __with_scalar__! { s }, + Scalar::Utf8(s) => __with_scalar__! { s }, + Scalar::Composite(s) => __with_scalar__! { s }, + } + }) +} - fn boxed(self) -> ScalarRef; +impl Scalar { + pub fn dtype(&self) -> &DType { + match_each_scalar! { self, |$s| $s.dtype() } + } - /// the logical type. - fn dtype(&self) -> &DType; + pub fn cast(&self, dtype: &DType) -> VortexResult { + match_each_scalar! { self, |$s| $s.cast(dtype) } + } - fn cast(&self, dtype: &DType) -> VortexResult; + pub fn nbytes(&self) -> usize { + match_each_scalar! { self, |$s| $s.nbytes() } + } - fn nbytes(&self) -> usize; + pub fn null(dtype: &DType) -> Self { + match dtype { + DType::Null => NullScalar::new().into(), + DType::Bool(_) => BoolScalar::new(None).into(), + DType::Int(w, s, _) => match (w, s) { + (IntWidth::Unknown, Signedness::Unknown | Signedness::Signed) => { + PrimitiveScalar::none(PType::I64).into() + } + (IntWidth::_8, Signedness::Unknown | Signedness::Signed) => { + PrimitiveScalar::none(PType::I8).into() + } + (IntWidth::_16, Signedness::Unknown | Signedness::Signed) => { + PrimitiveScalar::none(PType::I16).into() + } + (IntWidth::_32, Signedness::Unknown | Signedness::Signed) => { + PrimitiveScalar::none(PType::I32).into() + } + (IntWidth::_64, Signedness::Unknown | Signedness::Signed) => { + PrimitiveScalar::none(PType::I64).into() + } + (IntWidth::Unknown, Signedness::Unsigned) => { + PrimitiveScalar::none(PType::U64).into() + } + (IntWidth::_8, Signedness::Unsigned) => PrimitiveScalar::none(PType::U8).into(), + (IntWidth::_16, Signedness::Unsigned) => PrimitiveScalar::none(PType::U16).into(), + (IntWidth::_32, Signedness::Unsigned) => PrimitiveScalar::none(PType::U32).into(), + (IntWidth::_64, Signedness::Unsigned) => PrimitiveScalar::none(PType::U64).into(), + }, + DType::Decimal(_, _, _) => unimplemented!("DecimalScalar"), + DType::Float(w, _) => match w { + FloatWidth::Unknown => PrimitiveScalar::none(PType::F64).into(), + FloatWidth::_16 => PrimitiveScalar::none(PType::F16).into(), + FloatWidth::_32 => PrimitiveScalar::none(PType::F32).into(), + FloatWidth::_64 => PrimitiveScalar::none(PType::F64).into(), + }, + DType::Utf8(_) => Utf8Scalar::new(None).into(), + DType::Binary(_) => BinaryScalar::new(None).into(), + DType::Struct(_, _) => StructScalar::new(dtype.clone(), vec![]).into(), + DType::List(_, _) => ListScalar::new(dtype.clone(), None).into(), + DType::Composite(_, _, _) => unimplemented!("CompositeScalar"), + } + } } -dyn_clone::clone_trait_object!(Scalar); +impl Display for Scalar { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match_each_scalar! { self, |$s| Display::fmt($s, f) } + } +} /// Allows conversion from Enc scalars to a byte slice. pub trait AsBytes { @@ -82,3 +163,15 @@ impl AsBytes for T { unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of::()) } } } + +#[cfg(test)] +mod test { + use std::mem; + + use crate::scalar::Scalar; + + #[test] + fn size_of() { + assert_eq!(mem::size_of::(), 88); + } +} diff --git a/vortex-array/src/scalar/null.rs b/vortex-array/src/scalar/null.rs index 6d28622992..817dbecccd 100644 --- a/vortex-array/src/scalar/null.rs +++ b/vortex-array/src/scalar/null.rs @@ -1,11 +1,10 @@ -use std::any::Any; use std::fmt::{Display, Formatter}; use crate::dtype::DType; use crate::error::VortexResult; -use crate::scalar::{NullableScalar, Scalar, ScalarRef}; +use crate::scalar::Scalar; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct NullScalar; impl Default for NullScalar { @@ -19,44 +18,17 @@ impl NullScalar { pub fn new() -> Self { Self {} } -} - -impl Scalar for NullScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - None - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - None - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } #[inline] - fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &DType { &DType::Null } - fn cast(&self, dtype: &DType) -> VortexResult { - Ok(NullableScalar::none(dtype.clone()).boxed()) + pub fn cast(&self, _dtype: &DType) -> VortexResult { + todo!() } - fn nbytes(&self) -> usize { + pub fn nbytes(&self) -> usize { 1 } } diff --git a/vortex-array/src/scalar/nullable.rs b/vortex-array/src/scalar/nullable.rs deleted file mode 100644 index 92815f7667..0000000000 --- a/vortex-array/src/scalar/nullable.rs +++ /dev/null @@ -1,162 +0,0 @@ -use std::any::Any; -use std::fmt::{Display, Formatter}; -use std::mem::size_of; - -use crate::dtype::DType; -use crate::error::{VortexError, VortexResult}; -use crate::scalar::{NullScalar, Scalar, ScalarRef}; - -#[derive(Debug, Clone, PartialEq, PartialOrd)] -pub enum NullableScalar { - None(DType), - Some(ScalarRef, DType), -} - -impl NullableScalar { - pub fn some(scalar: ScalarRef) -> Self { - let dtype = scalar.dtype().as_nullable(); - Self::Some(scalar, dtype) - } - - pub fn none(dtype: DType) -> Self { - Self::None(dtype.as_nullable()) - } -} - -impl Scalar for NullableScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - match self { - Self::Some(s, _) => Some(s.as_ref()), - Self::None(_) => None, - } - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - match *self { - Self::Some(s, _) => Some(s), - Self::None(_) => None, - } - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } - - #[inline] - fn dtype(&self) -> &DType { - match self { - Self::Some(_, dtype) => dtype, - Self::None(dtype) => dtype, - } - } - - fn cast(&self, dtype: &DType) -> VortexResult { - match self { - Self::Some(s, _dt) => { - if dtype.is_nullable() { - Ok(Self::Some(s.cast(&dtype.as_nonnullable())?, dtype.clone()).boxed()) - } else { - s.cast(&dtype.as_nonnullable()) - } - } - Self::None(_dt) => { - if dtype.is_nullable() { - Ok(Self::None(dtype.clone()).boxed()) - } else { - Err(VortexError::InvalidDType(dtype.clone())) - } - } - } - } - - fn nbytes(&self) -> usize { - match self { - NullableScalar::Some(s, _) => s.nbytes() + size_of::(), - NullableScalar::None(_) => size_of::(), - } - } -} - -impl Display for NullableScalar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - NullableScalar::Some(p, _) => write!(f, "{}?", p), - NullableScalar::None(_) => write!(f, "null"), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub struct NullableScalarOption(pub Option); - -impl> From> for ScalarRef { - fn from(value: NullableScalarOption) -> Self { - match value.0 { - // TODO(robert): This should return NullableScalar::None - // but that's not possible with some type that holds the associated dtype - // We need to change the bound of T to be able to get datatype from it. - None => NullScalar::new().boxed(), - Some(v) => NullableScalar::some(v.into()).boxed(), - } - } -} - -impl> TryFrom<&dyn Scalar> for NullableScalarOption { - type Error = VortexError; - - fn try_from(value: &dyn Scalar) -> Result { - let Some(ns) = value.as_any().downcast_ref::() else { - return Err(VortexError::InvalidDType(value.dtype().clone())); - }; - - Ok(NullableScalarOption(match ns { - NullableScalar::None(_) => None, - NullableScalar::Some(v, _) => Some(v.clone().try_into()?), - })) - } -} - -impl> TryFrom for NullableScalarOption { - type Error = VortexError; - - fn try_from(value: ScalarRef) -> Result { - let dtype = value.dtype().clone(); - let ns = value - .into_any() - .downcast::() - .map_err(|_| VortexError::InvalidDType(dtype))?; - - Ok(NullableScalarOption(match *ns { - NullableScalar::None(_) => None, - NullableScalar::Some(v, _) => Some(v.try_into()?), - })) - } -} - -#[cfg(test)] -mod tests { - use crate::dtype::DType; - use crate::ptype::PType; - use crate::scalar::Scalar; - - #[test] - fn test_nullable_scalar_option() { - let ns: Box = Some(10i16).into(); - let nsi32 = ns.cast(&DType::from(PType::I32)).unwrap(); - let v: i32 = nsi32.try_into().unwrap(); - assert_eq!(v, 10); - } -} diff --git a/vortex-array/src/scalar/ord.rs b/vortex-array/src/scalar/ord.rs deleted file mode 100644 index cee8b2a4e0..0000000000 --- a/vortex-array/src/scalar/ord.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::scalar::{ - BinaryScalar, BoolScalar, LocalTimeScalar, NullableScalar, PScalar, Scalar, ScalarRef, - StructScalar, Utf8Scalar, -}; -use std::cmp::Ordering; -use std::sync::Arc; -macro_rules! dyn_ord { - ($ty:ty, $lhs:expr, $rhs:expr) => {{ - let lhs = $lhs.as_any().downcast_ref::<$ty>().unwrap(); - let rhs = $rhs.as_any().downcast_ref::<$ty>().unwrap(); - if lhs < rhs { - Ordering::Less - } else if lhs == rhs { - Ordering::Equal - } else { - Ordering::Greater - } - }}; -} - -fn cmp(lhs: &dyn Scalar, rhs: &dyn Scalar) -> Option { - if lhs.dtype() != rhs.dtype() { - return None; - } - - // If the dtypes are the same then both of the scalars are either nullable or plain scalar - if let Some(ls) = lhs.as_any().downcast_ref::() { - if let Some(rs) = rhs.as_any().downcast_ref::() { - return Some(dyn_ord!(NullableScalar, ls, rs)); - } else { - unreachable!("DTypes were equal, but only one was nullable") - } - } - - use crate::dtype::DType::*; - Some(match lhs.dtype() { - Bool(_) => dyn_ord!(BoolScalar, lhs, rhs), - Int(_, _, _) => dyn_ord!(PScalar, lhs, rhs), - Float(_, _) => dyn_ord!(PScalar, lhs, rhs), - Struct(..) => dyn_ord!(StructScalar, lhs, rhs), - Utf8(_) => dyn_ord!(Utf8Scalar, lhs, rhs), - Binary(_) => dyn_ord!(BinaryScalar, lhs, rhs), - LocalTime(_, _) => dyn_ord!(LocalTimeScalar, lhs, rhs), - _ => todo!("Cmp not yet implemented for {:?} {:?}", lhs, rhs), - }) -} - -impl PartialOrd for dyn Scalar { - fn partial_cmp(&self, that: &Self) -> Option { - cmp(self, that) - } -} - -impl PartialOrd for ScalarRef { - fn partial_cmp(&self, that: &dyn Scalar) -> Option { - cmp(self.as_ref(), that) - } -} - -impl PartialOrd for Arc { - fn partial_cmp(&self, that: &dyn Scalar) -> Option { - cmp(&**self, that) - } -} diff --git a/vortex-array/src/scalar/primitive.rs b/vortex-array/src/scalar/primitive.rs index 7e2c170308..bf526eb169 100644 --- a/vortex-array/src/scalar/primitive.rs +++ b/vortex-array/src/scalar/primitive.rs @@ -1,15 +1,97 @@ -use std::any::Any; use std::fmt::{Display, Formatter}; use std::mem::size_of; use half::f16; -use crate::dtype::{DType, Nullability}; +use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; use crate::ptype::{NativePType, PType}; -use crate::scalar::{LocalTimeScalar, NullableScalar, Scalar, ScalarRef}; +use crate::scalar::composite::CompositeScalar; +use crate::scalar::Scalar; #[derive(Debug, Clone, PartialEq, PartialOrd)] +pub struct PrimitiveScalar { + ptype: PType, + value: Option, + exponent: u8, +} + +impl PrimitiveScalar { + pub fn new(ptype: PType, value: Option) -> Self { + Self { + ptype, + value, + exponent: 0, + } + } + + pub fn some(value: PScalar) -> Self { + Self { + ptype: value.ptype(), + value: Some(value), + exponent: 0, + } + } + + pub fn none(ptype: PType) -> Self { + Self { + ptype, + value: None, + exponent: 0, + } + } + + #[inline] + pub fn value(&self) -> Option { + self.value + } + + #[inline] + pub fn factor(&self) -> u8 { + self.exponent + } + + #[inline] + pub fn ptype(&self) -> PType { + self.ptype + } + + #[inline] + pub fn dtype(&self) -> &DType { + self.ptype.into() + } + + pub fn cast(&self, dtype: &DType) -> VortexResult { + let ptype: VortexResult = dtype.try_into(); + ptype + .and_then(|p| match self.value() { + None => Ok(PrimitiveScalar::none(p).into()), + Some(ps) => ps.cast_ptype(p), + }) + .or_else(|_| self.cast_dtype(dtype)) + } + + // General conversion function that handles casting primitive scalar to non-primitive. + // TODO(robert): Implement storage conversions + fn cast_dtype(&self, dtype: &DType) -> VortexResult { + Ok(CompositeScalar::new(dtype.clone(), Box::new(self.clone().into())).into()) + } + + pub fn nbytes(&self) -> usize { + size_of::() + } +} + +impl Display for PrimitiveScalar { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.value() { + None => write!(f, "({}?)", self.ptype), + Some(v) => write!(f, "{}({})", v, self.ptype), + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)] pub enum PScalar { U8(u8), U16(u16), @@ -41,30 +123,7 @@ impl PScalar { } } - // General conversion function that handles casting primitive scalar to non primitive. - // If target dtype can be converted to ptype you should use cast_ptype. - pub fn cast_dtype(&self, dtype: DType) -> VortexResult { - macro_rules! from_int { - ($dtype:ident , $ps:ident) => { - match $dtype { - DType::LocalTime(w, Nullability::NonNullable) => { - Ok(LocalTimeScalar::new($ps.clone(), w.clone()).boxed()) - } - _ => Err(VortexError::InvalidDType($dtype.clone())), - } - }; - } - - match self { - p @ PScalar::U32(_) - | p @ PScalar::U64(_) - | p @ PScalar::I32(_) - | p @ PScalar::I64(_) => from_int!(dtype, p), - _ => Err(VortexError::InvalidDType(dtype.clone())), - } - } - - pub fn cast_ptype(&self, ptype: PType) -> VortexResult { + pub fn cast_ptype(&self, ptype: PType) -> VortexResult { macro_rules! from_int { ($ptype:ident , $v:ident) => { match $ptype { @@ -120,49 +179,6 @@ fn is_negative(value: T) -> bool { value < T::default() } -impl Scalar for PScalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } - - #[inline] - fn dtype(&self) -> &DType { - self.ptype().into() - } - - fn cast(&self, dtype: &DType) -> VortexResult { - let ptype: VortexResult = dtype.try_into(); - ptype - .and_then(|p| self.cast_ptype(p)) - .or_else(|_| self.cast_dtype(dtype.clone())) - } - - fn nbytes(&self) -> usize { - size_of::() - } -} - macro_rules! pscalar { ($T:ty, $ptype:tt) => { impl From<$T> for PScalar { @@ -171,35 +187,42 @@ macro_rules! pscalar { } } - impl From<$T> for ScalarRef { + impl From<$T> for Scalar { fn from(value: $T) -> Self { - PScalar::from(value).boxed() + PrimitiveScalar::some(PScalar::from(value)).into() } } - impl TryFrom for $T { + impl TryFrom<&Scalar> for $T { type Error = VortexError; - #[inline] - fn try_from(value: ScalarRef) -> VortexResult { - value.as_ref().try_into() + fn try_from(value: &Scalar) -> VortexResult { + match value { + Scalar::Primitive(PrimitiveScalar { + value: Some(pscalar), + .. + }) => match pscalar { + PScalar::$ptype(v) => Ok(*v), + _ => Err(VortexError::InvalidDType(pscalar.ptype().into())), + }, + _ => Err(VortexError::InvalidDType(value.dtype().clone())), + } } } - impl TryFrom<&dyn Scalar> for $T { + impl TryFrom for $T { type Error = VortexError; - fn try_from(value: &dyn Scalar) -> VortexResult { - if let Some(pscalar) = value - .as_nonnull() - .and_then(|v| v.as_any().downcast_ref::()) - { - match pscalar { - PScalar::$ptype(v) => Ok(*v), + fn try_from(value: Scalar) -> VortexResult { + match value { + Scalar::Primitive(PrimitiveScalar { + value: Some(pscalar), + .. + }) => match pscalar { + PScalar::$ptype(v) => Ok(v), _ => Err(VortexError::InvalidDType(pscalar.ptype().into())), - } - } else { - Err(VortexError::InvalidDType(value.dtype().clone())) + }, + _ => Err(VortexError::InvalidDType(value.dtype().clone())), } } } @@ -218,34 +241,62 @@ pscalar!(f16, F16); pscalar!(f32, F32); pscalar!(f64, F64); -impl From> for ScalarRef { +impl From> for Scalar { fn from(value: Option) -> Self { match value { Some(value) => value.into(), - None => Box::new(NullableScalar::None(DType::from(T::PTYPE))), + None => PrimitiveScalar::new(T::PTYPE, None).into(), } } } -impl From for ScalarRef { +impl From for Scalar { #[inline] fn from(value: usize) -> Self { - PScalar::U64(value as u64).boxed() + PrimitiveScalar::new(PType::U64, Some(PScalar::U64(value as u64))).into() } } -impl TryFrom for usize { +impl TryFrom for usize { type Error = VortexError; - fn try_from(value: ScalarRef) -> VortexResult { - value.as_ref().try_into() + fn try_from(value: Scalar) -> VortexResult { + macro_rules! match_each_pscalar_integer { + ($self:expr, | $_:tt $pscalar:ident | $($body:tt)*) => ({ + macro_rules! __with_pscalar__ {( $_ $pscalar:ident ) => ( $($body)* )} + match $self { + PScalar::U8(v) => __with_pscalar__! { v }, + PScalar::U16(v) => __with_pscalar__! { v }, + PScalar::U32(v) => __with_pscalar__! { v }, + PScalar::U64(v) => __with_pscalar__! { v }, + PScalar::I8(v) => __with_pscalar__! { v }, + PScalar::I16(v) => __with_pscalar__! { v }, + PScalar::I32(v) => __with_pscalar__! { v }, + PScalar::I64(v) => __with_pscalar__! { v }, + _ => Err(VortexError::InvalidDType($self.ptype().into())), + } + }) + } + + match value { + Scalar::Primitive(PrimitiveScalar { + value: Some(pscalar), + .. + }) => match_each_pscalar_integer!(pscalar, |$V| { + if is_negative($V) { + return Err(VortexError::ComputeError("required positive integer".into())); + } + Ok($V as usize) + }), + _ => Err(VortexError::InvalidDType(value.dtype().clone())), + } } } -impl TryFrom<&dyn Scalar> for usize { +impl TryFrom<&Scalar> for usize { type Error = VortexError; - fn try_from(value: &dyn Scalar) -> VortexResult { + fn try_from(value: &Scalar) -> VortexResult { macro_rules! match_each_pscalar_integer { ($self:expr, | $_:tt $pscalar:ident | $($body:tt)*) => ({ macro_rules! __with_pscalar__ {( $_ $pscalar:ident ) => ( $($body)* )} @@ -263,18 +314,17 @@ impl TryFrom<&dyn Scalar> for usize { }) } - if let Some(pscalar) = value - .as_nonnull() - .and_then(|v| v.as_any().downcast_ref::()) - { - match_each_pscalar_integer!(pscalar, |$V| { + match value { + Scalar::Primitive(PrimitiveScalar { + value: Some(pscalar), + .. + }) => match_each_pscalar_integer!(pscalar, |$V| { if is_negative(*$V) { return Err(VortexError::ComputeError("required positive integer".into())); } Ok(*$V as usize) - }) - } else { - Err(VortexError::InvalidDType(value.dtype().clone())) + }), + _ => Err(VortexError::InvalidDType(value.dtype().clone())), } } } @@ -302,18 +352,18 @@ mod test { use crate::dtype::{DType, IntWidth, Nullability, Signedness}; use crate::error::VortexError; use crate::ptype::PType; - use crate::scalar::ScalarRef; + use crate::scalar::Scalar; #[test] fn into_from() { - let scalar: ScalarRef = 10u16.into(); - assert_eq!(scalar.as_ref().try_into(), Ok(10u16)); + let scalar: Scalar = 10u16.into(); + assert_eq!(scalar.clone().try_into(), Ok(10u16)); // All integers should be convertible to usize - assert_eq!(scalar.as_ref().try_into(), Ok(10usize)); + assert_eq!(scalar.try_into(), Ok(10usize)); - let scalar: ScalarRef = (-10i16).into(); + let scalar: Scalar = (-10i16).into(); assert_eq!( - scalar.as_ref().try_into(), + scalar.try_into(), Err::(VortexError::ComputeError( "required positive integer".into() )) @@ -322,7 +372,7 @@ mod test { #[test] fn cast() { - let scalar: ScalarRef = 10u16.into(); + let scalar: Scalar = 10u16.into(); let u32_scalar = scalar .cast(&DType::Int( IntWidth::_32, diff --git a/vortex-array/src/scalar/serde.rs b/vortex-array/src/scalar/serde.rs index 68012d9050..8b14a36cee 100644 --- a/vortex-array/src/scalar/serde.rs +++ b/vortex-array/src/scalar/serde.rs @@ -1,130 +1,135 @@ use std::io; -use std::io::{ErrorKind, Read}; +use std::sync::Arc; use half::f16; use num_enum::{IntoPrimitive, TryFromPrimitive}; -use crate::dtype::{DType, FloatWidth, IntWidth, Signedness, TimeUnit}; +use crate::dtype::DType; +use crate::error::VortexResult; +use crate::ptype::PType; +use crate::scalar::composite::CompositeScalar; use crate::scalar::{ - BinaryScalar, BoolScalar, ListScalar, LocalTimeScalar, NullScalar, NullableScalar, PScalar, - Scalar, ScalarRef, StructScalar, Utf8Scalar, + BinaryScalar, BoolScalar, ListScalar, NullScalar, PScalar, PrimitiveScalar, Scalar, + StructScalar, Utf8Scalar, }; -use crate::serde::{DTypeReader, TimeUnitTag, WriteCtx}; +use crate::serde::{ReadCtx, WriteCtx}; -pub struct ScalarReader<'a> { - reader: &'a mut dyn Read, +pub struct ScalarReader<'a, 'b> { + reader: &'b mut ReadCtx<'a>, } -impl<'a> ScalarReader<'a> { - pub fn new(reader: &'a mut dyn Read) -> Self { +impl<'a, 'b> ScalarReader<'a, 'b> { + pub fn new(reader: &'b mut ReadCtx<'a>) -> Self { Self { reader } } - fn read_nbytes(&mut self) -> io::Result<[u8; N]> { - let mut bytes: [u8; N] = [0; N]; - self.reader.read_exact(&mut bytes)?; - Ok(bytes) - } - - pub fn read(&mut self) -> io::Result { - let tag = ScalarTag::try_from(self.read_nbytes::<1>()?[0]) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; + pub fn read(&mut self) -> VortexResult { + let tag = ScalarTag::try_from(self.reader.read_nbytes::<1>()?[0]) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; match tag { ScalarTag::Binary => { - let len = leb128::read::unsigned(self.reader) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - let mut value = Vec::::with_capacity(len as usize); - self.reader.take(len).read_to_end(&mut value)?; - Ok(BinaryScalar::new(value).boxed()) - } - ScalarTag::Bool => Ok(BoolScalar::new(self.read_nbytes::<1>()?[0] != 0).boxed()), - ScalarTag::F16 => { - Ok(PScalar::F16(f16::from_le_bytes(self.read_nbytes::<2>()?)).boxed()) - } - ScalarTag::F32 => { - Ok(PScalar::F32(f32::from_le_bytes(self.read_nbytes::<4>()?)).boxed()) - } - ScalarTag::F64 => { - Ok(PScalar::F64(f64::from_le_bytes(self.read_nbytes::<8>()?)).boxed()) - } - ScalarTag::I16 => { - Ok(PScalar::I16(i16::from_le_bytes(self.read_nbytes::<2>()?)).boxed()) - } - ScalarTag::I32 => { - Ok(PScalar::I32(i32::from_le_bytes(self.read_nbytes::<4>()?)).boxed()) + let slice = self.reader.read_optional_slice()?; + Ok(BinaryScalar::new(slice).into()) } - ScalarTag::I64 => { - Ok(PScalar::I64(i64::from_le_bytes(self.read_nbytes::<8>()?)).boxed()) - } - ScalarTag::I8 => Ok(PScalar::I8(i8::from_le_bytes(self.read_nbytes::<1>()?)).boxed()), - ScalarTag::List => { - let elems = leb128::read::unsigned(self.reader) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - if elems == 0 { - let dtype = DTypeReader::new(self.reader).read()?; - Ok(ListScalar::new(dtype, Vec::new()).boxed()) + ScalarTag::Bool => { + let is_present = self.reader.read_option_tag()?; + if is_present { + Ok(BoolScalar::some(self.reader.read_nbytes::<1>()?[0] != 0).into()) } else { - let mut values = Vec::::with_capacity(elems as usize); - for value in values.iter_mut() { - *value = self.read()?; - } - Ok(ListScalar::new(values[0].dtype().clone(), values).boxed()) + Ok(BoolScalar::none().into()) } } - ScalarTag::LocalTime => { - let pscalar = self - .read()? - .into_any() - .downcast::() - .map_err(|_e| io::Error::new(ErrorKind::InvalidData, "invalid scalar"))?; - let time_unit = TimeUnitTag::try_from(self.read_nbytes::<1>()?[0]) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) - .map(TimeUnit::from)?; - - Ok(LocalTimeScalar::new(*pscalar, time_unit).boxed()) - } - ScalarTag::Null => Ok(NullScalar::new().boxed()), - ScalarTag::Nullable => { - let tag = self.read_nbytes::<1>()?[0]; - match tag { - 0x00 => Ok(NullableScalar::none(DTypeReader::new(self.reader).read()?).boxed()), - 0x01 => Ok(NullableScalar::some(self.read()?).boxed()), - _ => Err(io::Error::new( - ErrorKind::InvalidData, - "Invalid NullableScalar tag", - )), + ScalarTag::PrimitiveS => self.read_primitive_scalar().map(|p| p.into()), + ScalarTag::List => { + let is_present = self.reader.read_option_tag()?; + if is_present { + let elems = self.reader.read_usize()?; + let mut values = Vec::with_capacity(elems); + for _ in 0..elems { + values.push(self.read()?); + } + Ok(ListScalar::new(values[0].dtype().clone(), Some(values)).into()) + } else { + Ok(ListScalar::new(self.reader.dtype()?, None).into()) } } + ScalarTag::Null => Ok(NullScalar::new().into()), ScalarTag::Struct => { - let dtype = DTypeReader::new(self.reader).read()?; - let DType::Struct(ns, _fs) = &dtype else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid dtype")); - }; - let mut values = Vec::::with_capacity(ns.len()); - for value in values.iter_mut() { - *value = self.read()?; + let field_num = self.reader.read_usize()?; + let mut names = Vec::with_capacity(field_num); + for _ in 0..field_num { + names.push(Arc::new( + self.reader + .read_slice() + .map(|v| unsafe { String::from_utf8_unchecked(v) })?, + )); } - Ok(StructScalar::new(dtype, values).boxed()) - } - ScalarTag::U16 => { - Ok(PScalar::U16(u16::from_le_bytes(self.read_nbytes::<2>()?)).boxed()) - } - ScalarTag::U32 => { - Ok(PScalar::U32(u32::from_le_bytes(self.read_nbytes::<4>()?)).boxed()) - } - ScalarTag::U64 => { - Ok(PScalar::U64(u64::from_le_bytes(self.read_nbytes::<8>()?)).boxed()) + let mut values = Vec::with_capacity(field_num); + for _ in 0..field_num { + values.push(self.read()?); + } + let dtypes = values.iter().map(|s| s.dtype().clone()).collect::>(); + Ok(StructScalar::new(DType::Struct(names, dtypes), values).into()) } - ScalarTag::U8 => Ok(PScalar::U8(u8::from_le_bytes(self.read_nbytes::<1>()?)).boxed()), ScalarTag::Utf8 => { - let len = leb128::read::unsigned(self.reader) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - let mut value = Vec::::with_capacity(len as usize); - self.reader.take(len).read_to_end(&mut value)?; - Ok(Utf8Scalar::new(unsafe { String::from_utf8_unchecked(value) }).boxed()) + let value = self.reader.read_optional_slice()?; + Ok( + Utf8Scalar::new(value.map(|v| unsafe { String::from_utf8_unchecked(v) })) + .into(), + ) + } + ScalarTag::Composite => { + let dtype = self.reader.dtype()?; + let scalar = self.read()?; + Ok(CompositeScalar::new(dtype, Box::new(scalar)).into()) } } } + + fn read_primitive_scalar(&mut self) -> VortexResult { + let ptype = self.reader.ptype()?; + let is_present = self.reader.read_option_tag()?; + if is_present { + let pscalar = match ptype { + PType::U8 => PrimitiveScalar::some(PScalar::U8(u8::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::U16 => PrimitiveScalar::some(PScalar::U16(u16::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::U32 => PrimitiveScalar::some(PScalar::U32(u32::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::U64 => PrimitiveScalar::some(PScalar::U64(u64::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::I8 => PrimitiveScalar::some(PScalar::I8(i8::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::I16 => PrimitiveScalar::some(PScalar::I16(i16::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::I32 => PrimitiveScalar::some(PScalar::I32(i32::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::I64 => PrimitiveScalar::some(PScalar::I64(i64::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::F16 => PrimitiveScalar::some(PScalar::F16(f16::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::F32 => PrimitiveScalar::some(PScalar::F32(f32::from_le_bytes( + self.reader.read_nbytes()?, + ))), + PType::F64 => PrimitiveScalar::some(PScalar::F64(f64::from_le_bytes( + self.reader.read_nbytes()?, + ))), + }; + Ok(pscalar) + } else { + Ok(PrimitiveScalar::none(ptype)) + } + } } pub struct ScalarWriter<'a, 'b> { @@ -136,127 +141,72 @@ impl<'a, 'b> ScalarWriter<'a, 'b> { Self { writer } } - pub fn write(&mut self, scalar: &dyn Scalar) -> io::Result<()> { - let tag = ScalarTag::from(scalar); - self.writer.write_fixed_slice([tag.into()])?; - match tag { - ScalarTag::Binary => { - let binary = scalar.as_any().downcast_ref::().unwrap(); - self.writer.write_slice(binary.value().as_slice()) - } - ScalarTag::Bool => self.writer.write_fixed_slice([scalar - .as_any() - .downcast_ref::() - .unwrap() - .value() as u8]), - ScalarTag::F16 => { - let PScalar::F16(f) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(f.to_le_bytes()) - } - ScalarTag::F32 => { - let PScalar::F32(f) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(f.to_le_bytes()) - } - ScalarTag::F64 => { - let PScalar::F64(f) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(f.to_le_bytes()) - } - ScalarTag::I16 => { - let PScalar::I16(i) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(i.to_le_bytes()) - } - ScalarTag::I32 => { - let PScalar::I32(i) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(i.to_le_bytes()) - } - ScalarTag::I64 => { - let PScalar::I64(i) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(i.to_le_bytes()) - } - ScalarTag::I8 => { - let PScalar::I8(i) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(i.to_le_bytes()) + pub fn write(&mut self, scalar: &Scalar) -> VortexResult<()> { + self.writer + .write_fixed_slice([ScalarTag::from(scalar).into()])?; + match scalar { + Scalar::Binary(b) => self.writer.write_optional_slice(b.value()), + Scalar::Bool(b) => { + self.writer.write_option_tag(b.value().is_some())?; + if let Some(v) = b.value() { + self.writer.write_fixed_slice([v as u8])?; + } + Ok(()) } - ScalarTag::List => { - let ls = scalar.as_any().downcast_ref::().unwrap(); - self.writer.write_usize(ls.values().len())?; - if ls.values().is_empty() { - self.writer.dtype(ls.dtype())?; - Ok(()) - } else { - for elem in ls.values() { - self.write(elem.as_ref())?; + Scalar::List(ls) => { + self.writer.write_option_tag(ls.values().is_some())?; + if let Some(vs) = ls.values() { + self.writer.write_usize(vs.len())?; + for elem in vs { + self.write(elem)?; } - Ok(()) + } else { + self.writer.dtype(ls.dtype())?; } + Ok(()) } - ScalarTag::LocalTime => { - let lt = scalar.as_any().downcast_ref::().unwrap(); - self.write(lt.value())?; - self.writer - .write_fixed_slice([TimeUnitTag::from(lt.time_unit()).into()]) - } - ScalarTag::Null => Ok(()), - ScalarTag::Nullable => { - let ns = scalar.as_any().downcast_ref::().unwrap(); - self.writer - .write_option_tag(matches!(ns, NullableScalar::Some(_, _)))?; - match ns { - NullableScalar::None(d) => self.writer.dtype(d), - NullableScalar::Some(s, _) => self.write(s.as_ref()), + Scalar::Null(_) => Ok(()), + Scalar::Primitive(p) => self.write_primitive_scalar(p), + Scalar::Struct(s) => { + let names = s.names(); + self.writer.write_usize(names.len())?; + for n in names { + self.writer.write_slice(n.as_bytes())?; } - } - ScalarTag::Struct => { - let s = scalar.as_any().downcast_ref::().unwrap(); - self.writer.dtype(s.dtype())?; for field in s.values() { - self.write(field.as_ref())?; + self.write(field)?; } Ok(()) } - ScalarTag::U16 => { - let PScalar::U16(u) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(u.to_le_bytes()) - } - ScalarTag::U32 => { - let PScalar::U32(u) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(u.to_le_bytes()) - } - ScalarTag::U64 => { - let PScalar::U64(u) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(u.to_le_bytes()) - } - ScalarTag::U8 => { - let PScalar::U8(u) = scalar.as_any().downcast_ref::().unwrap() else { - return Err(io::Error::new(ErrorKind::InvalidData, "invalid scalar")); - }; - self.writer.write_fixed_slice(u.to_le_bytes()) + Scalar::Utf8(u) => self + .writer + .write_optional_slice(u.value().map(|s| s.as_bytes())), + Scalar::Composite(c) => { + self.writer.dtype(c.dtype())?; + self.write(c.scalar()) } - ScalarTag::Utf8 => { - let utf8 = scalar.as_any().downcast_ref::().unwrap(); - self.writer.write_slice(utf8.value().as_bytes()) + } + } + + fn write_primitive_scalar(&mut self, scalar: &PrimitiveScalar) -> VortexResult<()> { + self.writer.ptype(scalar.ptype())?; + self.writer.write_option_tag(scalar.value().is_some())?; + if let Some(ps) = scalar.value() { + match ps { + PScalar::F16(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, + PScalar::F32(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, + PScalar::F64(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, + PScalar::I16(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, + PScalar::I32(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, + PScalar::I64(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, + PScalar::I8(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, + PScalar::U16(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, + PScalar::U32(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, + PScalar::U64(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, + PScalar::U8(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, } } + Ok(()) } } @@ -265,62 +215,26 @@ impl<'a, 'b> ScalarWriter<'a, 'b> { enum ScalarTag { Binary, Bool, - F16, - F32, - F64, - I16, - I32, - I64, - I8, List, - LocalTime, Null, - Nullable, + // TODO(robert): rename to primitive once we stop using enum for serialization + PrimitiveS, Struct, - U16, - U32, - U64, - U8, Utf8, + Composite, } -impl From<&dyn Scalar> for ScalarTag { - fn from(value: &dyn Scalar) -> Self { - if value.dtype().is_nullable() { - return ScalarTag::Nullable; - } - - match value.dtype() { - DType::Null => ScalarTag::Null, - DType::Bool(_) => ScalarTag::Bool, - DType::Int(w, s, _) => match (w, s) { - (IntWidth::Unknown, Signedness::Unknown | Signedness::Signed) => ScalarTag::I64, - (IntWidth::_8, Signedness::Unknown | Signedness::Signed) => ScalarTag::I8, - (IntWidth::_16, Signedness::Unknown | Signedness::Signed) => ScalarTag::I16, - (IntWidth::_32, Signedness::Unknown | Signedness::Signed) => ScalarTag::I32, - (IntWidth::_64, Signedness::Unknown | Signedness::Signed) => ScalarTag::I64, - (IntWidth::Unknown, Signedness::Unsigned) => ScalarTag::U64, - (IntWidth::_8, Signedness::Unsigned) => ScalarTag::U8, - (IntWidth::_16, Signedness::Unsigned) => ScalarTag::U16, - (IntWidth::_32, Signedness::Unsigned) => ScalarTag::U32, - (IntWidth::_64, Signedness::Unsigned) => ScalarTag::U64, - }, - DType::Decimal(_, _, _) => unimplemented!("decimal scalar"), - DType::Float(w, _) => match w { - FloatWidth::Unknown => ScalarTag::F64, - FloatWidth::_16 => ScalarTag::F16, - FloatWidth::_32 => ScalarTag::F32, - FloatWidth::_64 => ScalarTag::F64, - }, - DType::Utf8(_) => ScalarTag::Utf8, - DType::Binary(_) => ScalarTag::Binary, - DType::LocalTime(_, _) => ScalarTag::LocalTime, - DType::LocalDate(_) => unimplemented!("local date"), - DType::Instant(_, _) => unimplemented!("instant scalar"), - DType::ZonedDateTime(_, _) => unimplemented!("zoned date time scalar"), - DType::Struct(_, _) => ScalarTag::Struct, - DType::List(_, _) => ScalarTag::List, - DType::Map(_, _, _) => unimplemented!("map scalar"), +impl From<&Scalar> for ScalarTag { + fn from(value: &Scalar) -> Self { + match value { + Scalar::Binary(_) => ScalarTag::Binary, + Scalar::Bool(_) => ScalarTag::Bool, + Scalar::List(_) => ScalarTag::List, + Scalar::Null(_) => ScalarTag::Null, + Scalar::Primitive(_) => ScalarTag::PrimitiveS, + Scalar::Struct(_) => ScalarTag::Struct, + Scalar::Utf8(_) => ScalarTag::Utf8, + Scalar::Composite(_) => ScalarTag::Composite, } } } diff --git a/vortex-array/src/scalar/struct_.rs b/vortex-array/src/scalar/struct_.rs index b000099f5a..f097304523 100644 --- a/vortex-array/src/scalar/struct_.rs +++ b/vortex-array/src/scalar/struct_.rs @@ -1,70 +1,50 @@ -use std::any::Any; use std::cmp::Ordering; use std::fmt::{Display, Formatter}; +use std::sync::Arc; use itertools::Itertools; use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{Scalar, ScalarRef}; +use crate::scalar::Scalar; #[derive(Debug, Clone, PartialEq)] pub struct StructScalar { dtype: DType, - values: Vec, + values: Vec, } impl StructScalar { #[inline] - pub fn new(dtype: DType, values: Vec) -> Self { + pub fn new(dtype: DType, values: Vec) -> Self { Self { dtype, values } } #[inline] - pub fn values(&self) -> &[ScalarRef] { - &self.values + pub fn values(&self) -> &[Scalar] { + self.values.as_ref() } -} -impl Scalar for StructScalar { #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) - } - - #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) + pub fn dtype(&self) -> &DType { + &self.dtype } - #[inline] - fn dtype(&self) -> &DType { - &self.dtype + pub fn names(&self) -> &[Arc] { + let DType::Struct(ns, _) = self.dtype() else { + unreachable!("Not a scalar dtype"); + }; + ns.as_slice() } - fn cast(&self, dtype: &DType) -> VortexResult { + pub fn cast(&self, dtype: &DType) -> VortexResult { match dtype { DType::Struct(names, field_dtypes) => { if field_dtypes.len() != self.values.len() { return Err(VortexError::InvalidDType(dtype.clone())); } - let new_fields: Vec = self + let new_fields: Vec = self .values .iter() .zip_eq(field_dtypes.iter()) @@ -75,14 +55,14 @@ impl Scalar for StructScalar { names.clone(), new_fields.iter().map(|x| x.dtype().clone()).collect(), ); - Ok(StructScalar::new(new_type, new_fields).boxed()) + Ok(StructScalar::new(new_type, new_fields).into()) } _ => Err(VortexError::InvalidDType(dtype.clone())), } } - fn nbytes(&self) -> usize { - self.values.iter().map(|s| s.nbytes()).sum() + pub fn nbytes(&self) -> usize { + self.values().iter().map(|s| s.nbytes()).sum() } } diff --git a/vortex-array/src/scalar/utf8.rs b/vortex-array/src/scalar/utf8.rs index 95109016d8..5153ab5511 100644 --- a/vortex-array/src/scalar/utf8.rs +++ b/vortex-array/src/scalar/utf8.rs @@ -1,103 +1,82 @@ -use std::any::Any; use std::fmt::{Display, Formatter}; use crate::dtype::{DType, Nullability}; use crate::error::{VortexError, VortexResult}; -use crate::scalar::{Scalar, ScalarRef}; +use crate::scalar::Scalar; #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Utf8Scalar { - value: String, + value: Option, } impl Utf8Scalar { - pub fn new(value: String) -> Self { + pub fn new(value: Option) -> Self { Self { value } } - pub fn value(&self) -> &str { - self.value.as_str() - } -} - -impl Scalar for Utf8Scalar { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn as_nonnull(&self) -> Option<&dyn Scalar> { - Some(self) + pub fn value(&self) -> Option<&str> { + self.value.as_deref() } #[inline] - fn into_nonnull(self: Box) -> Option { - Some(self) - } - - #[inline] - fn boxed(self) -> ScalarRef { - Box::new(self) - } - - #[inline] - fn dtype(&self) -> &DType { + pub fn dtype(&self) -> &DType { &DType::Utf8(Nullability::NonNullable) } - fn cast(&self, _dtype: &DType) -> VortexResult { + pub fn cast(&self, _dtype: &DType) -> VortexResult { todo!() } - fn nbytes(&self) -> usize { - self.value.len() + pub fn nbytes(&self) -> usize { + self.value().map(|v| v.len()).unwrap_or(0) } } -impl From for ScalarRef { +impl From for Scalar { fn from(value: String) -> Self { - Utf8Scalar::new(value).boxed() + Utf8Scalar::new(Some(value)).into() } } -impl From<&str> for ScalarRef { +impl From<&str> for Scalar { fn from(value: &str) -> Self { - Utf8Scalar::new(value.to_string()).boxed() + Utf8Scalar::new(Some(value.to_string())).into() } } -impl TryFrom for String { +impl TryFrom for String { type Error = VortexError; - fn try_from(value: ScalarRef) -> Result { - let dtype = value.dtype().clone(); - let scalar = value - .into_any() - .downcast::() - .map_err(|_| VortexError::InvalidDType(dtype))?; - Ok(scalar.value) + fn try_from(value: Scalar) -> Result { + let Scalar::Utf8(u) = value else { + return Err(VortexError::InvalidDType(value.dtype().clone())); + }; + match u.value { + None => Err(VortexError::InvalidDType(u.dtype().clone())), + Some(s) => Ok(s), + } } } -impl TryFrom<&dyn Scalar> for String { +impl TryFrom<&Scalar> for String { type Error = VortexError; - fn try_from(value: &dyn Scalar) -> Result { - if let Some(scalar) = value.as_any().downcast_ref::() { - Ok(scalar.value().to_string()) - } else { - Err(VortexError::InvalidDType(value.dtype().clone())) + fn try_from(value: &Scalar) -> Result { + let Scalar::Utf8(u) = value else { + return Err(VortexError::InvalidDType(value.dtype().clone())); + }; + match u.value() { + None => Err(VortexError::InvalidDType(u.dtype().clone())), + Some(s) => Ok(s.to_string()), } } } impl Display for Utf8Scalar { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.value) + match self.value() { + None => write!(f, ""), + Some(v) => Display::fmt(v, f), + } } } diff --git a/vortex-array/src/serde/dtype.rs b/vortex-array/src/serde/dtype.rs index 19caca7e11..d53793f016 100644 --- a/vortex-array/src/serde/dtype.rs +++ b/vortex-array/src/serde/dtype.rs @@ -1,11 +1,12 @@ -use std::io; -use std::io::{ErrorKind, Read}; +use leb128::read::Error; +use std::io::Read; use std::sync::Arc; use num_enum::{IntoPrimitive, TryFromPrimitive}; use crate::dtype::DType::*; -use crate::dtype::{DType, FloatWidth, IntWidth, Nullability, Signedness, TimeUnit}; +use crate::dtype::{DType, FloatWidth, IntWidth, Nullability, Signedness}; +use crate::error::{VortexError, VortexResult}; use crate::serde::WriteCtx; pub struct DTypeReader<'a> { @@ -17,15 +18,34 @@ impl<'a> DTypeReader<'a> { Self { reader } } - fn read_byte(&mut self) -> io::Result { - let mut buf: [u8; 1] = [0; 1]; - self.reader.read_exact(&mut buf)?; - Ok(buf[0]) + fn read_nbytes(&mut self) -> VortexResult<[u8; N]> { + let mut bytes: [u8; N] = [0; N]; + self.reader.read_exact(&mut bytes)?; + Ok(bytes) } - pub fn read(&mut self) -> io::Result { - let dtype = DTypeTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidInput, e))?; + fn read_usize(&mut self) -> VortexResult { + leb128::read::unsigned(self.reader) + .map_err(|e| match e { + Error::IoError(io_err) => io_err.into(), + Error::Overflow => VortexError::InvalidArgument("overflow".into()), + }) + .map(|u| u as usize) + } + + fn read_slice(&mut self) -> VortexResult> { + let len = self.read_usize()?; + let mut slice = Vec::with_capacity(len); + self.reader + .take(len as u64) + .read_to_end(&mut slice) + .map_err(VortexError::from)?; + Ok(slice) + } + + pub fn read(&mut self) -> VortexResult { + let dtype = DTypeTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|_| VortexError::InvalidArgument("Failed to parse dtype tag".into()))?; match dtype { DTypeTag::Null => Ok(Null), DTypeTag::Bool => Ok(Bool(self.read_nullability()?)), @@ -45,89 +65,63 @@ impl<'a> DTypeReader<'a> { DTypeTag::Binary => Ok(Binary(self.read_nullability()?)), DTypeTag::Decimal => { let nullability = self.read_nullability()?; - let mut precision_scale: [u8; 2] = [0; 2]; - self.reader.read_exact(&mut precision_scale)?; + let precision_scale: [u8; 2] = self.read_nbytes()?; Ok(Decimal( precision_scale[0], precision_scale[1] as i8, nullability, )) } - DTypeTag::LocalTime => { - let nullability = self.read_nullability()?; - Ok(LocalTime(self.read_time_unit()?, nullability)) - } - DTypeTag::LocalDate => Ok(LocalDate(self.read_nullability()?)), - DTypeTag::Instant => { - let nullability = self.read_nullability()?; - Ok(Instant(self.read_time_unit()?, nullability)) - } - DTypeTag::ZonedDateTime => { - let nullability = self.read_nullability()?; - Ok(ZonedDateTime(self.read_time_unit()?, nullability)) - } DTypeTag::List => { let nullability = self.read_nullability()?; Ok(List(Box::new(self.read()?), nullability)) } - DTypeTag::Map => { - let nullability = self.read_nullability()?; - Ok(Map( - Box::new(self.read()?), - Box::new(self.read()?), - nullability, - )) - } DTypeTag::Struct => { - let field_num = leb128::read::unsigned(self.reader) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - let mut names = Vec::>::with_capacity(field_num as usize); + let field_num = self.read_usize()?; + let mut names = Vec::with_capacity(field_num); for _ in 0..field_num { - let len = leb128::read::unsigned(self.reader) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - let mut name = String::with_capacity(len as usize); - self.reader.take(len).read_to_string(&mut name)?; + let name = unsafe { String::from_utf8_unchecked(self.read_slice()?) }; names.push(Arc::new(name)); } - let mut fields = Vec::::with_capacity(field_num as usize); + let mut fields = Vec::with_capacity(field_num); for _ in 0..field_num { fields.push(self.read()?); } Ok(Struct(names, fields)) } + DTypeTag::Composite => { + let name = unsafe { String::from_utf8_unchecked(self.read_slice()?) }; + let dtype = self.read()?; + let metadata = self.read_slice()?; + Ok(Composite(Arc::new(name), Box::new(dtype), metadata)) + } } } - fn read_signedness(&mut self) -> io::Result { - SignednessTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) + fn read_signedness(&mut self) -> VortexResult { + SignednessTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|_| VortexError::InvalidArgument("Failed to parse signedness tag".into())) .map(Signedness::from) } - fn read_nullability(&mut self) -> io::Result { - NullabilityTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) + fn read_nullability(&mut self) -> VortexResult { + NullabilityTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|_| VortexError::InvalidArgument("Failed to parse nullability tag".into())) .map(Nullability::from) } - fn read_int_width(&mut self) -> io::Result { - IntWidthTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) + fn read_int_width(&mut self) -> VortexResult { + IntWidthTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|_| VortexError::InvalidArgument("Failed to parse int width tag".into())) .map(IntWidth::from) } - fn read_float_width(&mut self) -> io::Result { - FloatWidthTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) + fn read_float_width(&mut self) -> VortexResult { + FloatWidthTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|_| VortexError::InvalidArgument("Failed to parse float width tag".into())) .map(FloatWidth::from) } - - fn read_time_unit(&mut self) -> io::Result { - TimeUnitTag::try_from(self.read_byte()?) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) - .map(TimeUnit::from) - } } pub struct DTypeWriter<'a, 'b> { @@ -139,7 +133,7 @@ impl<'a, 'b> DTypeWriter<'a, 'b> { Self { writer } } - pub fn write(&mut self, dtype: &DType) -> io::Result<()> { + pub fn write(&mut self, dtype: &DType) -> VortexResult<()> { self.writer .write_fixed_slice([DTypeTag::from(dtype).into()])?; match dtype { @@ -160,19 +154,6 @@ impl<'a, 'b> DTypeWriter<'a, 'b> { } Utf8(n) => self.write_nullability(*n)?, Binary(n) => self.write_nullability(*n)?, - LocalTime(u, n) => { - self.write_nullability(*n)?; - self.write_time_unit(*u)? - } - LocalDate(n) => self.write_nullability(*n)?, - Instant(u, n) => { - self.write_nullability(*n)?; - self.write_time_unit(*u)? - } - ZonedDateTime(u, n) => { - self.write_nullability(*n)?; - self.write_time_unit(*u)? - } Struct(ns, fs) => { self.writer.write_usize(ns.len())?; for name in ns { @@ -186,40 +167,35 @@ impl<'a, 'b> DTypeWriter<'a, 'b> { self.write_nullability(*n)?; self.write(e.as_ref())? } - Map(k, v, n) => { - self.write_nullability(*n)?; - self.write(k.as_ref())?; - self.write(v.as_ref())? + Composite(n, d, m) => { + self.writer.write_slice(n.as_bytes())?; + self.writer.dtype(d)?; + self.writer.write_slice(m)? } } Ok(()) } - fn write_signedness(&mut self, signedness: Signedness) -> io::Result<()> { + fn write_signedness(&mut self, signedness: Signedness) -> VortexResult<()> { self.writer .write_fixed_slice([SignednessTag::from(signedness).into()]) } - fn write_nullability(&mut self, nullability: Nullability) -> io::Result<()> { + fn write_nullability(&mut self, nullability: Nullability) -> VortexResult<()> { self.writer .write_fixed_slice([NullabilityTag::from(nullability).into()]) } - fn write_int_width(&mut self, int_width: IntWidth) -> io::Result<()> { + fn write_int_width(&mut self, int_width: IntWidth) -> VortexResult<()> { self.writer .write_fixed_slice([IntWidthTag::from(int_width).into()]) } - fn write_float_width(&mut self, float_width: FloatWidth) -> io::Result<()> { + fn write_float_width(&mut self, float_width: FloatWidth) -> VortexResult<()> { self.writer .write_fixed_slice([FloatWidthTag::from(float_width).into()]) } - - fn write_time_unit(&mut self, time_unit: TimeUnit) -> io::Result<()> { - self.writer - .write_fixed_slice([TimeUnitTag::from(time_unit).into()]) - } } #[derive(IntoPrimitive, TryFromPrimitive)] @@ -232,13 +208,9 @@ enum DTypeTag { Utf8, Binary, Decimal, - LocalTime, - LocalDate, - Instant, - ZonedDateTime, List, - Map, Struct, + Composite, } impl From<&DType> for DTypeTag { @@ -251,13 +223,9 @@ impl From<&DType> for DTypeTag { Utf8(_) => DTypeTag::Utf8, Binary(_) => DTypeTag::Binary, Decimal(_, _, _) => DTypeTag::Decimal, - LocalTime(_, _) => DTypeTag::LocalTime, - LocalDate(_) => DTypeTag::LocalDate, - Instant(_, _) => DTypeTag::Instant, - ZonedDateTime(_, _) => DTypeTag::ZonedDateTime, List(_, _) => DTypeTag::List, - Map(_, _, _) => DTypeTag::Map, Struct(_, _) => DTypeTag::Struct, + Composite(_, _, _) => DTypeTag::Composite, } } } @@ -390,39 +358,6 @@ impl From for IntWidth { } } -#[derive(IntoPrimitive, TryFromPrimitive)] -#[repr(u8)] -pub enum TimeUnitTag { - Ns, - Us, - Ms, - S, -} - -impl From for TimeUnitTag { - fn from(value: TimeUnit) -> Self { - use TimeUnit::*; - match value { - Ns => TimeUnitTag::Ns, - Us => TimeUnitTag::Us, - Ms => TimeUnitTag::Ms, - S => TimeUnitTag::S, - } - } -} - -impl From for TimeUnit { - fn from(value: TimeUnitTag) -> Self { - use TimeUnit::*; - match value { - TimeUnitTag::Ns => Ns, - TimeUnitTag::Us => Us, - TimeUnitTag::Ms => Ms, - TimeUnitTag::S => S, - } - } -} - #[cfg(test)] mod test { use crate::dtype::DType::Int; diff --git a/vortex-array/src/serde/mod.rs b/vortex-array/src/serde/mod.rs index a2bc1d517c..c5d300b0c9 100644 --- a/vortex-array/src/serde/mod.rs +++ b/vortex-array/src/serde/mod.rs @@ -1,21 +1,25 @@ use std::io; use std::io::{ErrorKind, Read, Write}; -use arrow::buffer::{Buffer, MutableBuffer}; +use arrow_buffer::buffer::{Buffer, MutableBuffer}; use crate::array::{Array, ArrayRef, EncodingId, ENCODINGS}; use crate::dtype::{DType, IntWidth, Nullability, Signedness}; -use crate::scalar::{Scalar, ScalarReader, ScalarRef, ScalarWriter}; -pub use crate::serde::dtype::{DTypeReader, DTypeWriter, TimeUnitTag}; +use crate::error::{VortexError, VortexResult}; +use crate::ptype::PType; +use crate::scalar::{Scalar, ScalarReader, ScalarWriter}; +pub use crate::serde::dtype::{DTypeReader, DTypeWriter}; +use crate::serde::ptype::PTypeTag; mod dtype; +mod ptype; pub trait ArraySerde { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()>; + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()>; } pub trait EncodingSerde { - fn read(&self, ctx: &mut ReadCtx) -> io::Result; + fn read(&self, ctx: &mut ReadCtx) -> VortexResult; } pub struct ReadCtx<'a> { @@ -66,16 +70,27 @@ impl<'a> ReadCtx<'a> { } #[inline] - pub fn dtype(&mut self) -> io::Result { + pub fn dtype(&mut self) -> VortexResult { DTypeReader::new(self.r).read() } + pub fn ptype(&mut self) -> VortexResult { + let typetag = PTypeTag::try_from(self.read_nbytes::<1>()?[0]) + .map_err(|e| io::Error::new(ErrorKind::InvalidInput, e))?; + Ok(typetag.into()) + } + #[inline] - pub fn scalar(&mut self) -> io::Result { - ScalarReader::new(self.r).read() + pub fn scalar(&mut self) -> VortexResult { + ScalarReader::new(self).read() + } + + pub fn read_optional_slice(&mut self) -> VortexResult>> { + let is_present = self.read_option_tag()?; + is_present.then(|| self.read_slice()).transpose() } - pub fn read_slice(&mut self) -> io::Result> { + pub fn read_slice(&mut self) -> VortexResult> { let len = self.read_usize()?; let mut data = Vec::::with_capacity(len); self.r.take(len as u64).read_to_end(&mut data)?; @@ -85,7 +100,7 @@ impl<'a> ReadCtx<'a> { pub fn read_buffer usize>( &mut self, byte_len: F, - ) -> io::Result<(usize, Buffer)> { + ) -> VortexResult<(usize, Buffer)> { let logical_len = self.read_usize()?; let buffer_len = byte_len(logical_len); let mut buffer = MutableBuffer::from_len_zeroed(buffer_len); @@ -93,25 +108,25 @@ impl<'a> ReadCtx<'a> { Ok((logical_len, buffer.into())) } - pub fn read_nbytes(&mut self) -> io::Result<[u8; N]> { + pub fn read_nbytes(&mut self) -> VortexResult<[u8; N]> { let mut bytes: [u8; N] = [0; N]; self.r.read_exact(&mut bytes)?; Ok(bytes) } - pub fn read_usize(&mut self) -> io::Result { + pub fn read_usize(&mut self) -> VortexResult { leb128::read::unsigned(self.r) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) + .map_err(|_| VortexError::InvalidArgument("Failed to parse leb128 usize".into())) .map(|u| u as usize) } - pub fn read_option_tag(&mut self) -> io::Result { + pub fn read_option_tag(&mut self) -> VortexResult { let mut tag = [0; 1]; self.r.read_exact(&mut tag)?; Ok(tag[0] == 0x01) } - pub fn read_optional_array(&mut self) -> io::Result> { + pub fn read_optional_array(&mut self) -> VortexResult> { if self.read_option_tag()? { self.read().map(Some) } else { @@ -119,7 +134,7 @@ impl<'a> ReadCtx<'a> { } } - pub fn read(&mut self) -> io::Result { + pub fn read(&mut self) -> VortexResult { let encoding_id = self.read_usize()?; if let Some(serde) = ENCODINGS .iter() @@ -129,7 +144,9 @@ impl<'a> ReadCtx<'a> { { serde.read(self) } else { - Err(io::Error::new(ErrorKind::InvalidData, "unknown encoding")) + Err(VortexError::InvalidArgument( + "Failed to recognize encoding ID".into(), + )) } } } @@ -148,37 +165,54 @@ impl<'a> WriteCtx<'a> { } } - pub fn dtype(&mut self, dtype: &DType) -> io::Result<()> { + pub fn dtype(&mut self, dtype: &DType) -> VortexResult<()> { DTypeWriter::new(self).write(dtype) } - pub fn scalar(&mut self, scalar: &dyn Scalar) -> io::Result<()> { + pub fn ptype(&mut self, ptype: PType) -> VortexResult<()> { + self.write_fixed_slice([PTypeTag::from(ptype).into()]) + } + + pub fn scalar(&mut self, scalar: &Scalar) -> VortexResult<()> { ScalarWriter::new(self).write(scalar) } - pub fn write_usize(&mut self, u: usize) -> io::Result<()> { - leb128::write::unsigned(self.w, u as u64).map(|_| ()) + pub fn write_usize(&mut self, u: usize) -> VortexResult<()> { + leb128::write::unsigned(self.w, u as u64) + .map_err(|_| VortexError::InvalidArgument("Failed to write leb128 usize".into())) + .map(|_| ()) } - pub fn write_fixed_slice(&mut self, slice: [u8; N]) -> io::Result<()> { - self.w.write_all(&slice) + pub fn write_fixed_slice(&mut self, slice: [u8; N]) -> VortexResult<()> { + self.w.write_all(&slice).map_err(|e| e.into()) } - pub fn write_slice(&mut self, slice: &[u8]) -> io::Result<()> { + pub fn write_slice(&mut self, slice: &[u8]) -> VortexResult<()> { self.write_usize(slice.len())?; - self.w.write_all(slice) + self.w.write_all(slice).map_err(|e| e.into()) } - pub fn write_buffer(&mut self, logical_len: usize, buf: &Buffer) -> io::Result<()> { + pub fn write_optional_slice(&mut self, slice: Option<&[u8]>) -> VortexResult<()> { + self.write_option_tag(slice.is_some())?; + if let Some(s) = slice { + self.write_slice(s) + } else { + Ok(()) + } + } + + pub fn write_buffer(&mut self, logical_len: usize, buf: &Buffer) -> VortexResult<()> { self.write_usize(logical_len)?; - self.w.write_all(buf.as_slice()) + self.w.write_all(buf.as_slice()).map_err(|e| e.into()) } - pub fn write_option_tag(&mut self, present: bool) -> io::Result<()> { - self.w.write_all(&[if present { 0x01 } else { 0x00 }]) + pub fn write_option_tag(&mut self, present: bool) -> VortexResult<()> { + self.w + .write_all(&[if present { 0x01 } else { 0x00 }]) + .map_err(|e| e.into()) } - pub fn write_optional_array(&mut self, array: Option<&dyn Array>) -> io::Result<()> { + pub fn write_optional_array(&mut self, array: Option<&dyn Array>) -> VortexResult<()> { self.write_option_tag(array.is_some())?; if let Some(array) = array { self.write(array) @@ -187,25 +221,28 @@ impl<'a> WriteCtx<'a> { } } - pub fn write(&mut self, array: &dyn Array) -> io::Result<()> { + pub fn write(&mut self, array: &dyn Array) -> VortexResult<()> { let encoding_id = self .available_encodings .iter() .position(|e| e.name() == array.encoding().id().name()) .ok_or(io::Error::new(ErrorKind::InvalidInput, "unknown encoding"))?; self.write_usize(encoding_id)?; - array.serde().write(self) + array.serde().map(|s| s.write(self)).unwrap_or_else(|| { + Err(VortexError::InvalidArgument( + format!("Serialization not supported for {}", array.encoding().id()).into(), + )) + }) } } #[cfg(test)] pub mod test { - use std::io; - use crate::array::{Array, ArrayRef}; + use crate::error::VortexResult; use crate::serde::{ReadCtx, WriteCtx}; - pub fn roundtrip_array(array: &dyn Array) -> io::Result { + pub fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-array/src/serde/ptype.rs b/vortex-array/src/serde/ptype.rs new file mode 100644 index 0000000000..4481a6b3c0 --- /dev/null +++ b/vortex-array/src/serde/ptype.rs @@ -0,0 +1,55 @@ +use num_enum::{IntoPrimitive, TryFromPrimitive}; + +use crate::ptype::PType; + +#[derive(IntoPrimitive, TryFromPrimitive)] +#[repr(u8)] +pub enum PTypeTag { + U8, + U16, + U32, + U64, + I8, + I16, + I32, + I64, + F16, + F32, + F64, +} + +impl From for PTypeTag { + fn from(value: PType) -> Self { + match value { + PType::U8 => PTypeTag::U8, + PType::U16 => PTypeTag::U16, + PType::U32 => PTypeTag::U32, + PType::U64 => PTypeTag::U64, + PType::I8 => PTypeTag::I8, + PType::I16 => PTypeTag::I16, + PType::I32 => PTypeTag::I32, + PType::I64 => PTypeTag::I64, + PType::F16 => PTypeTag::F16, + PType::F32 => PTypeTag::F32, + PType::F64 => PTypeTag::F64, + } + } +} + +impl From for PType { + fn from(value: PTypeTag) -> Self { + match value { + PTypeTag::U8 => PType::U8, + PTypeTag::U16 => PType::U16, + PTypeTag::U32 => PType::U32, + PTypeTag::U64 => PType::U64, + PTypeTag::I8 => PType::I8, + PTypeTag::I16 => PType::I16, + PTypeTag::I32 => PType::I32, + PTypeTag::I64 => PType::I64, + PTypeTag::F16 => PType::F16, + PTypeTag::F32 => PType::F32, + PTypeTag::F64 => PType::F64, + } + } +} diff --git a/vortex-array/src/stats.rs b/vortex-array/src/stats.rs index 0ccf75a21e..6c94bbb1c3 100644 --- a/vortex-array/src/stats.rs +++ b/vortex-array/src/stats.rs @@ -3,16 +3,17 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::RwLock; -use crate::dtype::DType; use itertools::Itertools; +use crate::dtype::DType; use crate::error::{VortexError, VortexResult}; use crate::ptype::NativePType; -use crate::scalar::{ListScalarVec, ScalarRef}; +use crate::scalar::{ListScalarVec, Scalar}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum Stat { BitWidthFreq, + TrailingZeroFreq, IsConstant, IsSorted, IsStrictSorted, @@ -24,29 +25,29 @@ pub enum Stat { } #[derive(Debug, Clone, Default)] -pub struct StatsSet(HashMap); +pub struct StatsSet(HashMap); impl StatsSet { pub fn new() -> Self { StatsSet(HashMap::new()) } - pub fn from(map: HashMap) -> Self { + pub fn from(map: HashMap) -> Self { StatsSet(map) } - pub fn of(stat: Stat, value: ScalarRef) -> Self { + pub fn of(stat: Stat, value: Scalar) -> Self { StatsSet(HashMap::from([(stat, value)])) } - fn get_as>( + fn get_as>( &self, stat: &Stat, ) -> VortexResult> { self.0.get(stat).map(|v| T::try_from(v.clone())).transpose() } - pub fn set(&mut self, stat: Stat, value: ScalarRef) { + pub fn set(&mut self, stat: Stat, value: Scalar) { self.0.insert(stat, value); } @@ -68,7 +69,7 @@ impl StatsSet { match self.0.entry(Stat::Min) { Entry::Occupied(mut e) => { if let Some(omin) = other.0.get(&Stat::Min) { - match omin.partial_cmp(e.get().as_ref()) { + match omin.partial_cmp(e.get()) { None => { e.remove(); } @@ -91,7 +92,7 @@ impl StatsSet { match self.0.entry(Stat::Max) { Entry::Occupied(mut e) => { if let Some(omin) = other.0.get(&Stat::Max) { - match omin.partial_cmp(e.get().as_ref()) { + match omin.partial_cmp(e.get()) { None => { e.remove(); } @@ -148,7 +149,7 @@ impl StatsSet { match self.0.entry(stat.clone()) { Entry::Occupied(mut e) => { if let Some(other_value) = other.get_as::(stat).unwrap() { - let self_value: usize = e.get().as_ref().try_into().unwrap(); + let self_value: usize = e.get().try_into().unwrap(); e.insert((self_value + other_value).into()); } } @@ -168,7 +169,7 @@ impl StatsSet { .unwrap() { // TODO(robert): Avoid the copy here. We could e.get_mut() but need to figure out casting - let self_value: ListScalarVec = e.get().as_ref().try_into().unwrap(); + let self_value: ListScalarVec = e.get().try_into().unwrap(); e.insert( ListScalarVec( self_value @@ -195,7 +196,7 @@ impl StatsSet { match self.0.entry(Stat::RunCount) { Entry::Occupied(mut e) => { if let Some(other_value) = other.get_as::(&Stat::RunCount).unwrap() { - let self_value: usize = e.get().as_ref().try_into().unwrap(); + let self_value: usize = e.get().try_into().unwrap(); e.insert((self_value + other_value + 1).into()); } } @@ -232,7 +233,7 @@ impl<'a> Stats<'a> { }); } - pub fn set(&self, stat: Stat, value: ScalarRef) { + pub fn set(&self, stat: Stat, value: Scalar) { self.cache.write().unwrap().set(stat, value); } @@ -240,15 +241,15 @@ impl<'a> Stats<'a> { self.cache.read().unwrap().clone() } - pub fn get(&self, stat: &Stat) -> Option { + pub fn get(&self, stat: &Stat) -> Option { self.cache.read().unwrap().0.get(stat).cloned() } - pub fn get_as>(&self, stat: &Stat) -> Option { + pub fn get_as>(&self, stat: &Stat) -> Option { self.get(stat).map(|v| T::try_from(v).unwrap()) } - pub fn get_or_compute(&self, stat: &Stat) -> Option { + pub fn get_or_compute(&self, stat: &Stat) -> Option { if let Some(value) = self.cache.read().unwrap().0.get(stat) { return Some(value.clone()); } @@ -264,18 +265,18 @@ impl<'a> Stats<'a> { pub fn get_or_compute_cast(&self, stat: &Stat) -> Option { self.get_or_compute(stat) // TODO(ngates): fix the API so we don't convert the result to optional - .and_then(|v: ScalarRef| v.cast(&DType::from(T::PTYPE)).ok()) + .and_then(|v: Scalar| v.cast(&DType::from(T::PTYPE)).ok()) .and_then(|v| T::try_from(v).ok()) } - pub fn get_or_compute_as>( + pub fn get_or_compute_as>( &self, stat: &Stat, ) -> Option { self.get_or_compute(stat).and_then(|v| T::try_from(v).ok()) } - pub fn get_or_compute_or>( + pub fn get_or_compute_or>( &self, default: T, stat: &Stat, diff --git a/vortex-dict/src/compute.rs b/vortex-dict/src/compute.rs index 288068efe4..31eb6419dd 100644 --- a/vortex-dict/src/compute.rs +++ b/vortex-dict/src/compute.rs @@ -1,8 +1,9 @@ -use crate::DictArray; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::error::VortexResult; -use vortex::scalar::ScalarRef; +use vortex::scalar::Scalar; + +use crate::DictArray; impl ArrayCompute for DictArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -11,7 +12,7 @@ impl ArrayCompute for DictArray { } impl ScalarAtFn for DictArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { let dict_index: usize = scalar_at(self.codes(), index)?.try_into()?; scalar_at(self.dict(), dict_index) } diff --git a/vortex-dict/src/dict.rs b/vortex-dict/src/dict.rs index 0bacfe60cd..ff09dab4d8 100644 --- a/vortex-dict/src/dict.rs +++ b/vortex-dict/src/dict.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use vortex::array::{check_slice_bounds, Array, ArrayRef, ArrowIterator, Encoding, EncodingId}; +use vortex::array::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId}; use vortex::compress::EncodingCompression; use vortex::dtype::{DType, Signedness}; use vortex::error::{VortexError, VortexResult}; @@ -72,10 +72,6 @@ impl Array for DictArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - // TODO(robert): Add function to trim the dictionary fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -90,8 +86,8 @@ impl Array for DictArray { self.codes().nbytes() + self.dict().nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-dict/src/serde.rs b/vortex-dict/src/serde.rs index 9c09cd5e03..f1155db747 100644 --- a/vortex-dict/src/serde.rs +++ b/vortex-dict/src/serde.rs @@ -1,12 +1,11 @@ -use std::io; - use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{DictArray, DictEncoding}; impl ArraySerde for DictArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write(self.dict())?; // TODO(robert): Stop writing this ctx.dtype(self.codes().dtype())?; @@ -15,7 +14,7 @@ impl ArraySerde for DictArray { } impl EncodingSerde for DictEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let dict = ctx.read()?; let codes_dtype = ctx.dtype()?; let codes = ctx.with_schema(&codes_dtype).read()?; @@ -25,16 +24,15 @@ impl EncodingSerde for DictEncoding { #[cfg(test)] mod test { - use std::io; - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::{Array, ArrayRef}; + use vortex::error::VortexResult; use vortex::serde::{ReadCtx, WriteCtx}; use crate::downcast::DowncastDict; use crate::DictArray; - fn roundtrip_array(array: &dyn Array) -> io::Result { + fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-fastlanes/Cargo.toml b/vortex-fastlanes/Cargo.toml index 7f21bf95c2..37410f1014 100644 --- a/vortex-fastlanes/Cargo.toml +++ b/vortex-fastlanes/Cargo.toml @@ -24,4 +24,4 @@ fastlanez-sys = { path = "../fastlanez-sys" } log = "0.4.20" [dev-dependencies] -simplelog = { version = "0.12.1", features = ["paris"] } \ No newline at end of file +simplelog = { version = "0.12.1", features = ["paris"] } diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index 209680037e..c4b80cfabf 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -4,10 +4,14 @@ use fastlanez_sys::TryBitPack; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; -use vortex::array::{Array, ArrayRef}; +use vortex::array::{Array, ArrayRef, CloneOptionalArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::compute::cast::cast; +use vortex::compute::flatten::flatten_primitive; +use vortex::compute::patch::patch; use vortex::error::VortexResult; use vortex::match_each_integer_ptype; +use vortex::ptype::PType::{I16, I32, I64, I8, U16, U32, U64, U8}; use vortex::ptype::{NativePType, PType}; use vortex::scalar::ListScalarVec; use vortex::stats::Stat; @@ -134,10 +138,12 @@ fn bitpack_primitive(array: &[T], bit_width: usize) }); // Pad the last chunk with zeros to a full 1024 elements. - let last_chunk_size = array.len() - ((num_chunks - 1) * 1024); - let mut last_chunk: [T; 1024] = [T::default(); 1024]; - last_chunk[..last_chunk_size].copy_from_slice(&array[array.len() - last_chunk_size..]); - TryBitPack::try_bitpack_into(&last_chunk, bit_width, &mut output).unwrap(); + let last_chunk_size = array.len() % 1024; + if last_chunk_size > 0 { + let mut last_chunk: [T; 1024] = [T::default(); 1024]; + last_chunk[..last_chunk_size].copy_from_slice(&array[array.len() - last_chunk_size..]); + TryBitPack::try_bitpack_into(&last_chunk, bit_width, &mut output).unwrap(); + } output } @@ -165,6 +171,84 @@ fn bitpack_patches( }) } +pub fn bitunpack(array: &BitPackedArray) -> VortexResult { + let bit_width = array.bit_width(); + let length = array.len(); + let encoded = flatten_primitive(cast(array.encoded(), &PType::U8.into())?.as_ref())?; + let ptype: PType = array.dtype().try_into()?; + + let mut unpacked = match ptype { + I8 | U8 => PrimitiveArray::from_nullable( + bitunpack_primitive::(encoded.typed_data::(), bit_width, length), + array.validity().clone_optional(), + ), + I16 | U16 => PrimitiveArray::from_nullable( + bitunpack_primitive::(encoded.typed_data::(), bit_width, length), + array.validity().clone_optional(), + ), + I32 | U32 => PrimitiveArray::from_nullable( + bitunpack_primitive::(encoded.typed_data::(), bit_width, length), + array.validity().clone_optional(), + ), + I64 | U64 => PrimitiveArray::from_nullable( + bitunpack_primitive::(encoded.typed_data::(), bit_width, length), + array.validity().clone_optional(), + ), + _ => panic!("Unsupported ptype {:?}", ptype), + } + .boxed(); + + // Cast to signed if necessary + // TODO(ngates): do this more efficiently since we know it's a safe cast. unchecked_cast maybe? + if ptype.is_signed_int() { + unpacked = cast(unpacked.as_ref(), &ptype.into())? + } + + if let Some(patches) = array.patches() { + unpacked = patch(unpacked.as_ref(), patches)?; + } + + Ok(unpacked.as_primitive().clone()) +} + +fn bitunpack_primitive( + packed: &[u8], + bit_width: usize, + length: usize, +) -> Vec { + if bit_width == 0 { + return vec![T::default(); length]; + } + + // How many fastlanes vectors we will process. + let num_chunks = (length + 1023) / 1024; + + // Allocate a result vector. + let mut output = Vec::with_capacity(length); + + // Loop over all but the last chunk. + let bytes_per_chunk = 128 * bit_width; + (0..num_chunks - 1).for_each(|i| { + let chunk: &[u8] = &packed[i * bytes_per_chunk..][0..bytes_per_chunk]; + TryBitPack::try_bitunpack_into(chunk, bit_width, &mut output).unwrap(); + }); + + // Handle the final chunk which may contain padding. + let last_chunk_size = length % 1024; + if last_chunk_size > 0 { + let mut last_output = Vec::with_capacity(1024); + TryBitPack::try_bitunpack_into( + &packed[(num_chunks - 1) * bytes_per_chunk..], + bit_width, + &mut last_output, + ) + .unwrap(); + output.extend_from_slice(&last_output[..last_chunk_size]); + } + + output +} + /// Assuming exceptions cost 1 value + 1 u32 index, figure out the best bit-width to use. /// We could try to be clever, but we can never really predict how the exceptions will compress. fn best_bit_width(bit_width_freq: &[usize], bytes_per_exception: usize) -> usize { @@ -234,4 +318,17 @@ mod test { .unwrap(); assert_eq!(bp.bit_width(), 6); } + + #[test] + fn test_decompress() { + let cfg = CompressConfig::new(HashSet::from([BitPackedEncoding.id()]), HashSet::default()); + let ctx = CompressCtx::new(Arc::new(cfg)); + + let values = PrimitiveArray::from(Vec::from_iter((0..10_000).map(|i| (i % 63) as u8))); + let compressed = ctx.compress(&values, None).unwrap(); + assert_eq!(compressed.encoding().id(), BitPackedEncoding.id()); + + let decompressed = flatten_primitive(compressed.as_ref()).unwrap(); + assert_eq!(decompressed.typed_data::(), values.typed_data::()); + } } diff --git a/vortex-fastlanes/src/bitpacking/compute.rs b/vortex-fastlanes/src/bitpacking/compute.rs new file mode 100644 index 0000000000..9efbcc34fc --- /dev/null +++ b/vortex-fastlanes/src/bitpacking/compute.rs @@ -0,0 +1,17 @@ +use crate::bitpacking::compress::bitunpack; +use crate::BitPackedArray; +use vortex::compute::flatten::{FlattenFn, FlattenedArray}; +use vortex::compute::ArrayCompute; +use vortex::error::VortexResult; + +impl ArrayCompute for BitPackedArray { + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } +} + +impl FlattenFn for BitPackedArray { + fn flatten(&self) -> VortexResult { + bitunpack(self).map(FlattenedArray::Primitive) + } +} diff --git a/vortex-fastlanes/src/bitpacking/mod.rs b/vortex-fastlanes/src/bitpacking/mod.rs index ae0a43c9ea..63b5cef05f 100644 --- a/vortex-fastlanes/src/bitpacking/mod.rs +++ b/vortex-fastlanes/src/bitpacking/mod.rs @@ -1,12 +1,9 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use vortex::array::{ - check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef, -}; +use vortex::array::{check_validity_buffer, Array, ArrayRef, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; use vortex::compute::scalar_at::scalar_at; -use vortex::compute::ArrayCompute; use vortex::dtype::DType; use vortex::error::VortexResult; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; @@ -14,6 +11,7 @@ use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; mod compress; +mod compute; mod serde; #[derive(Debug, Clone)] @@ -115,10 +113,6 @@ impl Array for BitPackedArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, _start: usize, _stop: usize) -> VortexResult { unimplemented!("BitPackedArray::slice") } @@ -137,13 +131,11 @@ impl Array for BitPackedArray { + self.validity().map(|v| v.nbytes()).unwrap_or(0) } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } -impl ArrayCompute for BitPackedArray {} - impl<'arr> AsRef<(dyn Array + 'arr)> for BitPackedArray { fn as_ref(&self) -> &(dyn Array + 'arr) { self diff --git a/vortex-fastlanes/src/bitpacking/serde.rs b/vortex-fastlanes/src/bitpacking/serde.rs index f55763dd3b..b399a7764e 100644 --- a/vortex-fastlanes/src/bitpacking/serde.rs +++ b/vortex-fastlanes/src/bitpacking/serde.rs @@ -1,12 +1,11 @@ -use std::io; - use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{BitPackedArray, BitPackedEncoding}; impl ArraySerde for BitPackedArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write(self.encoded())?; ctx.write_optional_array(self.validity())?; ctx.write_optional_array(self.patches())?; @@ -17,7 +16,7 @@ impl ArraySerde for BitPackedArray { } impl EncodingSerde for BitPackedEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let encoded = ctx.read()?; let validity = ctx.read_optional_array()?; let patches = ctx.read_optional_array()?; diff --git a/vortex-fastlanes/src/delta/compress.rs b/vortex-fastlanes/src/delta/compress.rs index 61345a51eb..2a8857cbb0 100644 --- a/vortex-fastlanes/src/delta/compress.rs +++ b/vortex-fastlanes/src/delta/compress.rs @@ -75,14 +75,13 @@ impl EncodingCompression for DeltaEncoding { PrimitiveArray::from(delta_primitive(filled.as_primitive().typed_data::<$T>())) }); - let encoded = ctx.named("deltas").compress( - delta_encoded.as_ref(), - like_delta.map(|d| d.encoded().as_ref()), - )?; + let encoded = ctx + .named("deltas") + .compress(delta_encoded.as_ref(), like_delta.map(|d| d.encoded()))?; - return Ok(DeltaArray::try_new(array.len(), encoded, validity) + Ok(DeltaArray::try_new(array.len(), encoded, validity) .unwrap() - .boxed()); + .boxed()) } } diff --git a/vortex-fastlanes/src/delta/mod.rs b/vortex-fastlanes/src/delta/mod.rs index b3a465671a..c73bd24ae0 100644 --- a/vortex-fastlanes/src/delta/mod.rs +++ b/vortex-fastlanes/src/delta/mod.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; use vortex::compute::scalar_at::scalar_at; use vortex::compute::ArrayCompute; @@ -81,7 +81,7 @@ impl Array for DeltaArray { #[inline] fn dtype(&self) -> &DType { - &self.encoded.dtype() + self.encoded.dtype() } #[inline] @@ -89,10 +89,6 @@ impl Array for DeltaArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, _start: usize, _stop: usize) -> VortexResult { unimplemented!("DeltaArray::slice") } @@ -107,8 +103,8 @@ impl Array for DeltaArray { self.encoded().nbytes() + self.validity().map(|v| v.nbytes()).unwrap_or(0) } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-fastlanes/src/delta/serde.rs b/vortex-fastlanes/src/delta/serde.rs index e72f50d951..a69a1217c1 100644 --- a/vortex-fastlanes/src/delta/serde.rs +++ b/vortex-fastlanes/src/delta/serde.rs @@ -1,12 +1,11 @@ -use std::io; - use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{DeltaArray, DeltaEncoding}; impl ArraySerde for DeltaArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.len())?; ctx.write(self.encoded())?; ctx.write_optional_array(self.validity()) @@ -14,7 +13,7 @@ impl ArraySerde for DeltaArray { } impl EncodingSerde for DeltaEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let len = ctx.read_usize()?; let encoded = ctx.read()?; let validity = ctx.read_optional_array()?; diff --git a/vortex-fastlanes/src/for/compress.rs b/vortex-fastlanes/src/for/compress.rs index 44048fd959..bcbd66acd6 100644 --- a/vortex-fastlanes/src/for/compress.rs +++ b/vortex-fastlanes/src/for/compress.rs @@ -1,20 +1,22 @@ use itertools::Itertools; -use num_traits::CheckedSub; +use num_traits::PrimInt; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef}; +use vortex::array::{Array, ArrayRef, CloneOptionalArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::compute::flatten::flatten_primitive; use vortex::error::VortexResult; use vortex::match_each_integer_ptype; -use vortex::ptype::NativePType; +use vortex::ptype::{NativePType, PType}; +use vortex::scalar::ListScalarVec; use vortex::stats::Stat; use crate::{FoRArray, FoREncoding}; impl EncodingCompression for FoREncoding { fn cost(&self) -> u8 { - 1 + 0 } fn can_compress( @@ -30,26 +32,12 @@ impl EncodingCompression for FoREncoding { return None; } - match_each_integer_ptype!(parray.ptype(), |$T| { - let min = parray - .stats() - .get_or_compute_as::<$T>(&Stat::Min) - .unwrap_or(<$T>::default()); - - // Nothing for us to do if the min is already zero - if min == 0 { - return None; - } - - // Check for overflow - let max = parray - .stats() - .get_or_compute_as::<$T>(&Stat::Max) - .unwrap_or(<$T>::default()); - if max.checked_sub(min).is_none() { - return None; - } - }); + // Nothing for us to do if the min is already zero and tz == 0 + let shift = trailing_zeros(parray); + let min = parray.stats().get_or_compute_cast::(&Stat::Min)?; + if min == 0 && shift == 0 { + return None; + } Some(self) } @@ -61,9 +49,9 @@ impl EncodingCompression for FoREncoding { ctx: CompressCtx, ) -> VortexResult { let parray = array.as_primitive(); - + let shift = trailing_zeros(parray); let child = match_each_integer_ptype!(parray.ptype(), |$T| { - compress_primitive::<$T>(parray) + compress_primitive::<$T>(parray, shift) }); // TODO(ngates): remove FoR as a potential encoding from the ctx @@ -71,47 +59,83 @@ impl EncodingCompression for FoREncoding { // worth trying. let compressed_child = ctx.named("for").excluding(&FoREncoding::ID).compress( child.as_ref(), - like.map(|l| l.as_any().downcast_ref::().unwrap().child()), + like.map(|l| l.as_any().downcast_ref::().unwrap().encoded()), )?; let reference = parray.stats().get(&Stat::Min).unwrap(); - Ok(FoRArray::try_new(compressed_child, reference)?.boxed()) + Ok(FoRArray::try_new(compressed_child, reference, shift)?.boxed()) } } -fn compress_primitive(parray: &PrimitiveArray) -> PrimitiveArray { + +fn compress_primitive( + parray: &PrimitiveArray, + shift: u8, +) -> PrimitiveArray { let min = parray .stats() .get_or_compute_as::(&Stat::Min) - .unwrap_or(::default()); - let max = parray - .stats() - .get_or_compute_as::(&Stat::Max) - .unwrap_or(::default()); + .unwrap_or_default(); + + let values = if shift > 0 { + let shifted_min = min >> shift as usize; + parray + .typed_data::() + .iter() + .map(|&v| v >> shift as usize) + .map(|v| v - shifted_min) + .collect_vec() + } else { + parray + .typed_data::() + .iter() + .map(|&v| v - min) + .collect_vec() + }; + + PrimitiveArray::from(values) +} - let _buffer = parray.typed_data::(); - if max.checked_sub(&min).is_none() { - // Delta would cause overflow - return parray.clone(); +pub fn decompress(array: &FoRArray) -> VortexResult { + let shift = array.shift(); + let ptype: PType = array.dtype().try_into()?; + let encoded = flatten_primitive(array.encoded())?; + Ok(match_each_integer_ptype!(ptype, |$T| { + let reference: $T = array.reference().try_into()?; + PrimitiveArray::from_nullable( + decompress_primitive(encoded.typed_data::<$T>(), reference, shift), + encoded.validity().clone_optional(), + ) + })) +} + +fn decompress_primitive(values: &[T], reference: T, shift: u8) -> Vec { + if shift > 0 { + let shifted_reference = reference << shift as usize; + values + .iter() + .map(|&v| v << shift as usize) + .map(|v| v + shifted_reference) + .collect_vec() + } else { + values.iter().map(|&v| v + reference).collect_vec() } +} - // TODO(ngates): check for overflow - let values = parray - .typed_data::() +fn trailing_zeros(array: &dyn Array) -> u8 { + let tz_freq = array + .stats() + .get_or_compute_as::>(&Stat::TrailingZeroFreq) + .map(|v| v.0) + .unwrap_or(vec![0]); + tz_freq .iter() - .map(|&v| { - v.checked_sub(&min) - .unwrap_or_else(|| panic!("Underflow when compressing FoR")) - }) - // TODO(ngates): cast to unsigned - // .map(|v| v as parray.ptype().to_unsigned()::T) - .collect_vec(); - - PrimitiveArray::from(values) + .enumerate() + .find_or_first(|(_, &v)| v > 0) + .map(|(i, _freq)| i) + .unwrap_or(0) as u8 } #[cfg(test)] mod test { - use log::LevelFilter; - use simplelog::{ColorChoice, Config, TermLogger, TerminalMode}; use std::collections::HashSet; use std::sync::Arc; @@ -122,16 +146,7 @@ mod test { use super::*; - #[test] - fn test_compress() { - TermLogger::init( - LevelFilter::Debug, - Config::default(), - TerminalMode::Mixed, - ColorChoice::Auto, - ) - .unwrap(); - + fn compress_ctx() -> CompressCtx { let cfg = CompressConfig::new( // We need some BitPacking else we will need choose FoR. HashSet::from([ @@ -141,7 +156,12 @@ mod test { ]), HashSet::default(), ); - let ctx = CompressCtx::new(Arc::new(cfg)); + CompressCtx::new(Arc::new(cfg)) + } + + #[test] + fn test_compress() { + let ctx = compress_ctx(); // Create a range offset by a million let array = PrimitiveArray::from((0u32..10_000).map(|v| v + 1_000_000).collect_vec()); @@ -151,4 +171,17 @@ mod test { let fa = compressed.as_any().downcast_ref::().unwrap(); assert_eq!(fa.reference().try_into(), Ok(1_000_000u32)); } + + #[test] + fn test_decompress() { + let ctx = compress_ctx(); + + // Create a range offset by a million + let array = PrimitiveArray::from((0u32..10_000).map(|v| v + 1_000_000).collect_vec()); + let compressed = ctx.compress(&array, None).unwrap(); + assert_eq!(compressed.encoding().id(), FoREncoding.id()); + + let decompressed = flatten_primitive(compressed.as_ref()).unwrap(); + assert_eq!(decompressed.typed_data::(), array.typed_data::()); + } } diff --git a/vortex-fastlanes/src/for/compute.rs b/vortex-fastlanes/src/for/compute.rs new file mode 100644 index 0000000000..a62321ce04 --- /dev/null +++ b/vortex-fastlanes/src/for/compute.rs @@ -0,0 +1,17 @@ +use crate::r#for::compress::decompress; +use crate::FoRArray; +use vortex::compute::flatten::{FlattenFn, FlattenedArray}; +use vortex::compute::ArrayCompute; +use vortex::error::VortexResult; + +impl ArrayCompute for FoRArray { + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } +} + +impl FlattenFn for FoRArray { + fn flatten(&self) -> VortexResult { + decompress(self).map(FlattenedArray::Primitive) + } +} diff --git a/vortex-fastlanes/src/for/mod.rs b/vortex-fastlanes/src/for/mod.rs index 0ffc717f1f..47b90e48a7 100644 --- a/vortex-fastlanes/src/for/mod.rs +++ b/vortex-fastlanes/src/for/mod.rs @@ -1,44 +1,51 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; use vortex::dtype::DType; use vortex::error::VortexResult; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::scalar::{Scalar, ScalarRef}; +use vortex::scalar::Scalar; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; mod compress; +mod compute; mod serde; #[derive(Debug, Clone)] pub struct FoRArray { - child: ArrayRef, - reference: ScalarRef, + encoded: ArrayRef, + reference: Scalar, + shift: u8, stats: Arc>, } impl FoRArray { - pub fn try_new(child: ArrayRef, reference: ScalarRef) -> VortexResult { + pub fn try_new(child: ArrayRef, reference: Scalar, shift: u8) -> VortexResult { // TODO(ngates): check the dtype of reference == child.dtype() Ok(Self { - child, + encoded: child, reference, + shift, stats: Arc::new(RwLock::new(StatsSet::new())), }) } #[inline] - pub fn child(&self) -> &dyn Array { - self.child.as_ref() + pub fn encoded(&self) -> &dyn Array { + self.encoded.as_ref() } #[inline] - pub fn reference(&self) -> &dyn Scalar { - self.reference.as_ref() + pub fn reference(&self) -> &Scalar { + &self.reference + } + + #[inline] + pub fn shift(&self) -> u8 { + self.shift } } @@ -60,17 +67,17 @@ impl Array for FoRArray { #[inline] fn len(&self) -> usize { - self.child.len() + self.encoded.len() } #[inline] fn is_empty(&self) -> bool { - self.child.is_empty() + self.encoded.is_empty() } #[inline] fn dtype(&self) -> &DType { - self.child.dtype() + self.encoded.dtype() } #[inline] @@ -78,14 +85,11 @@ impl Array for FoRArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self { - child: self.child.slice(start, stop)?, + encoded: self.encoded.slice(start, stop)?, reference: self.reference.clone(), + shift: self.shift, stats: Arc::new(RwLock::new(StatsSet::new())), } .boxed()) @@ -98,16 +102,14 @@ impl Array for FoRArray { #[inline] fn nbytes(&self) -> usize { - self.child.nbytes() + self.reference.nbytes() + self.encoded.nbytes() + self.reference.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } -impl ArrayCompute for FoRArray {} - impl<'arr> AsRef<(dyn Array + 'arr)> for FoRArray { fn as_ref(&self) -> &(dyn Array + 'arr) { self @@ -117,7 +119,8 @@ impl<'arr> AsRef<(dyn Array + 'arr)> for FoRArray { impl ArrayDisplay for FoRArray { fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { f.property("reference", self.reference())?; - f.child("shifted", self.child()) + f.property("shift", self.shift())?; + f.child("encoded", self.encoded()) } } diff --git a/vortex-fastlanes/src/for/serde.rs b/vortex-fastlanes/src/for/serde.rs index 7a03b425c5..b882e39d0c 100644 --- a/vortex-fastlanes/src/for/serde.rs +++ b/vortex-fastlanes/src/for/serde.rs @@ -1,34 +1,37 @@ -use std::io; - use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{FoRArray, FoREncoding}; impl ArraySerde for FoRArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.scalar(self.reference())?; - ctx.write(self.child()) + ctx.write_usize(self.shift() as usize)?; + ctx.write(self.encoded()) } } impl EncodingSerde for FoREncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let reference = ctx.scalar()?; + let shift = ctx.read_usize()? as u8; let child = ctx.read()?; - Ok(FoRArray::try_new(child, reference).unwrap().boxed()) + Ok(FoRArray::try_new(child, reference, shift).unwrap().boxed()) } } #[cfg(test)] mod test { - use crate::FoRArray; - use std::io; + use vortex::array::{Array, ArrayRef}; - use vortex::scalar::ScalarRef; + use vortex::error::VortexResult; + use vortex::scalar::Scalar; use vortex::serde::{ReadCtx, WriteCtx}; - fn roundtrip_array(array: &dyn Array) -> io::Result { + use crate::FoRArray; + + fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; @@ -41,7 +44,8 @@ mod test { fn roundtrip() { let arr = FoRArray::try_new( vec![-7i64, -13, 17, 23].into(), - >::into(-7i64), + >::into(-7i64), + 2, ) .unwrap(); roundtrip_array(arr.as_ref()).unwrap(); diff --git a/vortex-ree/Cargo.toml b/vortex-ree/Cargo.toml index d42401cf0e..99109d847b 100644 --- a/vortex-ree/Cargo.toml +++ b/vortex-ree/Cargo.toml @@ -12,8 +12,9 @@ edition = { workspace = true } rust-version = { workspace = true } [dependencies] -arrow = { version = "50.0.0" } vortex-array = { path = "../vortex-array" } +arrow-array = "50.0.0" +arrow-buffer = "50.0.0" linkme = "0.3.22" half = "2.3.1" num-traits = "0.2.17" diff --git a/vortex-ree/src/compress.rs b/vortex-ree/src/compress.rs index c87d89cf2a..f02ef53887 100644 --- a/vortex-ree/src/compress.rs +++ b/vortex-ree/src/compress.rs @@ -3,7 +3,8 @@ use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; use vortex::array::{Array, ArrayRef, Encoding}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::compute::cast::cast_primitive; +use vortex::compute::cast::cast; +use vortex::compute::flatten::flatten_primitive; use vortex::error::VortexResult; use vortex::ptype::{match_each_native_ptype, NativePType}; use vortex::stats::Stat; @@ -110,7 +111,6 @@ fn ree_encode_primitive(elements: &[T]) -> (Vec, Vec) { (ends, values) } -#[allow(dead_code)] pub fn ree_decode( ends: &PrimitiveArray, values: &PrimitiveArray, @@ -119,7 +119,7 @@ pub fn ree_decode( // TODO(ngates): switch over ends without necessarily casting match_each_native_ptype!(values.ptype(), |$P| { Ok(PrimitiveArray::from_nullable(ree_decode_primitive( - cast_primitive(ends, &PType::U64)?.typed_data(), + flatten_primitive(cast(ends, &PType::U64.into())?.as_ref())?.typed_data(), values.typed_data::<$P>(), ), validity)) }) @@ -135,7 +135,7 @@ pub fn ree_decode_primitive(run_ends: &[u64], values: &[T]) -> V #[cfg(test)] mod test { - use arrow::buffer::BooleanBuffer; + use arrow_buffer::buffer::BooleanBuffer; use vortex::array::bool::BoolArray; use vortex::array::downcast::DowncastArrayBuiltin; diff --git a/vortex-ree/src/compute.rs b/vortex-ree/src/compute.rs index 94a5d75576..b59e25a1eb 100644 --- a/vortex-ree/src/compute.rs +++ b/vortex-ree/src/compute.rs @@ -1,17 +1,53 @@ -use crate::REEArray; +use std::cmp::min; +use vortex::array::primitive::PrimitiveArray; +use vortex::array::{Array, CloneOptionalArray}; +use vortex::compute::cast::cast; +use vortex::compute::flatten::{flatten, flatten_primitive, FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; -use vortex::error::VortexResult; -use vortex::scalar::ScalarRef; +use vortex::error::{VortexError, VortexResult}; +use vortex::ptype::PType; +use vortex::scalar::Scalar; + +use crate::compress::ree_decode; +use crate::REEArray; impl ArrayCompute for REEArray { + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } } +impl FlattenFn for REEArray { + fn flatten(&self) -> VortexResult { + let ends: PrimitiveArray = + flatten_primitive(cast(self.ends(), &PType::U64.into())?.as_ref())? + .typed_data::() + .iter() + .map(|v| v - self.offset() as u64) + .map(|v| min(v, self.len() as u64)) + .take_while(|v| *v <= (self.len() as u64)) + .collect::>() + .into(); + + let values = flatten(self.values())?; + if let FlattenedArray::Primitive(pvalues) = values { + ree_decode(&ends, &pvalues, self.validity().clone_optional()) + .map(FlattenedArray::Primitive) + } else { + Err(VortexError::InvalidArgument( + "Cannot yet flatten non-primitive REE array".into(), + )) + } + } +} + impl ScalarAtFn for REEArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { scalar_at(self.values(), self.find_physical_index(index)?) } } diff --git a/vortex-ree/src/ree.rs b/vortex-ree/src/ree.rs index a0256d2ed2..f2fe6a9dcb 100644 --- a/vortex-ree/src/ree.rs +++ b/vortex-ree/src/ree.rs @@ -1,30 +1,20 @@ use std::any::Any; -use std::cmp::min; -use std::marker::PhantomData; use std::sync::{Arc, RwLock}; -use arrow::array::ArrowPrimitiveType; -use arrow::array::{Array as ArrowArray, ArrayRef as ArrowArrayRef, AsArray}; -use num_traits::AsPrimitive; - -use vortex::array::primitive::PrimitiveArray; use vortex::array::{ - check_slice_bounds, check_validity_buffer, Array, ArrayKind, ArrayRef, ArrowIterator, - CloneOptionalArray, Encoding, EncodingId, EncodingRef, + check_slice_bounds, check_validity_buffer, Array, ArrayKind, ArrayRef, CloneOptionalArray, + Encoding, EncodingId, EncodingRef, }; -use vortex::arrow::match_arrow_numeric_type; use vortex::compress::EncodingCompression; use vortex::compute; -use vortex::compute::scalar_at::scalar_at; use vortex::compute::search_sorted::SearchSortedSide; -use vortex::dtype::{DType, Nullability, Signedness}; +use vortex::dtype::DType; use vortex::error::{VortexError, VortexResult}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::ptype::NativePType; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; -use crate::compress::{ree_decode_primitive, ree_encode}; +use crate::compress::ree_encode; #[derive(Debug, Clone)] pub struct REEArray { @@ -54,13 +44,6 @@ impl REEArray { ) -> VortexResult { check_validity_buffer(validity.as_deref(), length)?; - if !matches!( - ends.dtype(), - DType::Int(_, Signedness::Unsigned, Nullability::NonNullable) - ) { - return Err(VortexError::InvalidDType(ends.dtype().clone())); - } - if !ends .stats() .get_as::(&Stat::IsStrictSorted) @@ -69,8 +52,7 @@ impl REEArray { return Err(VortexError::IndexArrayMustBeStrictSorted); } - // see https://github.com/fulcrum-so/spiral/issues/873 - // let length = run_ends_logical_length(&ends); + // TODO(ngates): https://github.com/fulcrum-so/spiral/issues/873 Ok(Self { ends, values, @@ -82,7 +64,7 @@ impl REEArray { } pub fn find_physical_index(&self, index: usize) -> VortexResult { - compute::search_sorted::search_sorted_usize( + compute::search_sorted::search_sorted( self.ends(), index + self.offset, SearchSortedSide::Right, @@ -105,6 +87,11 @@ impl REEArray { } } + #[inline] + pub fn offset(&self) -> usize { + self.offset + } + #[inline] pub fn ends(&self) -> &dyn Array { self.ends.as_ref() @@ -157,30 +144,6 @@ impl Array for REEArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - let ends: Vec = self - .ends - .iter_arrow() - .flat_map(|c| { - match_arrow_numeric_type!(self.ends.dtype(), |$E| { - let ends = c.as_primitive::<$E>() - .values() - .iter() - .map(|v| AsPrimitive::::as_(*v)) - .map(|v| v - self.offset as u64) - .map(|v| min(v, self.length as u64)) - .take_while(|v| *v <= (self.length as u64)) - .collect::>(); - ends.into_iter() - }) - }) - .collect(); - - match_arrow_numeric_type!(self.values.dtype(), |$N| { - Box::new(REEArrowIterator::<$N>::new(ends, self.values.iter_arrow())) - }) - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; let slice_begin = self.find_physical_index(start)?; @@ -211,8 +174,8 @@ impl Array for REEArray { self.values.nbytes() + self.ends.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -252,74 +215,18 @@ impl ArrayDisplay for REEArray { } } -pub struct REEArrowIterator -where - T::Native: NativePType, -{ - ends: Vec, - values: Box, - current_idx: usize, - _marker: PhantomData, -} - -impl REEArrowIterator -where - T::Native: NativePType, -{ - pub fn new(ends: Vec, values: Box) -> Self { - Self { - ends, - values, - current_idx: 0, - _marker: PhantomData, - } - } -} - -impl Iterator for REEArrowIterator -where - T::Native: NativePType, -{ - type Item = ArrowArrayRef; - - fn next(&mut self) -> Option { - self.values.next().and_then(|vs| { - let batch_ends = &self.ends[self.current_idx..self.current_idx + vs.len()]; - self.current_idx += vs.len(); - let decoded = - ree_decode_primitive(batch_ends, vs.as_primitive::().values().as_ref()); - // TODO(ngates): avoid going back into PrimitiveArray? - PrimitiveArray::from(decoded).iter_arrow().next() - }) - } -} - -/// Gets the logical end from the ends array. -#[allow(dead_code)] -fn run_ends_logical_length>(ends: &T) -> usize { - if ends.as_ref().is_empty() { - 0 - } else { - scalar_at(ends.as_ref(), ends.as_ref().len() - 1) - .and_then(|end| end.try_into()) - .unwrap_or_else(|_| panic!("Couldn't convert ends to usize")) - } -} - #[cfg(test)] mod test { - use arrow::array::cast::AsArray; - use arrow::array::types::Int32Type; - use itertools::Itertools; use vortex::array::Array; + use vortex::compute::flatten::flatten_primitive; use vortex::compute::scalar_at::scalar_at; + use vortex::dtype::{DType, IntWidth, Nullability, Signedness}; use crate::REEArray; - use vortex::dtype::{DType, IntWidth, Nullability, Signedness}; #[test] fn new() { - let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1, 2, 3].into(), None, 10); + let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1i32, 2, 3].into(), None, 10); assert_eq!(arr.len(), 10); assert_eq!( arr.dtype(), @@ -337,7 +244,7 @@ mod test { #[test] fn slice() { - let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1, 2, 3].into(), None, 10) + let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1i32, 2, 3].into(), None, 10) .slice(3, 8) .unwrap(); assert_eq!( @@ -346,20 +253,18 @@ mod test { ); assert_eq!(arr.len(), 5); - arr.iter_arrow() - .zip_eq([vec![2, 2, 3, 3, 3]]) - .for_each(|(from_iter, orig)| { - assert_eq!(*from_iter.as_primitive::().values(), orig); - }); + assert_eq!( + flatten_primitive(arr.as_ref()).unwrap().typed_data::(), + vec![2, 2, 3, 3, 3] + ); } #[test] - fn iter_arrow() { - let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1, 2, 3].into(), None, 10); - arr.iter_arrow() - .zip_eq([vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3]]) - .for_each(|(from_iter, orig)| { - assert_eq!(*from_iter.as_primitive::().values(), orig); - }); + fn flatten() { + let arr = REEArray::new(vec![2u32, 5, 10].into(), vec![1i32, 2, 3].into(), None, 10); + assert_eq!( + flatten_primitive(arr.as_ref()).unwrap().typed_data::(), + vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3] + ); } } diff --git a/vortex-ree/src/serde.rs b/vortex-ree/src/serde.rs index 822e549072..7b8dc01fdd 100644 --- a/vortex-ree/src/serde.rs +++ b/vortex-ree/src/serde.rs @@ -1,12 +1,11 @@ -use std::io; - use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{REEArray, REEEncoding}; impl ArraySerde for REEArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.len())?; if let Some(v) = self.validity() { ctx.write(v.as_ref())?; @@ -19,7 +18,7 @@ impl ArraySerde for REEArray { } impl EncodingSerde for REEEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let len = ctx.read_usize()?; let validity = if ctx.schema().is_nullable() { Some(ctx.validity().read()?) @@ -35,16 +34,16 @@ impl EncodingSerde for REEEncoding { #[cfg(test)] mod test { - use std::io; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::{Array, ArrayRef}; + use vortex::error::VortexResult; use vortex::serde::{ReadCtx, WriteCtx}; use crate::downcast::DowncastREE; use crate::REEArray; - fn roundtrip_array(array: &dyn Array) -> io::Result { + fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-roaring/src/boolean/compute.rs b/vortex-roaring/src/boolean/compute.rs index 8f0db54106..7a41a02193 100644 --- a/vortex-roaring/src/boolean/compute.rs +++ b/vortex-roaring/src/boolean/compute.rs @@ -1,8 +1,9 @@ -use crate::RoaringBoolArray; use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::ArrayCompute; use vortex::error::VortexResult; -use vortex::scalar::ScalarRef; +use vortex::scalar::Scalar; + +use crate::RoaringBoolArray; impl ArrayCompute for RoaringBoolArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -11,7 +12,7 @@ impl ArrayCompute for RoaringBoolArray { } impl ScalarAtFn for RoaringBoolArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { if self.bitmap.contains(index as u32) { Ok(true.into()) } else { diff --git a/vortex-roaring/src/boolean/mod.rs b/vortex-roaring/src/boolean/mod.rs index 9c11cdd256..9923057103 100644 --- a/vortex-roaring/src/boolean/mod.rs +++ b/vortex-roaring/src/boolean/mod.rs @@ -5,8 +5,7 @@ use croaring::{Bitmap, Native}; use compress::roaring_encode; use vortex::array::{ - check_slice_bounds, Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, - EncodingRef, + check_slice_bounds, Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef, }; use vortex::compress::EncodingCompression; use vortex::dtype::DType; @@ -84,10 +83,6 @@ impl Array for RoaringBoolArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; @@ -113,8 +108,8 @@ impl Array for RoaringBoolArray { self.bitmap.get_serialized_size_in_bytes::() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } @@ -157,7 +152,7 @@ mod test { use vortex::array::Array; use vortex::compute::scalar_at::scalar_at; use vortex::error::VortexResult; - use vortex::scalar::ScalarRef; + use vortex::scalar::Scalar; use crate::RoaringBoolArray; @@ -177,8 +172,8 @@ mod test { let bool: &dyn Array = &BoolArray::from(vec![true, false, true, true]); let array = RoaringBoolArray::encode(bool)?; - let truthy: ScalarRef = true.into(); - let falsy: ScalarRef = false.into(); + let truthy: Scalar = true.into(); + let falsy: Scalar = false.into(); assert_eq!(scalar_at(array.as_ref(), 0)?, truthy); assert_eq!(scalar_at(array.as_ref(), 1)?, falsy); diff --git a/vortex-roaring/src/boolean/serde.rs b/vortex-roaring/src/boolean/serde.rs index b624b734c4..910d937e22 100644 --- a/vortex-roaring/src/boolean/serde.rs +++ b/vortex-roaring/src/boolean/serde.rs @@ -4,12 +4,13 @@ use std::io::ErrorKind; use croaring::{Bitmap, Portable}; use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{RoaringBoolArray, RoaringBoolEncoding}; impl ArraySerde for RoaringBoolArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write_usize(self.len())?; let mut data = Vec::new(); self.bitmap().serialize_into::(&mut data); @@ -18,7 +19,7 @@ impl ArraySerde for RoaringBoolArray { } impl EncodingSerde for RoaringBoolEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let len = ctx.read_usize()?; let bitmap_data = ctx.read_slice()?; Ok(RoaringBoolArray::new( diff --git a/vortex-roaring/src/integer/compute.rs b/vortex-roaring/src/integer/compute.rs index 45a97969a4..b05508d4ff 100644 --- a/vortex-roaring/src/integer/compute.rs +++ b/vortex-roaring/src/integer/compute.rs @@ -1,9 +1,10 @@ -use crate::RoaringIntArray; use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::ArrayCompute; use vortex::error::VortexResult; use vortex::ptype::PType; -use vortex::scalar::ScalarRef; +use vortex::scalar::Scalar; + +use crate::RoaringIntArray; impl ArrayCompute for RoaringIntArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -12,10 +13,10 @@ impl ArrayCompute for RoaringIntArray { } impl ScalarAtFn for RoaringIntArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { // Unwrap since we know the index is valid let bitmap_value = self.bitmap.select(index as u32).unwrap(); - let scalar: ScalarRef = match self.ptype { + let scalar: Scalar = match self.ptype { PType::U8 => (bitmap_value as u8).into(), PType::U16 => (bitmap_value as u16).into(), PType::U32 => bitmap_value.into(), diff --git a/vortex-roaring/src/integer/mod.rs b/vortex-roaring/src/integer/mod.rs index b2be9ae685..1a6b71e36a 100644 --- a/vortex-roaring/src/integer/mod.rs +++ b/vortex-roaring/src/integer/mod.rs @@ -5,8 +5,7 @@ use croaring::{Bitmap, Native}; use compress::roaring_encode; use vortex::array::{ - check_slice_bounds, Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, - EncodingRef, + check_slice_bounds, Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef, }; use vortex::compress::EncodingCompression; use vortex::dtype::DType; @@ -96,10 +95,6 @@ impl Array for RoaringIntArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { check_slice_bounds(self, start, stop)?; todo!() @@ -115,8 +110,8 @@ impl Array for RoaringIntArray { self.bitmap.get_serialized_size_in_bytes::() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } } diff --git a/vortex-roaring/src/integer/serde.rs b/vortex-roaring/src/integer/serde.rs index dd99d59715..0d5709cedd 100644 --- a/vortex-roaring/src/integer/serde.rs +++ b/vortex-roaring/src/integer/serde.rs @@ -4,13 +4,14 @@ use std::io::ErrorKind; use croaring::{Bitmap, Portable}; use vortex::array::{Array, ArrayRef}; +use vortex::error::VortexResult; use vortex::ptype::PType; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{RoaringIntArray, RoaringIntEncoding}; impl ArraySerde for RoaringIntArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { let mut data = Vec::new(); self.bitmap().serialize_into::(&mut data); ctx.write_slice(data.as_slice()) @@ -18,7 +19,7 @@ impl ArraySerde for RoaringIntArray { } impl EncodingSerde for RoaringIntEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let bitmap_data = ctx.read_slice()?; let ptype: PType = ctx .schema() diff --git a/vortex-roaring/src/serde_tests.rs b/vortex-roaring/src/serde_tests.rs index 71a06a9a7a..a7310b5fdb 100644 --- a/vortex-roaring/src/serde_tests.rs +++ b/vortex-roaring/src/serde_tests.rs @@ -1,10 +1,10 @@ #[cfg(test)] pub mod test { - use std::io; use vortex::array::{Array, ArrayRef}; + use vortex::error::VortexResult; use vortex::serde::{ReadCtx, WriteCtx}; - pub fn roundtrip_array(array: &dyn Array) -> io::Result { + pub fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-zigzag/src/compute.rs b/vortex-zigzag/src/compute.rs index b6516e97b6..5f11e6a7d9 100644 --- a/vortex-zigzag/src/compute.rs +++ b/vortex-zigzag/src/compute.rs @@ -1,11 +1,12 @@ -use crate::ZigZagArray; +use zigzag::ZigZag; + use vortex::array::Array; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; -use vortex::dtype::{DType, IntWidth, Signedness}; use vortex::error::{VortexError, VortexResult}; -use vortex::scalar::{NullableScalar, Scalar, ScalarRef}; -use zigzag::ZigZag; +use vortex::scalar::{PScalar, Scalar}; + +use crate::ZigZagArray; impl ArrayCompute for ZigZagArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -14,24 +15,19 @@ impl ArrayCompute for ZigZagArray { } impl ScalarAtFn for ZigZagArray { - fn scalar_at(&self, index: usize) -> VortexResult { + fn scalar_at(&self, index: usize) -> VortexResult { let scalar = scalar_at(self.encoded(), index)?; - let Some(scalar) = scalar.as_nonnull() else { - return Ok(NullableScalar::none(self.dtype().clone()).boxed()); - }; - match self.dtype() { - DType::Int(IntWidth::_8, Signedness::Signed, _) => { - Ok(i8::decode(scalar.try_into()?).into()) - } - DType::Int(IntWidth::_16, Signedness::Signed, _) => { - Ok(i16::decode(scalar.try_into()?).into()) - } - DType::Int(IntWidth::_32, Signedness::Signed, _) => { - Ok(i32::decode(scalar.try_into()?).into()) - } - DType::Int(IntWidth::_64, Signedness::Signed, _) => { - Ok(i64::decode(scalar.try_into()?).into()) - } + match scalar { + Scalar::Primitive(p) => match p.value() { + None => Ok(Scalar::null(self.dtype())), + Some(p) => match p { + PScalar::U8(u) => Ok(i8::decode(u).into()), + PScalar::U16(u) => Ok(i16::decode(u).into()), + PScalar::U32(u) => Ok(i32::decode(u).into()), + PScalar::U64(u) => Ok(i64::decode(u).into()), + _ => Err(VortexError::InvalidDType(self.dtype().clone())), + }, + }, _ => Err(VortexError::InvalidDType(self.dtype().clone())), } } diff --git a/vortex-zigzag/src/serde.rs b/vortex-zigzag/src/serde.rs index f86352889e..dce75cb996 100644 --- a/vortex-zigzag/src/serde.rs +++ b/vortex-zigzag/src/serde.rs @@ -1,28 +1,21 @@ -use std::io; -use std::io::ErrorKind; - use vortex::array::{Array, ArrayRef}; use vortex::dtype::{DType, Signedness}; +use vortex::error::{VortexError, VortexResult}; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::{ZigZagArray, ZigZagEncoding}; impl ArraySerde for ZigZagArray { - fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { + fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write(self.encoded()) } } impl EncodingSerde for ZigZagEncoding { - fn read(&self, ctx: &mut ReadCtx) -> io::Result { + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let encoded_dtype = match ctx.schema() { DType::Int(w, Signedness::Signed, n) => DType::Int(*w, Signedness::Unsigned, *n), - _ => { - return Err(io::Error::new( - ErrorKind::InvalidData, - "Invalid zigzag encoded dtype, not an signed integer", - )); - } + _ => return Err(VortexError::InvalidDType(ctx.schema().clone())), }; let encoded = ctx.with_schema(&encoded_dtype).read()?; Ok(ZigZagArray::new(encoded).boxed()) @@ -31,17 +24,16 @@ impl EncodingSerde for ZigZagEncoding { #[cfg(test)] mod test { - use std::io; - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::{Array, ArrayRef}; + use vortex::error::VortexResult; use vortex::serde::{ReadCtx, WriteCtx}; use crate::compress::zigzag_encode; use crate::downcast::DowncastZigzag; - fn roundtrip_array(array: &dyn Array) -> io::Result { + fn roundtrip_array(array: &dyn Array) -> VortexResult { let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); write_ctx.write(array)?; diff --git a/vortex-zigzag/src/zigzag.rs b/vortex-zigzag/src/zigzag.rs index aed91579ee..2e04657908 100644 --- a/vortex-zigzag/src/zigzag.rs +++ b/vortex-zigzag/src/zigzag.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; use vortex::dtype::{DType, Signedness}; use vortex::error::{VortexError, VortexResult}; @@ -85,10 +85,6 @@ impl Array for ZigZagArray { Stats::new(&self.stats, self) } - fn iter_arrow(&self) -> Box { - todo!() - } - fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new(self.encoded.slice(start, stop)?)?.boxed()) } @@ -103,8 +99,8 @@ impl Array for ZigZagArray { self.encoded.nbytes() } - fn serde(&self) -> &dyn ArraySerde { - self + fn serde(&self) -> Option<&dyn ArraySerde> { + Some(self) } }