diff --git a/Cargo.lock b/Cargo.lock index 1d30ba4fda..30acb68d80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.16" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" [[package]] name = "android-tzdata" @@ -101,9 +101,9 @@ checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" [[package]] name = "arc-swap" @@ -730,18 +730,18 @@ checksum = "30c5ef0ede93efbf733c1a727f3b6b5a1060bbedd5600183e66f6e4be4af0ec5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] name = "async-trait" -version = "0.1.79" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507401cad91ec6a857ed5513a2073c82a9b9048762b885bb98655b306964681" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -1149,8 +1149,7 @@ dependencies = [ "uuid", "vortex-alp", "vortex-array", - "vortex-array2", - "vortex-datetime", + "vortex-datetime-parts", "vortex-dict", "vortex-error", "vortex-fastlanes", @@ -1158,7 +1157,6 @@ dependencies = [ "vortex-ree", "vortex-roaring", "vortex-schema", - "vortex-zigzag", ] [[package]] @@ -1180,7 +1178,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.58", + "syn 2.0.59", "which", ] @@ -1259,7 +1257,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", "syn_derive", ] @@ -1406,9 +1404,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" +checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" dependencies = [ "jobserver", "libc", @@ -1437,15 +1435,15 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" [[package]] name = "chrono" -version = "0.4.37" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -2034,7 +2032,7 @@ checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -2063,15 +2061,15 @@ dependencies = [ [[package]] name = "either" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" [[package]] name = "encoding_rs" -version = "0.8.33" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" dependencies = [ "cfg-if", ] @@ -2093,7 +2091,7 @@ checksum = "03cdc46ec28bd728e67540c528013c6a10eb69a02eb31078a1bda695438cbfb8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -2346,7 +2344,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -2642,9 +2640,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +checksum = "9f24ce812868d86d19daa79bf3bf9175bc44ea323391147a5e3abde2a283871b" dependencies = [ "bytes", "futures-channel", @@ -2684,7 +2682,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper 1.2.0", + "hyper 1.3.0", "hyper-util", "native-tls", "tokio", @@ -2703,7 +2701,7 @@ dependencies = [ "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.2.0", + "hyper 1.3.0", "pin-project-lite", "socket2", "tokio", @@ -2810,9 +2808,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.28" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "685a7d121ee3f65ae4fddd72b25a04bb36b6af81bc0828f7d5434c0fe60fa3a2" dependencies = [ "libc", ] @@ -2828,9 +2826,9 @@ dependencies = [ [[package]] name = "lance" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c98b63628fcea2758905f904658e55216d71727d981f096a5bcf0d89a2fdc05e" +checksum = "afa389ab3681069a870f0185c17e64a06735223917465592fb15df196b086cfb" dependencies = [ "arrow 50.0.0", "arrow-arith 50.0.0", @@ -2886,9 +2884,9 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925c99b16e32debe8ecd506f348c2ba295905e52f61a76ecf3e686b960a7f548" +checksum = "d17e7fa583788b0d1a21c56b5de319b12e86d9dd229e44e64e13eb15dc260fe6" dependencies = [ "arrow-array 50.0.0", "arrow-buffer 50.0.0", @@ -2905,9 +2903,9 @@ dependencies = [ [[package]] name = "lance-core" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "442480abf2854a75c8679e6219bacb783541e3efe7fa4c14b1bcab6e5d760f46" +checksum = "128df4adeae4371bc42e1f679c4386f1f1f307435c3ec7bbf4eee6a66390600c" dependencies = [ "arrow-array 50.0.0", "arrow-buffer 50.0.0", @@ -2940,9 +2938,9 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6beda2d782f9f6a51fb27400bfeed44de700cecd5cca7c80d7566339c4649d2" +checksum = "a6a8c151f6c0c4f4e7d42c15e28f823b197ce482b5da71091fd3a20c45adbba2" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", @@ -2955,6 +2953,7 @@ dependencies = [ "futures", "lance-arrow", "lance-core", + "log", "prost", "snafu", "tokio", @@ -2962,15 +2961,16 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e4664b07bfd2bc8244440d6061e2045790993b7dbbcb3176fb434f96672852" +checksum = "adb27ff596acf08fd1c43ddd9aa5860ff1877b0de8d87816b863f9c2f3f737b5" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", "arrow-cast 50.0.0", "arrow-schema 50.0.0", "chrono", + "futures", "hex", "rand", "rand_xoshiro", @@ -2978,9 +2978,9 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49fc1e71d9e55728fe5a2eb16f42b0443e243a06b0650d1b27ac7f552371d402" +checksum = "46923b38df52639c3e7c9ca5bd5a98b5d7efa6695dde88cba16cb811922abd8a" dependencies = [ "arrow-arith 50.0.0", "arrow-array 50.0.0", @@ -3004,9 +3004,9 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aaf97c4b7ffa77b7bbfb112443a59c02f6d49b6e810f0d9669881c1c3c572d82" +checksum = "42c8ff50c7f6981a07cd9ec1e370c5122167840377e55669cd4b56eedd9352f2" dependencies = [ "arrow-arith 50.0.0", "arrow-array 50.0.0", @@ -3015,11 +3015,13 @@ dependencies = [ "arrow-select 50.0.0", "async-recursion", "async-trait", + "byteorder", "bytes", "datafusion-common", "futures", "lance-arrow", "lance-core", + "lance-datagen", "lance-encoding", "lance-io", "num-traits", @@ -3036,9 +3038,9 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e56f45cdea0bc572137d10f2df312da3c6a2b8315e77b3c50030135af667395" +checksum = "03931b8a5419878233b6abe30f398d8c3fbb9e0c54d3b186af74088130ca7d58" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", @@ -3082,9 +3084,9 @@ dependencies = [ [[package]] name = "lance-io" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89d5f037379dea94a698604dcb67180fde93c401276ad2eef0895bd7ee063d9" +checksum = "5e76227d1b1c4e5f6de54b99b50e1ea4d51af62f894b370d17e1845fdb1c38e1" dependencies = [ "arrow 50.0.0", "arrow-arith 50.0.0", @@ -3120,9 +3122,9 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5af3dc5774dd5c268782547d9f0b9a6e801a6f4d4fed8e586e1d9b4a6238bf43" +checksum = "70a7745db846a0446bf3dfaa8365709908d4a015f96b2b8885f9a5ce5e5c82e2" dependencies = [ "arrow-array 50.0.0", "arrow-ord 50.0.0", @@ -3143,10 +3145,11 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.10.10" +version = "0.10.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39acb106ab5855a4d14e5d7d575426bc2d68a3d62d7220ab812cee8dbbbddf27" +checksum = "0d046f4d07c37ed8e56bfd4685b64289c7a5a4667f7df1f551c1e1fb8b05b01d" dependencies = [ + "arrow 50.0.0", "arrow-array 50.0.0", "arrow-buffer 50.0.0", "arrow-ipc 50.0.0", @@ -3159,6 +3162,7 @@ dependencies = [ "futures", "lance-arrow", "lance-core", + "lance-datagen", "lance-file", "lance-io", "log", @@ -3313,7 +3317,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -3349,7 +3353,7 @@ checksum = "adf157a4dc5a29b7b464aa8fe7edeff30076e07e13646a1c3874f58477dc99f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -3565,9 +3569,9 @@ dependencies = [ [[package]] name = "num" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +checksum = "3135b08af27d103b0a51f2ae0f8632117b7b185ccf931445affa8df530576a41" dependencies = [ "num-bigint", "num-complex", @@ -3694,7 +3698,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -3781,7 +3785,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4011,7 +4015,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4086,12 +4090,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.17" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" +checksum = "5ac2cf0f2e4f42b49f5ffd07dae8d746508ef7526c13940e5f524012ae6c6550" dependencies = [ "proc-macro2", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4138,9 +4142,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "a56dea16b0a29e94408b9aa5e2940a4eedbd128a1ba20e8f7ae60fd3d465af0e" dependencies = [ "unicode-ident", ] @@ -4172,7 +4176,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.58", + "syn 2.0.59", "tempfile", ] @@ -4186,7 +4190,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4287,7 +4291,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4300,7 +4304,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -4321,7 +4325,6 @@ dependencies = [ "vortex-ree", "vortex-roaring", "vortex-schema", - "vortex-zigzag", ] [[package]] @@ -4352,9 +4355,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -4556,7 +4559,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.2.0", + "hyper 1.3.0", "hyper-tls", "hyper-util", "ipnet", @@ -4877,14 +4880,14 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "itoa", "ryu", @@ -5051,7 +5054,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5094,7 +5097,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5107,7 +5110,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5129,9 +5132,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.58" +version = "2.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a" dependencies = [ "proc-macro2", "quote", @@ -5147,7 +5150,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5254,7 +5257,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5270,9 +5273,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.34" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ "deranged", "itoa", @@ -5293,9 +5296,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ "num-conv", "time-core", @@ -5360,7 +5363,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5484,7 +5487,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -5645,6 +5648,8 @@ dependencies = [ "linkme", "log", "num-traits", + "paste", + "serde", "vortex-array", "vortex-error", "vortex-schema", @@ -5682,36 +5687,14 @@ dependencies = [ ] [[package]] -name = "vortex-array2" +name = "vortex-datetime-parts" version = "0.1.0" dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-schema 51.0.0", - "flatbuffers", - "flexbuffers", - "half", - "humansize", - "itertools 0.12.1", "linkme", "log", - "num-traits", - "paste", "serde", "vortex-array", "vortex-error", - "vortex-flatbuffers", - "vortex-schema", -] - -[[package]] -name = "vortex-datetime" -version = "0.1.0" -dependencies = [ - "linkme", - "log", - "vortex-array", - "vortex-error", "vortex-schema", ] @@ -5726,7 +5709,9 @@ dependencies = [ "linkme", "log", "num-traits", + "paste", "rand", + "serde", "simplelog", "vortex-array", "vortex-error", @@ -5755,7 +5740,9 @@ dependencies = [ "linkme", "log", "num-traits", + "paste", "rand", + "serde", "simplelog", "vortex-array", "vortex-error", @@ -5785,7 +5772,6 @@ dependencies = [ "simplelog", "streaming-iterator", "vortex-array", - "vortex-array2", "vortex-error", "vortex-flatbuffers", "vortex-schema", @@ -5802,6 +5788,8 @@ dependencies = [ "itertools 0.12.1", "linkme", "num-traits", + "paste", + "serde", "vortex-array", "vortex-error", "vortex-schema", @@ -5816,6 +5804,8 @@ dependencies = [ "linkme", "log", "num-traits", + "paste", + "serde", "vortex-array", "vortex-error", "vortex-schema", @@ -5837,18 +5827,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "vortex-zigzag" -version = "0.1.0" -dependencies = [ - "linkme", - "vortex-alloc", - "vortex-array", - "vortex-error", - "vortex-schema", - "zigzag", -] - [[package]] name = "vsimd" version = "0.8.0" @@ -5901,7 +5879,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", "wasm-bindgen-shared", ] @@ -5935,7 +5913,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6018,7 +5996,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -6036,7 +6014,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -6056,17 +6034,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -6077,9 +6056,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -6089,9 +6068,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -6101,9 +6080,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -6113,9 +6098,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -6125,9 +6110,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -6137,9 +6122,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -6149,9 +6134,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winnow" @@ -6225,7 +6210,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.59", ] [[package]] @@ -6234,15 +6219,6 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" -[[package]] -name = "zigzag" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70b40401a28d86ce16a330b863b86fd7dbee4d7c940587ab09ab8c019f9e3fdf" -dependencies = [ - "num-traits", -] - [[package]] name = "zstd" version = "0.13.1" diff --git a/Cargo.toml b/Cargo.toml index 965c0cf538..e6936a9084 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,7 @@ members = [ "vortex-alloc", "vortex-alp", "vortex-array", - "vortex-array2", - "vortex-datetime", + "vortex-datetime-parts", "vortex-dict", "vortex-error", "vortex-fastlanes", @@ -17,7 +16,7 @@ members = [ "vortex-ree", "vortex-roaring", "vortex-schema", - "vortex-zigzag", + #"vortex-zigzag", ] resolver = "2" @@ -31,7 +30,7 @@ keywords = ["vortex"] include = [ "benches/*.rs", "src/**/*.rs", - "Carsgo.toml", + "Cargo.toml", ] edition = "2021" rust-version = "1.76" diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index bc49a9ebac..973b4dc199 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -15,10 +15,13 @@ rust-version = { workspace = true } workspace = true [dependencies] +#vortex-alp = { path = "../vortex-alp" } +vortex-roaring = { path = "../vortex-roaring" } +#vortex-zigzag = { path = "../vortex-zigzag" } arrow = { workspace = true } arrow-array = { workspace = true } -arrow-data = { workspace = true } arrow-csv = { workspace = true } +arrow-data = { workspace = true } arrow-select = { workspace = true } bzip2 = { workspace = true } csv = { workspace = true } @@ -39,16 +42,13 @@ tokio = { workspace = true } uuid = { workspace = true } vortex-alp = { path = "../vortex-alp" } vortex-array = { path = "../vortex-array" } -vortex-array2 = { path = "../vortex-array2" } -vortex-datetime = { path = "../vortex-datetime" } +vortex-datetime-parts = { path = "../vortex-datetime-parts" } vortex-dict = { path = "../vortex-dict" } vortex-error = { path = "../vortex-error", features = ["parquet"] } vortex-fastlanes = { path = "../vortex-fastlanes" } vortex-ipc = { path = "../vortex-ipc" } vortex-ree = { path = "../vortex-ree" } -vortex-roaring = { path = "../vortex-roaring" } vortex-schema = { path = "../vortex-schema" } -vortex-zigzag = { path = "../vortex-zigzag" } [dev-dependencies] criterion = { workspace = true } diff --git a/bench-vortex/src/bin/compress.rs b/bench-vortex/src/bin/compress.rs index 0c112922f4..deccdaf45e 100644 --- a/bench-vortex/src/bin/compress.rs +++ b/bench-vortex/src/bin/compress.rs @@ -9,11 +9,10 @@ use bench_vortex::reader::{open_vortex, rewrite_parquet_as_vortex}; use bench_vortex::taxi_data::taxi_data_parquet; use bench_vortex::{setup_logger, IdempotentPath}; use log::{info, LevelFilter}; -use vortex::array::Array; pub fn main() { setup_logger(LevelFilter::Info); - compress_pbi(PBIDataset::Medicare1); + // compress_pbi(PBIDataset::Medicare1); compress_taxi(); } @@ -25,6 +24,7 @@ fn compress_taxi() { } let taxi_vortex = open_vortex(&path).unwrap(); + info!("{}", taxi_vortex.tree_display()); let pq_size = taxi_data_parquet().metadata().unwrap().size(); let vx_size = taxi_vortex.nbytes(); @@ -33,6 +33,7 @@ fn compress_taxi() { info!("Compression ratio: {}", vx_size as f32 / pq_size as f32); } +#[allow(dead_code)] fn compress_pbi(which_pbi: PBIDataset) { let dataset = PBI(which_pbi); dataset.write_as_vortex(); diff --git a/bench-vortex/src/data_downloads.rs b/bench-vortex/src/data_downloads.rs index d1ccd0fd52..62de6809dd 100644 --- a/bench-vortex/src/data_downloads.rs +++ b/bench-vortex/src/data_downloads.rs @@ -5,18 +5,16 @@ use std::path::{Path, PathBuf}; use arrow_array::RecordBatchReader; use bzip2::read::BzDecoder; -use itertools::Itertools; use lance::dataset::WriteParams; use lance::Dataset; use lance_parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder as LanceParquetRecordBatchReaderBuilder; use log::info; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use tokio::runtime::Runtime; -use vortex::array::chunked::ChunkedArray; -use vortex::array::IntoArray; use vortex::arrow::FromArrowType; -use vortex::serde::WriteCtx; +use vortex::{IntoArray, SerdeContext, ToArrayData}; use vortex_error::{VortexError, VortexResult}; +use vortex_ipc::writer::StreamWriter; use vortex_schema::DType; use crate::idempotent; @@ -37,7 +35,7 @@ pub fn download_data(fname: PathBuf, data_url: &str) -> PathBuf { pub fn parquet_to_lance(lance_fname: &Path, parquet_file: &Path) -> VortexResult { let write_params = WriteParams::default(); - let read = File::open(parquet_file).unwrap(); + let read = File::open(parquet_file)?; let reader = LanceParquetRecordBatchReaderBuilder::try_new(read) .unwrap() .build() @@ -62,18 +60,19 @@ pub fn data_vortex_uncompressed(fname_out: &str, downloaded_data: PathBuf) -> Pa // FIXME(ngates): #157 the compressor should handle batch size. let reader = builder.with_batch_size(BATCH_SIZE).build().unwrap(); - let dtype = DType::from_arrow(reader.schema()); + let ctx = SerdeContext::default(); + let mut write = File::create(path).unwrap(); + let mut writer = StreamWriter::try_new(&mut write, ctx).unwrap(); - let chunks = reader - .map(|batch_result| batch_result.unwrap()) - .map(|record_batch| record_batch.into_array()) - .collect_vec(); - let chunked = ChunkedArray::new(chunks, dtype.clone()); + let dtype = DType::from_arrow(reader.schema()); + writer.write_schema(&dtype).unwrap(); + for batch_result in reader { + writer + .write_batch(&batch_result.unwrap().to_array_data().into_array()) + .unwrap(); + } - let mut write = File::create(path).unwrap(); - let mut write_ctx = WriteCtx::new(&mut write); - write_ctx.dtype(&dtype)?; - write_ctx.write(&chunked) + Ok::<(), VortexError>(()) }) .unwrap() } diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index 0bf29a7afa..17765e0804 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -11,13 +11,12 @@ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet::arrow::ProjectionMask; use simplelog::{ColorChoice, Config, TermLogger, TerminalMode}; use vortex::array::chunked::ChunkedArray; -use vortex::array::IntoArray; -use vortex::array::{Array, ArrayRef}; use vortex::arrow::FromArrowType; use vortex::compress::{CompressConfig, CompressCtx}; -use vortex::encoding::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, VORTEX_ENCODINGS}; +use vortex::{IntoArray, OwnedArray, ToArrayData}; use vortex_alp::ALPEncoding; -use vortex_datetime::DateTimeEncoding; +use vortex_datetime_parts::DateTimePartsEncoding; use vortex_dict::DictEncoding; use vortex_fastlanes::{BitPackedEncoding, FoREncoding}; use vortex_ree::REEEncoding; @@ -104,17 +103,20 @@ pub fn setup_logger(level: LevelFilter) { } pub fn enumerate_arrays() -> Vec { - println!("FOUND {:?}", ENCODINGS.iter().map(|e| e.id()).collect_vec()); + println!( + "FOUND {:?}", + VORTEX_ENCODINGS.iter().map(|e| e.id()).collect_vec() + ); vec![ &ALPEncoding, &DictEncoding, &BitPackedEncoding, &FoREncoding, - &DateTimeEncoding, + &DateTimePartsEncoding, // &DeltaEncoding, Blows up the search space too much. &REEEncoding, &RoaringBoolEncoding, - // RoaringIntEncoding, + // &RoaringIntEncoding, // Doesn't offer anything more than FoR really // ZigZagEncoding, ] @@ -126,10 +128,10 @@ pub fn compress_ctx() -> CompressCtx { CompressCtx::new(Arc::new(cfg)) } -pub fn compress_taxi_data() -> ArrayRef { +pub fn compress_taxi_data() -> OwnedArray { let file = File::open(taxi_data_parquet()).unwrap(); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); - let _mask = ProjectionMask::roots(builder.parquet_schema(), [1]); + let _mask = ProjectionMask::roots(builder.parquet_schema(), [6]); let _no_datetime_mask = ProjectionMask::roots( builder.parquet_schema(), [0, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], @@ -149,14 +151,16 @@ pub fn compress_taxi_data() -> ArrayRef { let chunks = reader .into_iter() .map(|batch_result| batch_result.unwrap()) - .map(|batch| batch.into_array()) + .map(|batch| batch.to_array_data().into_array()) .map(|array| { uncompressed_size += array.nbytes(); ctx.clone().compress(&array, None).unwrap() }) .collect_vec(); - let compressed = ChunkedArray::new(chunks.clone(), DType::from_arrow(schema)).into_array(); + let compressed = ChunkedArray::try_new(chunks.clone(), DType::from_arrow(schema)) + .unwrap() + .into_array(); info!( "{}, Bytes: {}, Ratio {}", @@ -219,10 +223,11 @@ mod test { use arrow_array::{ArrayRef as ArrowArrayRef, StructArray as ArrowStructArray}; use log::LevelFilter; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; - use vortex::array::ArrayRef; + use vortex::arrow::FromArrowArray; use vortex::compute::as_arrow::as_arrow; - use vortex::encode::FromArrowArray; - use vortex::serde::{ReadCtx, WriteCtx}; + use vortex::{ArrayData, IntoArray}; + use vortex_ipc::reader::StreamReader; + use vortex_ipc::writer::StreamWriter; use crate::taxi_data::taxi_data_parquet; use crate::{compress_ctx, compress_taxi_data, setup_logger}; @@ -244,15 +249,17 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); + let vortex_array = ArrayData::from_arrow(arrow_array.clone(), false).into_array(); let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(vortex_array.as_ref()).unwrap(); + { + let mut writer = StreamWriter::try_new(&mut buf, Default::default()).unwrap(); + writer.write_array(&vortex_array).unwrap(); + } let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(vortex_array.dtype(), &mut read); - read_ctx.read().unwrap(); + let mut reader = StreamReader::try_new(&mut read).unwrap(); + reader.read_array().unwrap(); } } @@ -266,8 +273,8 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); - let vortex_as_arrow = as_arrow(vortex_array.as_ref()).unwrap(); + let vortex_array = ArrayData::from_arrow(arrow_array.clone(), false).into_array(); + let vortex_as_arrow = as_arrow(&vortex_array).unwrap(); assert_eq!(vortex_as_arrow.deref(), arrow_array.deref()); } } @@ -285,10 +292,10 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); + let vortex_array = ArrayData::from_arrow(arrow_array.clone(), false).into_array(); - let compressed = ctx.clone().compress(vortex_array.as_ref(), None).unwrap(); - let compressed_as_arrow = as_arrow(compressed.as_ref()).unwrap(); + let compressed = ctx.clone().compress(&vortex_array, None).unwrap(); + let compressed_as_arrow = as_arrow(&compressed).unwrap(); assert_eq!(compressed_as_arrow.deref(), arrow_array.deref()); } } diff --git a/bench-vortex/src/public_bi_data.rs b/bench-vortex/src/public_bi_data.rs index 0c6a814c3b..06c190383a 100644 --- a/bench-vortex/src/public_bi_data.rs +++ b/bench-vortex/src/public_bi_data.rs @@ -10,7 +10,7 @@ use humansize::{format_size, DECIMAL}; use itertools::Itertools; use log::info; use reqwest::Url; -use vortex::array::Array; +use vortex::ArrayTrait; use vortex_error::VortexResult; use crate::data_downloads::{ diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs index 8d4dde86ea..a87add2c56 100644 --- a/bench-vortex/src/reader.rs +++ b/bench-vortex/src/reader.rs @@ -19,26 +19,30 @@ use log::info; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use tokio::runtime::Runtime; use vortex::array::chunked::ChunkedArray; -use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef, IntoArray}; use vortex::arrow::FromArrowType; -use vortex::compute::flatten::flatten; use vortex::compute::take::take; -use vortex::ptype::PType; -use vortex::serde::{ReadCtx, WriteCtx}; +use vortex::{IntoArray, OwnedArray, SerdeContext, ToArrayData, ToStatic}; use vortex_error::VortexResult; +use vortex_ipc::iter::FallibleLendingIterator; +use vortex_ipc::reader::StreamReader; +use vortex_ipc::writer::StreamWriter; use vortex_schema::DType; use crate::compress_ctx; pub const BATCH_SIZE: usize = 65_536; -pub fn open_vortex(path: &Path) -> VortexResult { +pub fn open_vortex(path: &Path) -> VortexResult { let mut file = File::open(path)?; - let dummy_dtype: DType = PType::U8.into(); - let mut read_ctx = ReadCtx::new(&dummy_dtype, &mut file); - let dtype = read_ctx.dtype()?; - read_ctx.with_schema(&dtype).read() + + let mut reader = StreamReader::try_new(&mut file).unwrap(); + let mut reader = reader.next()?.unwrap(); + let dtype = reader.dtype().clone(); + let mut chunks = vec![]; + while let Some(chunk) = reader.next()? { + chunks.push(chunk.to_static()) + } + Ok(ChunkedArray::try_new(chunks, dtype)?.into_array()) } pub fn rewrite_parquet_as_vortex( @@ -47,9 +51,8 @@ pub fn rewrite_parquet_as_vortex( ) -> VortexResult<()> { let chunked = compress_parquet_to_vortex(parquet_path.as_path())?; - let mut write_ctx = WriteCtx::new(write); - write_ctx.dtype(chunked.dtype()).unwrap(); - write_ctx.write(&chunked).unwrap(); + let mut writer = StreamWriter::try_new(write, SerdeContext::default()).unwrap(); + writer.write_array(&chunked.into_array()).unwrap(); Ok(()) } @@ -66,7 +69,7 @@ pub fn compress_parquet_to_vortex(parquet_path: &Path) -> VortexResult VortexResu Ok(()) } -pub fn take_vortex(path: &Path, indices: &[u64]) -> VortexResult { +pub fn take_vortex(path: &Path, indices: &[u64]) -> VortexResult { let array = open_vortex(path)?; - let taken = take(&array, &PrimitiveArray::from(indices.to_vec()))?; + let taken = take(&array, &indices.to_vec().into_array())?; // For equivalence.... we flatten to make sure we're not cheating too much. - flatten(&taken).map(|x| x.into_array()) + taken.flatten().map(|x| x.into_array()) } pub fn take_parquet(path: &Path, indices: &[u64]) -> VortexResult { diff --git a/bench-vortex/src/vortex_utils.rs b/bench-vortex/src/vortex_utils.rs index 5159e52e3b..8fd5cbcc68 100644 --- a/bench-vortex/src/vortex_utils.rs +++ b/bench-vortex/src/vortex_utils.rs @@ -2,8 +2,9 @@ use std::fs::File; use std::os::unix::prelude::MetadataExt; use std::path::Path; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::Array; +use vortex::array::chunked::ChunkedArray; +use vortex::array::r#struct::StructArray; +use vortex::ArrayDType; use vortex_error::VortexResult; use vortex_schema::DType; @@ -20,14 +21,16 @@ pub fn vortex_chunk_sizes(path: &Path) -> VortexResult { }; let mut compressed_sizes = vec![0; ns.len()]; - for chunk in vortex.as_chunked().chunks() { - for (i, f) in chunk.as_struct().fields().iter().enumerate() { + let chunked_array = ChunkedArray::try_from(vortex).unwrap(); + for chunk in chunked_array.chunks() { + let struct_arr = StructArray::try_from(chunk).unwrap(); + for (i, f) in (0..struct_arr.nfields()).map(|i| (i, struct_arr.child(i).unwrap())) { compressed_sizes[i] += f.nbytes() as u64; } } let stats = CompressionRunStats { - schema: vortex.dtype().clone(), + schema: chunked_array.dtype().clone(), file_type: FileType::Vortex, total_compressed_size: Some(total_compressed_size), compressed_sizes, diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 566f9d3464..ec8cb2c5f1 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -28,7 +28,7 @@ vortex-fastlanes = { path = "../vortex-fastlanes" } vortex-ree = { path = "../vortex-ree" } vortex-roaring = { path = "../vortex-roaring" } vortex-schema = { path = "../vortex-schema" } -vortex-zigzag = { path = "../vortex-zigzag" } +#vortex-zigzag = { path = "../vortex-zigzag" } itertools = { workspace = true } log = { workspace = true } paste = { workspace = true } diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 52d25335cf..034c0c7892 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -1,27 +1,32 @@ -use std::sync::Arc; - use paste::paste; use pyo3::prelude::*; -use vortex::array::bool::{BoolArray, BoolEncoding}; -use vortex::array::chunked::{ChunkedArray, ChunkedEncoding}; -use vortex::array::composite::{CompositeArray, CompositeEncoding}; -use vortex::array::constant::{ConstantArray, ConstantEncoding}; -use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::sparse::{SparseArray, SparseEncoding}; -use vortex::array::struct_::{StructArray, StructEncoding}; -use vortex::array::varbin::{VarBinArray, VarBinEncoding}; -use vortex::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; -use vortex::array::{Array, ArrayKind, ArrayRef}; +use vortex::array::bool::{Bool, BoolArray, BoolEncoding, OwnedBoolArray}; +use vortex::array::chunked::{Chunked, ChunkedArray, ChunkedEncoding, OwnedChunkedArray}; +use vortex::array::composite::{Composite, CompositeArray, CompositeEncoding, OwnedCompositeArray}; +use vortex::array::constant::{Constant, ConstantArray, ConstantEncoding, OwnedConstantArray}; +use vortex::array::primitive::{OwnedPrimitiveArray, Primitive, PrimitiveArray, PrimitiveEncoding}; +use vortex::array::r#struct::{OwnedStructArray, Struct, StructArray, StructEncoding}; +use vortex::array::sparse::{OwnedSparseArray, Sparse, SparseArray, SparseEncoding}; +use vortex::array::varbin::{OwnedVarBinArray, VarBin, VarBinArray, VarBinEncoding}; +use vortex::array::varbinview::{ + OwnedVarBinViewArray, VarBinView, VarBinViewArray, VarBinViewEncoding, +}; use vortex::compute::take::take; use vortex::encoding::EncodingRef; -use vortex_alp::{ALPArray, ALPEncoding}; -use vortex_dict::{DictArray, DictEncoding}; +use vortex::ToStatic; +use vortex::{ArrayDType, ArrayData, IntoArray, OwnedArray}; +use vortex::{ArrayDef, IntoArrayData}; +use vortex_alp::{ALPArray, ALPEncoding, OwnedALPArray, ALP}; +use vortex_dict::{Dict, DictArray, DictEncoding, OwnedDictArray}; use vortex_fastlanes::{ - BitPackedArray, BitPackedEncoding, DeltaArray, DeltaEncoding, FoRArray, FoREncoding, + BitPacked, BitPackedArray, BitPackedEncoding, Delta, DeltaArray, DeltaEncoding, FoR, FoRArray, + FoREncoding, OwnedBitPackedArray, OwnedDeltaArray, OwnedFoRArray, +}; +use vortex_ree::{OwnedREEArray, REEArray, REEEncoding, REE}; +use vortex_roaring::{ + OwnedRoaringBoolArray, OwnedRoaringIntArray, RoaringBool, RoaringBoolArray, + RoaringBoolEncoding, RoaringInt, RoaringIntArray, RoaringIntEncoding, }; -use vortex_ree::{REEArray, REEEncoding}; -use vortex_roaring::{RoaringBoolArray, RoaringBoolEncoding, RoaringIntArray, RoaringIntEncoding}; -use vortex_zigzag::{ZigZagArray, ZigZagEncoding}; use crate::dtype::PyDType; use crate::error::PyVortexError; @@ -29,7 +34,7 @@ use crate::vortex_arrow; #[pyclass(name = "Array", module = "vortex", sequence, subclass)] pub struct PyArray { - inner: ArrayRef, + inner: OwnedArray, } macro_rules! pyarray { @@ -37,14 +42,14 @@ macro_rules! pyarray { paste! { #[pyclass(name = $TName, module = "vortex", extends = PyArray, sequence, subclass)] pub struct [] { - inner: Arc<$T>, + inner: [], #[allow(dead_code)] encoding: EncodingRef, } impl [] { - pub fn wrap(py: Python<'_>, inner: Arc<$T>) -> PyResult> { - let init = PyClassInitializer::from(PyArray { inner: inner.clone() }) + pub fn wrap(py: Python<'_>, inner: []) -> PyResult> { + let init = PyClassInitializer::from(PyArray { inner: inner.array().to_static() }) .add_subclass([] { inner, encoding: &$E }); Py::new(py, init) } @@ -75,97 +80,137 @@ pyarray!(DictEncoding, DictArray, "DictArray"); pyarray!(REEEncoding, REEArray, "REEArray"); pyarray!(RoaringBoolEncoding, RoaringBoolArray, "RoaringBoolArray"); pyarray!(RoaringIntEncoding, RoaringIntArray, "RoaringIntArray"); -pyarray!(ZigZagEncoding, ZigZagArray, "ZigZagArray"); +// pyarray!(ZigZagEncoding, ZigZagArray, "ZigZagArray"); impl PyArray { - pub fn wrap(py: Python<'_>, inner: ArrayRef) -> PyResult> { + pub fn wrap(py: Python<'_>, inner: ArrayData) -> PyResult> { // This is the one place where we'd want to have owned kind enum but there's no other place this is used - match ArrayKind::from(inner.as_ref()) { - ArrayKind::Bool(_) => { - PyBoolArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Chunked(_) => { - PyChunkedArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Composite(_) => { - PyCompositeArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Constant(_) => { - PyConstantArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Primitive(_) => { - PyPrimitiveArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Sparse(_) => { - PySparseArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::Struct(_) => { - PyStructArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::VarBin(_) => { - PyVarBinArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - ArrayKind::VarBinView(_) => PyVarBinViewArray::wrap( - py, - inner.into_any().downcast::().unwrap(), - )? - .extract(py), - ArrayKind::Other(other) => match other.encoding().id() { - // PyEnc chooses to expose certain encodings as first-class objects. - // For the remainder, we should have a generic EncArray implementation that supports basic functions. - ALPEncoding::ID => { - PyALPArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - DeltaEncoding::ID => { - PyDeltaArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - DictEncoding::ID => { - PyDictArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - FoREncoding::ID => { - PyFoRArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - BitPackedEncoding::ID => PyBitPackedArray::wrap( - py, - inner.into_any().downcast::().unwrap(), - )? - .extract(py), - REEEncoding::ID => { - PyREEArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - RoaringBoolEncoding::ID => PyRoaringBoolArray::wrap( - py, - inner.into_any().downcast::().unwrap(), - )? - .extract(py), - RoaringIntEncoding::ID => PyRoaringIntArray::wrap( - py, - inner.into_any().downcast::().unwrap(), - )? - .extract(py), - ZigZagEncoding::ID => { - PyZigZagArray::wrap(py, inner.into_any().downcast::().unwrap())? - .extract(py) - } - _ => Py::new(py, Self { inner }), - }, + match inner.encoding().id() { + Bool::ID => PyBoolArray::wrap( + py, + OwnedBoolArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + Chunked::ID => PyChunkedArray::wrap( + py, + OwnedChunkedArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + Composite::ID => PyCompositeArray::wrap( + py, + OwnedCompositeArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + Constant::ID => PyConstantArray::wrap( + py, + OwnedConstantArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + Primitive::ID => PyPrimitiveArray::wrap( + py, + OwnedPrimitiveArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + Sparse::ID => PySparseArray::wrap( + py, + OwnedSparseArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + Struct::ID => PyStructArray::wrap( + py, + OwnedStructArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + VarBin::ID => PyVarBinArray::wrap( + py, + OwnedVarBinArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + VarBinView::ID => PyVarBinViewArray::wrap( + py, + OwnedVarBinViewArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + Dict::ID => PyDictArray::wrap( + py, + OwnedDictArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + REE::ID => PyREEArray::wrap( + py, + OwnedREEArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + Delta::ID => PyDeltaArray::wrap( + py, + OwnedDeltaArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + FoR::ID => PyFoRArray::wrap( + py, + OwnedFoRArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + BitPacked::ID => PyBitPackedArray::wrap( + py, + OwnedBitPackedArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + + ALP::ID => PyALPArray::wrap( + py, + OwnedALPArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, + )? + .extract(py), + RoaringBool::ID => PyBitPackedArray::wrap( + py, + OwnedBitPackedArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + RoaringInt::ID => PyBitPackedArray::wrap( + py, + OwnedBitPackedArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + _ => Py::new( + py, + Self { + inner: inner.into_array(), + }, + ), + // ArrayKind::Other(other) => match other.encoding().id() { + // // PyEnc chooses to expose certain encodings as first-class objects. + // // For the remainder, we should have a generic EncArray implementation that supports basic functions. + // ALPEncoding::ID => { + // PyALPArray::wrap(py, inner.into_any().downcast::().unwrap())? + // .extract(py) + // } + // RoaringBoolEncoding::ID => PyRoaringBoolArray::wrap( + // py, + // inner.into_any().downcast::().unwrap(), + // )? + // .extract(py), + // RoaringIntEncoding::ID => PyRoaringIntArray::wrap( + // py, + // inner.into_any().downcast::().unwrap(), + // )? + // .extract(py), + // ZigZagEncoding::ID => { + // PyZigZagArray::wrap(py, inner.into_any().downcast::().unwrap())? + // .extract(py) + // } + // _ => Py::new(py, Self { inner }), + //}, } } - pub fn unwrap(&self) -> &ArrayRef { + pub fn unwrap(&self) -> &OwnedArray { &self.inner } } @@ -202,36 +247,36 @@ impl PyArray { fn take(&self, indices: PyRef<'_, PyArray>) -> PyResult> { take(&self.inner, indices.unwrap()) .map_err(PyVortexError::map_err) - .and_then(|arr| PyArray::wrap(indices.py(), arr)) - } -} - -#[pymethods] -impl PyRoaringBoolArray { - #[staticmethod] - fn encode(array: PyRef<'_, PyArray>) -> PyResult> { - RoaringBoolArray::encode(array.unwrap()) - .map_err(PyVortexError::map_err) - .and_then(|zarray| PyArray::wrap(array.py(), zarray.into_array())) - } -} - -#[pymethods] -impl PyRoaringIntArray { - #[staticmethod] - fn encode(array: PyRef<'_, PyArray>) -> PyResult> { - RoaringIntArray::encode(array.unwrap()) - .map_err(PyVortexError::map_err) - .and_then(|zarray| PyArray::wrap(array.py(), zarray.into_array())) - } -} - -#[pymethods] -impl PyZigZagArray { - #[staticmethod] - fn encode(array: PyRef<'_, PyArray>) -> PyResult> { - ZigZagArray::encode(array.unwrap()) - .map_err(PyVortexError::map_err) - .and_then(|zarray| PyArray::wrap(array.py(), zarray)) + .and_then(|arr| PyArray::wrap(indices.py(), arr.into_array_data())) } } +// +// #[pymethods] +// impl PyRoaringBoolArray { +// #[staticmethod] +// fn encode(array: PyRef<'_, PyArray>) -> PyResult> { +// RoaringBoolArray::encode(array.unwrap()) +// .map_err(PyVortexError::map_err) +// .and_then(|zarray| PyArray::wrap(array.py(), zarray.into_array())) +// } +// } +// +// #[pymethods] +// impl PyRoaringIntArray { +// #[staticmethod] +// fn encode(array: PyRef<'_, PyArray>) -> PyResult> { +// RoaringIntArray::encode(array.unwrap()) +// .map_err(PyVortexError::map_err) +// .and_then(|zarray| PyArray::wrap(array.py(), zarray.into_array())) +// } +// } +// +// #[pymethods] +// impl PyZigZagArray { +// #[staticmethod] +// fn encode(array: PyRef<'_, PyArray>) -> PyResult> { +// ZigZagArray::encode(array.unwrap()) +// .map_err(PyVortexError::map_err) +// .and_then(|zarray| PyArray::wrap(array.py(), zarray)) +// } +// } diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index c17aabc849..9c9342662a 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use pyo3::types::PyType; use pyo3::{pyclass, pyfunction, pymethods, Py, PyResult, Python}; use vortex::compress::{CompressConfig, CompressCtx}; -use vortex::encoding::ENCODINGS; use crate::array::PyArray; use crate::error::PyVortexError; diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs index 270268b917..6d1382e053 100644 --- a/pyvortex/src/encode.rs +++ b/pyvortex/src/encode.rs @@ -1,4 +1,4 @@ -use arrow::array::{make_array, ArrayData}; +use arrow::array::{make_array, ArrayData as ArrowArrayData}; use arrow::datatypes::{DataType, Field}; use arrow::ffi_stream::ArrowArrayStreamReader; use arrow::pyarrow::FromPyArrow; @@ -6,13 +6,12 @@ use arrow::record_batch::RecordBatchReader; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use vortex::array::chunked::ChunkedArray; -use vortex::array::IntoArray; -use vortex::array::{Array, ArrayRef}; -use vortex::arrow::FromArrowType; -use vortex::encode::FromArrowArray; +use vortex::arrow::{FromArrowArray, FromArrowType}; +use vortex::{ArrayData, IntoArray, ToArrayData}; use vortex_schema::DType; use crate::array::PyArray; +use crate::error::PyVortexError; use crate::vortex_arrow::map_arrow_err; /// The main entry point for creating enc arrays from other Python objects. @@ -25,35 +24,45 @@ pub fn encode(obj: &PyAny) -> PyResult> { let table = pa.getattr("Table")?; if obj.is_instance(pa_array)? { - let arrow_array = ArrayData::from_pyarrow(obj).map(make_array)?; - let enc_array = ArrayRef::from_arrow(arrow_array, false); + let arrow_array = ArrowArrayData::from_pyarrow(obj).map(make_array)?; + let enc_array = ArrayData::from_arrow(arrow_array, false); PyArray::wrap(obj.py(), enc_array) } else if obj.is_instance(chunked_array)? { let chunks: Vec<&PyAny> = obj.getattr("chunks")?.extract()?; let encoded_chunks = chunks .iter() .map(|a| { - ArrayData::from_pyarrow(a) + ArrowArrayData::from_pyarrow(a) .map(make_array) - .map(|a| ArrayRef::from_arrow(a, false)) + .map(|a| ArrayData::from_arrow(a, false).into_array()) }) - .collect::>>()?; + .collect::>>()?; let dtype: DType = obj .getattr("type") .and_then(DataType::from_pyarrow) .map(|dt| DType::from_arrow(&Field::new("_", dt, false)))?; PyArray::wrap( obj.py(), - ChunkedArray::new(encoded_chunks, dtype).into_array(), + ChunkedArray::try_new(encoded_chunks, dtype) + .map_err(PyVortexError::map_err)? + .to_array_data(), ) } else if obj.is_instance(table)? { let array_stream = ArrowArrayStreamReader::from_pyarrow(obj)?; let dtype = DType::from_arrow(array_stream.schema()); let chunks = array_stream .into_iter() - .map(|b| b.map(|bb| bb.into_array()).map_err(map_arrow_err)) - .collect::>>()?; - PyArray::wrap(obj.py(), ChunkedArray::new(chunks, dtype).into_array()) + .map(|b| { + b.map(|bb| bb.to_array_data().into_array()) + .map_err(map_arrow_err) + }) + .collect::>>()?; + PyArray::wrap( + obj.py(), + ChunkedArray::try_new(chunks, dtype) + .map_err(PyVortexError::map_err)? + .to_array_data(), + ) } else { Err(PyValueError::new_err("Cannot convert object to enc array")) } diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index e8a33287fc..fb3d08b160 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -1,18 +1,16 @@ use dtype::PyDType; use log::debug; use pyo3::prelude::*; +use vortex::encoding::VORTEX_ENCODINGS; use vortex_schema::DType; use vortex_schema::Signedness::{Signed, Unsigned}; use crate::array::*; -use crate::compress::PyCompressConfig; mod array; -mod compress; mod dtype; mod encode; mod error; -mod serde; mod vortex_arrow; /// A Python module implemented in Rust. @@ -22,16 +20,14 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { debug!( "Discovered encodings: {:?}", - vortex::encoding::ENCODINGS + VORTEX_ENCODINGS .iter() .map(|e| e.id().to_string()) .collect::>() ); m.add_function(wrap_pyfunction!(encode::encode, m)?)?; - m.add_function(wrap_pyfunction!(compress::compress, m)?)?; - m.add_function(wrap_pyfunction!(serde::write, m)?)?; - m.add_function(wrap_pyfunction!(serde::read, m)?)?; + // m.add_function(wrap_pyfunction!(compress::compress, m)?)?; m.add_class::()?; m.add_class::()?; @@ -40,6 +36,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -49,12 +46,11 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + // m.add_class::()?; + m.add_class::()?; m.add_class::()?; - m.add_class::()?; - m.add_function(wrap_pyfunction!(dtype_int, m)?)?; m.add_function(wrap_pyfunction!(dtype_uint, m)?)?; m.add_function(wrap_pyfunction!(dtype_float, m)?)?; diff --git a/pyvortex/src/serde.rs b/pyvortex/src/serde.rs deleted file mode 100644 index 97f27df2ac..0000000000 --- a/pyvortex/src/serde.rs +++ /dev/null @@ -1,105 +0,0 @@ -use std::io; -use std::io::{ErrorKind, Read, Write}; - -use pyo3::exceptions::{PyTypeError, PyValueError}; -use pyo3::{ffi, pyfunction, FromPyPointer, IntoPy, Py, PyAny, PyResult, Python}; -use vortex::serde::{ReadCtx, WriteCtx}; - -use crate::array::PyArray; -use crate::dtype::PyDType; - -#[pyfunction] -pub fn read(py: Python<'_>, schema: &PyDType, read: &PyAny) -> PyResult> { - if !read.hasattr("readinto")? { - return Err(PyTypeError::new_err( - "reader has to support `readinto` method", - )); - } - let read_no_gil: Py = read.into_py(py); - let mut pyread = PyRead::new(read_no_gil); - let mut ctx = ReadCtx::new(schema.unwrap(), &mut pyread); - ctx.read() - .map_err(|e| PyValueError::new_err(e.to_string())) - .and_then(|arr| PyArray::wrap(py, arr)) -} - -struct PyRead { - pyref: Py, -} - -impl PyRead { - pub fn new(pyref: Py) -> Self { - Self { pyref } - } -} - -impl Read for PyRead { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - Python::with_gil(|py| { - let view = unsafe { - let v = ffi::PyMemoryView_FromMemory( - buf.as_mut_ptr() as _, - buf.len() as ffi::Py_ssize_t, - ffi::PyBUF_WRITE, - ); - PyAny::from_owned_ptr(py, v) - }; - self.pyref - .call_method(py, "readinto", (view,), None) - .and_then(|v| v.extract(py)) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) - }) - } -} - -#[pyfunction] -pub fn write(py: Python<'_>, arr: &PyArray, write: &PyAny) -> PyResult<()> { - if !write.hasattr("write")? && !write.hasattr("flush")? { - return Err(PyTypeError::new_err( - "writer has to support `write` and `flush` methods", - )); - } - let write_no_gil: Py = write.into_py(py); - let mut pywrite = PyWrite::new(write_no_gil); - let mut ctx = WriteCtx::new(&mut pywrite); - ctx.write(arr.unwrap()) - .map_err(|e| PyValueError::new_err(e.to_string())) -} - -struct PyWrite { - pyref: Py, -} - -impl PyWrite { - pub fn new(pyref: Py) -> Self { - Self { pyref } - } -} - -impl Write for PyWrite { - fn write(&mut self, buf: &[u8]) -> io::Result { - Python::with_gil(|py| { - let view = unsafe { - let v = ffi::PyMemoryView_FromMemory( - buf.as_ptr() as _, - buf.len() as ffi::Py_ssize_t, - ffi::PyBUF_READ, - ); - PyAny::from_owned_ptr(py, v) - }; - self.pyref - .call_method(py, "write", (view,), None) - .and_then(|v| v.extract(py)) - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e)) - }) - } - - fn flush(&mut self) -> io::Result<()> { - Python::with_gil(|py| { - self.pyref - .call_method0(py, "flush") - .map(|_| ()) - .map_err(|e| io::Error::new(ErrorKind::InvalidInput, e)) - }) - } -} diff --git a/pyvortex/src/vortex_arrow.rs b/pyvortex/src/vortex_arrow.rs index c9d58d641b..449330a937 100644 --- a/pyvortex/src/vortex_arrow.rs +++ b/pyvortex/src/vortex_arrow.rs @@ -4,17 +4,19 @@ use arrow::pyarrow::ToPyArrow; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyList}; -use vortex::array::Array; use vortex::compute::as_arrow::as_arrow_chunks; +use vortex::Array; + +use crate::error::PyVortexError; pub fn map_arrow_err(error: ArrowError) -> PyErr { PyValueError::new_err(error.to_string()) } -pub fn export_array<'py>(py: Python<'py>, array: &dyn Array) -> PyResult<&'py PyAny> { +pub fn export_array<'py>(py: Python<'py>, array: &Array<'_>) -> PyResult<&'py PyAny> { // NOTE(ngates): for struct arrays, we could also return a RecordBatchStreamReader. // NOTE(robert): Return RecordBatchStreamReader always? - let chunks = as_arrow_chunks(array).unwrap(); + let chunks = as_arrow_chunks(array).map_err(PyVortexError::map_err)?; if chunks.is_empty() { return Err(PyValueError::new_err("No chunks in array")); } diff --git a/pyvortex/test/test_compress.py b/pyvortex/test/test_compress.py index 9f5406adac..e360da103c 100644 --- a/pyvortex/test/test_compress.py +++ b/pyvortex/test/test_compress.py @@ -8,6 +8,7 @@ import vortex +@pytest.mark.xfail(reason="Not yet implemented") def test_primitive_compress(): a = pa.array([0, 0, 0, 0, 9, 9, 9, 9, 1, 5]) arr_compressed = vortex.compress(vortex.encode(a)) @@ -15,12 +16,14 @@ def test_primitive_compress(): assert arr_compressed.nbytes < a.nbytes +@pytest.mark.xfail(reason="Not yet implemented") def test_for_compress(): a = pa.array(np.arange(10_000) + 10_000_000) arr_compressed = vortex.compress(vortex.encode(a)) assert not isinstance(arr_compressed, vortex.PrimitiveArray) +@pytest.mark.xfail(reason="Not yet implemented") def test_bool_compress(): a = vortex.encode(pa.array([False] * 10_000 + [True] * 10_000)) arr_compressed = vortex.compress(a) @@ -29,6 +32,7 @@ def test_bool_compress(): assert arr_compressed.nbytes < a.nbytes +@pytest.mark.xfail(reason="Not yet implemented") def test_roaring_bool_encode(): a = vortex.encode(pa.array([True] * 10_000)) rarr = vortex.RoaringBoolArray.encode(a) @@ -36,6 +40,7 @@ def test_roaring_bool_encode(): assert rarr.nbytes < a.nbytes +@pytest.mark.xfail(reason="Not yet implemented") def test_arange_encode(): a = vortex.encode(pa.array(np.arange(10_000), type=pa.uint32())) compressed = vortex.compress(a) @@ -43,6 +48,7 @@ def test_arange_encode(): assert compressed.nbytes < a.nbytes +@pytest.mark.xfail(reason="Not yet implemented") def test_zigzag_encode(): a = vortex.encode(pa.array([-1, -1, 0, -1, 1, -1])) zarr = vortex.ZigZagArray.encode(a) diff --git a/pyvortex/test/test_serde.py b/pyvortex/test/test_serde.py deleted file mode 100644 index 175e473ac3..0000000000 --- a/pyvortex/test/test_serde.py +++ /dev/null @@ -1,18 +0,0 @@ -import pyarrow as pa -import vortex -from pyarrow import fs - -local = fs.LocalFileSystem() - - -def test_serde(tmp_path): - a = pa.array([0, 1, 2, 3]) - arr = vortex.encode(a) - assert isinstance(arr, vortex.PrimitiveArray) - subfs = fs.SubTreeFileSystem(str(tmp_path), local) - with subfs.open_output_stream("array.enc", buffer_size=8192) as nf: - vortex.write(arr, nf) - - with subfs.open_input_stream("array.enc", buffer_size=8192) as nf: - read_array = vortex.read(arr.dtype, nf) - assert isinstance(read_array, vortex.PrimitiveArray) diff --git a/vortex-alp/Cargo.toml b/vortex-alp/Cargo.toml index 9f9cee7fc2..5740705e83 100644 --- a/vortex-alp/Cargo.toml +++ b/vortex-alp/Cargo.toml @@ -15,13 +15,15 @@ rust-version = { workspace = true } workspace = true [dependencies] +itertools = { workspace = true } +linkme = { workspace = true } +log = { workspace = true } +num-traits = { workspace = true } +paste = { workspace = true } +serde = { workspace = true, features = ["derive"] } vortex-array = { path = "../vortex-array" } vortex-error = { path = "../vortex-error" } vortex-schema = { path = "../vortex-schema" } -linkme = { workspace = true } -itertools = { workspace = true } -num-traits = { workspace = true } -log = { workspace = true } [dev-dependencies] divan = { workspace = true } diff --git a/vortex-alp/src/alp.rs b/vortex-alp/src/alp.rs index 6a3a5c7095..f8251a96da 100644 --- a/vortex-alp/src/alp.rs +++ b/vortex-alp/src/alp.rs @@ -2,10 +2,11 @@ use std::mem::size_of; use itertools::Itertools; use num_traits::{Float, NumCast, PrimInt, Zero}; +use serde::{Deserialize, Serialize}; const SAMPLE_SIZE: usize = 32; -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct Exponents { pub e: u8, pub f: u8, diff --git a/vortex-alp/src/array.rs b/vortex-alp/src/array.rs index 4de69a3e4a..ea5e2c330b 100644 --- a/vortex-alp/src/array.rs +++ b/vortex-alp/src/array.rs @@ -1,39 +1,31 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::{ArrayValidity, Validity}; -use vortex::{impl_array, ArrayWalker}; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::{DType, IntWidth, Signedness}; +use serde::{Deserialize, Serialize}; +use vortex::array::primitive::PrimitiveArray; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, IntoArrayData, OwnedArray, ToArrayData}; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::{IntWidth, Signedness}; use crate::alp::Exponents; -use crate::compress::alp_encode; +use crate::compress::{alp_encode, decompress}; -#[derive(Debug, Clone)] -pub struct ALPArray { - encoded: ArrayRef, +impl_encoding!("vortex.alp", ALP); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ALPMetadata { exponents: Exponents, - patches: Option, - dtype: DType, - stats: Arc>, + encoded_dtype: DType, + patches_dtype: Option, } -impl ALPArray { - pub fn new(encoded: ArrayRef, exponents: Exponents, patches: Option) -> Self { - Self::try_new(encoded, exponents, patches).unwrap() - } - +impl ALPArray<'_> { pub fn try_new( - encoded: ArrayRef, + encoded: Array, exponents: Exponents, - patches: Option, + patches: Option, ) -> VortexResult { + let encoded_dtype = encoded.dtype().clone(); let dtype = match encoded.dtype() { DType::Int(IntWidth::_32, Signedness::Signed, nullability) => { DType::Float(32.into(), *nullability) @@ -43,120 +35,88 @@ impl ALPArray { } d => vortex_bail!(MismatchedTypes: "int32 or int64", d), }; - Ok(Self { - encoded, - exponents, - patches, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Primitive(p) => Ok(alp_encode(p)?.into_array()), - _ => Err(vortex_err!("ALP can only encoding primitive arrays")), + let mut children = Vec::with_capacity(2); + children.push(encoded.into_array_data()); + if let Some(ref patch) = patches { + children.push(patch.to_array_data()); + } + + Self::try_from_parts( + dtype, + ALPMetadata { + exponents, + encoded_dtype, + patches_dtype: patches.map(|a| a.dtype().as_nullable()), + }, + children.into(), + Default::default(), + ) + } + + pub fn encode(array: Array<'_>) -> VortexResult { + if let Ok(parray) = PrimitiveArray::try_from(array) { + Ok(alp_encode(&parray)?.into_array()) + } else { + vortex_bail!("ALP can only encode primitive arrays"); } } - pub fn encoded(&self) -> &ArrayRef { - &self.encoded + pub fn encoded(&self) -> Array { + self.array() + .child(0, &self.metadata().encoded_dtype) + .expect("Missing encoded array") } pub fn exponents(&self) -> &Exponents { - &self.exponents + &self.metadata().exponents } - pub fn patches(&self) -> Option<&ArrayRef> { - self.patches.as_ref() + pub fn patches(&self) -> Option { + self.metadata().patches_dtype.as_ref().map(|dt| { + self.array() + .child(1, dt) + .expect("Missing patches with present metadata flag") + }) } } -impl Array for ALPArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.encoded.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.encoded.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &ALPEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.encoded().nbytes() + self.patches().map(|p| p.nbytes()).unwrap_or(0) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) +impl ArrayValidity for ALPArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.encoded().with_dyn(|a| a.is_valid(index)) } - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.encoded()) + fn logical_validity(&self) -> LogicalValidity { + self.encoded().with_dyn(|a| a.logical_validity()) } } -impl ArrayDisplay for ALPArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("exponents", format!("{:?}", self.exponents()))?; - f.child("encoded", self.encoded())?; - f.maybe_child("patches", self.patches()) +impl ArrayFlatten for ALPArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + decompress(self).map(Flattened::Primitive) } } -impl ArrayValidity for ALPArray { - fn logical_validity(&self) -> Validity { - self.encoded().logical_validity() - } - - fn is_valid(&self, index: usize) -> bool { - self.encoded().is_valid(index) +impl AcceptArrayVisitor for ALPArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("encoded", &self.encoded())?; + if self.patches().is_some() { + visitor.visit_child( + "patches", + &self.patches().expect("Expected patches to be present "), + )?; + } + Ok(()) } } -#[derive(Debug)] -pub struct ALPEncoding; +impl ArrayStatisticsCompute for ALPArray<'_> {} -impl ALPEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.alp"); -} - -impl Encoding for ALPEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for ALPArray<'_> { + fn len(&self) -> usize { + self.encoded().len() } } diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index a6bd6e6a10..389eb1044d 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -1,20 +1,16 @@ use itertools::Itertools; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::sparse::{SparseArray, SparseEncoding}; -use vortex::array::{Array, ArrayRef}; +use vortex::array::sparse::{Sparse, SparseArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::compute::flatten::flatten_primitive; use vortex::ptype::{NativePType, PType}; use vortex::scalar::Scalar; -use vortex::validity::OwnedValidity; -use vortex::view::ToOwnedView; +use vortex::validity::Validity; +use vortex::{Array, ArrayDType, ArrayDef, AsArray, IntoArray, OwnedArray}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use crate::alp::ALPFloat; use crate::array::{ALPArray, ALPEncoding}; -use crate::downcast::DowncastALP; -use crate::Exponents; +use crate::{Exponents, OwnedALPArray}; #[macro_export] macro_rules! match_each_alp_float_ptype { @@ -34,11 +30,11 @@ macro_rules! match_each_alp_float_ptype { impl EncodingCompression for ALPEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive arrays - let parray = array.maybe_primitive()?; + let parray = PrimitiveArray::try_from(array).ok()?; // Only supports f32 and f64 if !matches!(parray.ptype(), PType::F32 | PType::F64) { @@ -50,41 +46,44 @@ impl EncodingCompression for ALPEncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { - let like_alp = like.map(|like_array| like_array.as_alp()); + ) -> VortexResult> { + let like_alp = like.map(|like_array| like_array.as_array_ref()); + let like_exponents = like + .map(|like_array| ALPArray::try_from(like_array).unwrap()) + .map(|a| a.exponents().to_owned()); // TODO(ngates): fill forward nulls let parray = array.as_primitive(); let (exponents, encoded, patches) = match_each_alp_float_ptype!( parray.ptype(), |$T| { - encode_to_array::<$T>(parray, like_alp.map(|l| l.exponents())) + encode_to_array::<$T>(&parray, like_exponents.as_ref()) })?; let compressed_encoded = ctx .named("packed") .excluding(&ALPEncoding) - .compress(encoded.as_ref(), like_alp.map(|a| a.encoded()))?; + .compress(encoded.as_array_ref(), like_alp)?; let compressed_patches = patches .map(|p| { ctx.auxiliary("patches") .excluding(&ALPEncoding) - .compress(p.as_ref(), like_alp.and_then(|a| a.patches())) + .compress(p.as_array_ref(), like_alp) }) .transpose()?; - Ok(ALPArray::new(compressed_encoded, exponents, compressed_patches).into_array()) + ALPArray::try_new(compressed_encoded, exponents, compressed_patches).map(|a| a.into_array()) } } fn encode_to_array( values: &PrimitiveArray, exponents: Option<&Exponents>, -) -> (Exponents, ArrayRef, Option) +) -> (Exponents, OwnedArray, Option) where T: ALPFloat + NativePType, T::ALPInt: NativePType, @@ -93,13 +92,11 @@ where let len = encoded.len(); ( exponents, - PrimitiveArray::from(encoded) - .into_nullable(values.nullability()) - .into_array(), + PrimitiveArray::from_vec(encoded, values.validity()).into_array(), (!exc.is_empty()).then(|| { SparseArray::new( PrimitiveArray::from(exc_pos).into_array(), - PrimitiveArray::from(exc).into_array(), + PrimitiveArray::from_vec(exc, Validity::AllValid).into_array(), len, Scalar::null(&values.dtype().as_nullable()), ) @@ -108,41 +105,46 @@ where ) } -pub(crate) fn alp_encode(parray: &PrimitiveArray) -> VortexResult { +pub(crate) fn alp_encode(parray: &PrimitiveArray) -> VortexResult { let (exponents, encoded, patches) = match parray.ptype() { PType::F32 => encode_to_array::(parray, None), PType::F64 => encode_to_array::(parray, None), _ => vortex_bail!("ALP can only encode f32 and f64"), }; - Ok(ALPArray::new(encoded, exponents, patches)) + ALPArray::try_new(encoded, exponents, patches) } -pub fn decompress(array: &ALPArray) -> VortexResult { - let encoded = flatten_primitive(array.encoded())?; +pub fn decompress(array: ALPArray) -> VortexResult { + let encoded = array.encoded().clone().flatten_primitive()?; + let decoded = match_each_alp_float_ptype!(array.dtype().try_into().unwrap(), |$T| { - PrimitiveArray::from_nullable( + PrimitiveArray::from_vec( decompress_primitive::<$T>(encoded.typed_data(), array.exponents()), - encoded.validity().to_owned_view(), + encoded.validity(), ) })?; if let Some(patches) = array.patches() { - patch_decoded(decoded, patches) + patch_decoded(decoded, &patches) } else { Ok(decoded) } } -fn patch_decoded(array: PrimitiveArray, patches: &dyn Array) -> VortexResult { +fn patch_decoded<'a>( + array: PrimitiveArray<'a>, + patches: &Array, +) -> VortexResult> { match patches.encoding().id() { - SparseEncoding::ID => { + Sparse::ID => { match_each_alp_float_ptype!(array.ptype(), |$T| { + let typed_patches = SparseArray::try_from(patches).unwrap(); array.patch( - &patches.as_sparse().resolved_indices(), - flatten_primitive(patches.as_sparse().values())?.typed_data::<$T>())? + &typed_patches.resolved_indices(), + typed_patches.values().flatten_primitive()?.typed_data::<$T>())? }) } - _ => panic!("can't patch alp array with {}", patches), + _ => panic!("can't patch ALP array with {}", patches), } } @@ -166,28 +168,28 @@ mod tests { let encoded = alp_encode(&array).unwrap(); assert!(encoded.patches().is_none()); assert_eq!( - encoded.encoded().as_primitive().typed_data::(), + encoded.encoded().into_primitive().typed_data::(), vec![1234; 1025] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); - let decoded = decompress(&encoded).unwrap(); + let decoded = decompress(encoded).unwrap(); assert_eq!(array.typed_data::(), decoded.typed_data::()); } #[test] fn test_nullable_compress() { - let array = PrimitiveArray::from_iter(vec![None, Some(1.234f32), None]); + let array = PrimitiveArray::from_nullable_vec(vec![None, Some(1.234f32), None]); let encoded = alp_encode(&array).unwrap(); println!("Encoded {:?}", encoded); assert!(encoded.patches().is_none()); assert_eq!( - encoded.encoded().as_primitive().typed_data::(), + encoded.encoded().into_primitive().typed_data::(), vec![0, 1234, 0] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); - let decoded = decompress(&encoded).unwrap(); + let decoded = decompress(encoded).unwrap(); let expected = vec![0f32, 1.234f32, 0f32]; assert_eq!(decoded.typed_data::(), expected.as_slice()); } @@ -201,12 +203,12 @@ mod tests { println!("Encoded {:?}", encoded); assert!(encoded.patches().is_some()); assert_eq!( - encoded.encoded().as_primitive().typed_data::(), + encoded.encoded().into_primitive().typed_data::(), vec![1234i64, 2718, 2718, 4000] // fill forward ); assert_eq!(encoded.exponents(), &Exponents { e: 3, f: 0 }); - let decoded = decompress(&encoded).unwrap(); + let decoded = decompress(encoded).unwrap(); assert_eq!(values, decoded.typed_data::()); } } diff --git a/vortex-alp/src/compute.rs b/vortex-alp/src/compute.rs index 37920557e9..b5710c03ea 100644 --- a/vortex-alp/src/compute.rs +++ b/vortex-alp/src/compute.rs @@ -1,20 +1,14 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; use vortex::scalar::Scalar; +use vortex::{Array, ArrayDType, IntoArray, OwnedArray}; use vortex_error::VortexResult; -use crate::compress::decompress; -use crate::{match_each_alp_float_ptype, ALPArray, ALPFloat}; - -impl ArrayCompute for ALPArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } +use crate::{match_each_alp_float_ptype, ALPArray}; +impl ArrayCompute for ALPArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -28,19 +22,13 @@ impl ArrayCompute for ALPArray { } } -impl FlattenFn for ALPArray { - fn flatten(&self) -> VortexResult { - decompress(self).map(FlattenedArray::Primitive) - } -} - -impl ScalarAtFn for ALPArray { +impl ScalarAtFn for ALPArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - if let Some(patch) = self.patches().and_then(|p| scalar_at(p, index).ok()) { + if let Some(patch) = self.patches().and_then(|p| scalar_at(&p, index).ok()) { return Ok(patch); } - - let encoded_val = scalar_at(self.encoded(), index)?; + use crate::ALPFloat; + let encoded_val = scalar_at(&self.encoded(), index)?; match_each_alp_float_ptype!(self.dtype().try_into().unwrap(), |$T| { let encoded_val: <$T as ALPFloat>::ALPInt = encoded_val.try_into().unwrap(); Scalar::from(<$T as ALPFloat>::decode_single( @@ -51,24 +39,24 @@ impl ScalarAtFn for ALPArray { } } -impl TakeFn for ALPArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for ALPArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { // TODO(ngates): wrap up indices in an array that caches decompression? - Ok(ALPArray::new( - take(self.encoded(), indices)?, + Ok(ALPArray::try_new( + take(&self.encoded(), indices)?, self.exponents().clone(), - self.patches().map(|p| take(p, indices)).transpose()?, - ) + self.patches().map(|p| take(&p, indices)).transpose()?, + )? .into_array()) } } -impl SliceFn for ALPArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for ALPArray<'_> { + fn slice(&self, start: usize, end: usize) -> VortexResult { Ok(ALPArray::try_new( - slice(self.encoded(), start, stop)?, + slice(&self.encoded(), start, end)?, self.exponents().clone(), - self.patches().map(|p| slice(p, start, stop)).transpose()?, + self.patches().map(|p| slice(&p, start, end)).transpose()?, )? .into_array()) } diff --git a/vortex-alp/src/downcast.rs b/vortex-alp/src/downcast.rs deleted file mode 100644 index 1eee5d4ddb..0000000000 --- a/vortex-alp/src/downcast.rs +++ /dev/null @@ -1,31 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::ALPArray; - -mod private { - pub trait Sealed {} -} - -pub trait DowncastALP: private::Sealed { - fn maybe_alp(&self) -> Option<&ALPArray>; - - fn as_alp(&self) -> &ALPArray { - self.maybe_alp().unwrap() - } -} - -impl private::Sealed for dyn Array + '_ {} - -impl DowncastALP for dyn Array + '_ { - fn maybe_alp(&self) -> Option<&ALPArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastALP for ArrayRef { - fn maybe_alp(&self) -> Option<&ALPArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-alp/src/lib.rs b/vortex-alp/src/lib.rs index 9428b3cb98..b255ef0074 100644 --- a/vortex-alp/src/lib.rs +++ b/vortex-alp/src/lib.rs @@ -1,15 +1,7 @@ pub use alp::*; pub use array::*; -use linkme::distributed_slice; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod alp; mod array; mod compress; mod compute; -mod downcast; -mod serde; -mod stats; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_ALP: EncodingRef = &ALPEncoding; diff --git a/vortex-alp/src/serde.rs b/vortex-alp/src/serde.rs deleted file mode 100644 index 846ba87125..0000000000 --- a/vortex-alp/src/serde.rs +++ /dev/null @@ -1,92 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, FloatWidth, Signedness}; - -use crate::alp::Exponents; -use crate::ALPArray; -use crate::ALPEncoding; - -impl ArraySerde for ALPArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_optional_array(self.patches())?; - ctx.write_fixed_slice([self.exponents().e, self.exponents().f])?; - ctx.write(self.encoded()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(Some(vec![self.exponents().e, self.exponents().f])) - } -} - -impl EncodingSerde for ALPEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let patches = ctx.read_optional_array()?; - let exponents = ctx.read_nbytes::<2>()?; - let encoded_dtype = match ctx.schema() { - DType::Float(FloatWidth::_32, nullability) => { - DType::Int(32.into(), Signedness::Signed, *nullability) - } - DType::Float(FloatWidth::_64, nullability) => { - DType::Int(64.into(), Signedness::Signed, *nullability) - } - _ => vortex_bail!(MismatchedTypes: "f32 or f64", ctx.schema()), - }; - let encoded = ctx.with_schema(&encoded_dtype).read()?; - Ok(ALPArray::new( - encoded, - Exponents { - e: exponents[0], - f: exponents[1], - }, - patches, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use vortex::array::downcast::DowncastArrayBuiltin; - use vortex::array::primitive::PrimitiveArray; - use vortex::array::{Array, ArrayRef}; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - use crate::compress::alp_encode; - use crate::downcast::DowncastALP; - - fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } - - #[test] - fn roundtrip() { - let arr = alp_encode(&PrimitiveArray::from(vec![ - 0.00001f64, - 0.0004f64, - 1000000.0f64, - 0.33f64, - ])) - .unwrap(); - let read_arr = roundtrip_array(&arr).unwrap(); - - let read_alp = read_arr.as_alp(); - assert_eq!( - arr.encoded().as_primitive().buffer().typed_data::(), - read_alp - .encoded() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!(arr.exponents().e, read_alp.exponents().e); - assert_eq!(arr.exponents().f, read_alp.exponents().f); - } -} diff --git a/vortex-alp/src/stats.rs b/vortex-alp/src/stats.rs deleted file mode 100644 index 36a544530b..0000000000 --- a/vortex-alp/src/stats.rs +++ /dev/null @@ -1,13 +0,0 @@ -use std::collections::HashMap; - -use vortex::stats::{Stat, StatsCompute, StatsSet}; -use vortex_error::VortexResult; - -use crate::ALPArray; - -impl StatsCompute for ALPArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - // TODO(ngates): implement based on the encoded array - Ok(StatsSet::from(HashMap::new())) - } -} diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index fe5c0bf12a..8a23aa0c5e 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -37,9 +37,9 @@ paste = { workspace = true } rand = { workspace = true } thiserror = { workspace = true } vortex-alloc = { path = "../vortex-alloc" } -vortex-error = { path = "../vortex-error" } +vortex-error = { path = "../vortex-error", features = ["flexbuffers"] } vortex-flatbuffers = { path = "../vortex-flatbuffers" } -vortex-schema = { path = "../vortex-schema" } +vortex-schema = { path = "../vortex-schema", features = ["serde"] } serde = { workspace = true, features = ["derive"] } [build-dependencies] diff --git a/vortex-array/flatbuffers/array.fbs b/vortex-array/flatbuffers/array.fbs index 2dedd3927c..86961eedc6 100644 --- a/vortex-array/flatbuffers/array.fbs +++ b/vortex-array/flatbuffers/array.fbs @@ -6,10 +6,10 @@ enum Version: uint8 { table Array { version: Version = V0; + has_buffer: bool; encoding: uint16; metadata: [ubyte]; children: [Array]; - nbuffers: uint16; } root_type Array; diff --git a/vortex-array/flatbuffers/scalar.fbs b/vortex-array/flatbuffers/scalar.fbs new file mode 100644 index 0000000000..c756f8c65a --- /dev/null +++ b/vortex-array/flatbuffers/scalar.fbs @@ -0,0 +1,72 @@ +include "vortex-schema/flatbuffers/dtype.fbs"; + +namespace vortex.scalar; + +table Binary { + value: [ubyte]; +} + +table Bool { + value: bool; +} + +table List { + value: [Scalar]; +} + +table Null { +} + +// Since Rust doesn't support structs in a union, it would be very inefficient to wrap each primitive type in a table. +// So instead we store a PType and a byte vector. +enum PType: uint8 { + U8, + U16, + U32, + U64, + I8, + I16, + I32, + I64, + F16, + F32, + F64, +} + +table Primitive { + ptype: PType; + // TODO(ngates): this isn't an ideal way to store the bytes. + bytes: [ubyte]; +} + +table Struct_ { + names: [string]; + value: [Scalar]; +} + +table UTF8 { + value: string; +} + +table Composite { + value: Scalar; +} + +union Type { + Binary, + Bool, + List, + Null, + Primitive, + Struct_, + UTF8, + Composite, +} + +// TODO(ngates): separate out ScalarValue from Scalar, even in-memory, so we can avoid duplicating dtype information (e.g. Struct field names). +table Scalar { + type: Type; + nullability: bool; +} + +root_type Scalar; \ No newline at end of file diff --git a/vortex-array/src/accessor.rs b/vortex-array/src/accessor.rs index 46a51a46d3..fe61bdf768 100644 --- a/vortex-array/src/accessor.rs +++ b/vortex-array/src/accessor.rs @@ -1,5 +1,7 @@ -use crate::array::Array; +use vortex_error::VortexResult; -pub trait ArrayAccessor<'a, T>: Array { - fn value(&'a self, index: usize) -> Option; +pub trait ArrayAccessor { + fn with_iterator(&self, f: F) -> VortexResult + where + F: for<'a> FnOnce(&mut (dyn Iterator>)) -> R; } diff --git a/vortex-array2/src/array/bool/compute/as_arrow.rs b/vortex-array/src/array/bool/compute/as_arrow.rs similarity index 100% rename from vortex-array2/src/array/bool/compute/as_arrow.rs rename to vortex-array/src/array/bool/compute/as_arrow.rs diff --git a/vortex-array2/src/array/bool/compute/as_contiguous.rs b/vortex-array/src/array/bool/compute/as_contiguous.rs similarity index 85% rename from vortex-array2/src/array/bool/compute/as_contiguous.rs rename to vortex-array/src/array/bool/compute/as_contiguous.rs index bc2ecae0a6..969381e266 100644 --- a/vortex-array2/src/array/bool/compute/as_contiguous.rs +++ b/vortex-array/src/array/bool/compute/as_contiguous.rs @@ -4,10 +4,10 @@ use vortex_error::VortexResult; use crate::array::bool::BoolArray; use crate::compute::as_contiguous::AsContiguousFn; use crate::validity::Validity; -use crate::{Array, IntoArray}; +use crate::{Array, ArrayDType, IntoArray, OwnedArray}; impl AsContiguousFn for BoolArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { let validity = if self.dtype().is_nullable() { Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) } else { diff --git a/vortex-array2/src/array/bool/compute/fill.rs b/vortex-array/src/array/bool/compute/fill.rs similarity index 92% rename from vortex-array2/src/array/bool/compute/fill.rs rename to vortex-array/src/array/bool/compute/fill.rs index c4245ad64a..7987020dd5 100644 --- a/vortex-array2/src/array/bool/compute/fill.rs +++ b/vortex-array/src/array/bool/compute/fill.rs @@ -4,10 +4,10 @@ use vortex_schema::Nullability; use crate::array::bool::BoolArray; use crate::compute::fill::FillForwardFn; use crate::validity::ArrayValidity; -use crate::{Array, IntoArray, ToArrayData}; +use crate::{ArrayDType, IntoArray, OwnedArray, ToArrayData}; impl FillForwardFn for BoolArray<'_> { - fn fill_forward(&self) -> VortexResult> { + fn fill_forward(&self) -> VortexResult { if self.dtype().nullability() == Nullability::NonNullable { return Ok(self.to_array_data().into_array()); } diff --git a/vortex-array2/src/array/bool/compute/flatten.rs b/vortex-array/src/array/bool/compute/flatten.rs similarity index 100% rename from vortex-array2/src/array/bool/compute/flatten.rs rename to vortex-array/src/array/bool/compute/flatten.rs diff --git a/vortex-array/src/array/bool/compute/mod.rs b/vortex-array/src/array/bool/compute/mod.rs index 88623e9a21..11b780a2a2 100644 --- a/vortex-array/src/array/bool/compute/mod.rs +++ b/vortex-array/src/array/bool/compute/mod.rs @@ -1,30 +1,21 @@ -use std::sync::Arc; - -use arrow_array::{ArrayRef as ArrowArrayRef, BooleanArray as ArrowBoolArray}; -use arrow_buffer::buffer::BooleanBuffer; -use vortex_error::VortexResult; - use crate::array::bool::BoolArray; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::{Array, ArrayRef}; -use crate::arrow::wrappers::as_nulls; use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::AsContiguousFn; use crate::compute::fill::FillForwardFn; -use crate::compute::flatten::{FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; use crate::compute::ArrayCompute; -use crate::scalar::{BoolScalar, Scalar}; -use crate::validity::ArrayValidity; -use crate::validity::OwnedValidity; -use crate::validity::Validity; -use crate::view::AsView; +mod as_arrow; +mod as_contiguous; +mod fill; +mod flatten; +mod scalar_at; +mod slice; mod take; -impl ArrayCompute for BoolArray { +impl ArrayCompute for BoolArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } @@ -33,10 +24,6 @@ impl ArrayCompute for BoolArray { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { Some(self) } @@ -53,108 +40,3 @@ impl ArrayCompute for BoolArray { Some(self) } } - -impl AsArrowArray for BoolArray { - fn as_arrow(&self) -> VortexResult { - Ok(Arc::new(ArrowBoolArray::new( - self.buffer().clone(), - as_nulls(self.logical_validity())?, - ))) - } -} - -impl AsContiguousFn for BoolArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { - let validity: Option = if self.dtype().is_nullable() { - Some(Validity::from_iter( - arrays.iter().map(|a| a.logical_validity()), - )) - } else { - None - }; - - Ok(BoolArray::new( - BooleanBuffer::from( - arrays - .iter() - .flat_map(|a| a.as_bool().buffer().iter()) - .collect::>(), - ), - validity, - ) - .into_array()) - } -} - -impl FlattenFn for BoolArray { - fn flatten(&self) -> VortexResult { - Ok(FlattenedArray::Bool(self.clone())) - } -} - -impl ScalarAtFn for BoolArray { - fn scalar_at(&self, index: usize) -> VortexResult { - Ok(BoolScalar::try_new( - self.is_valid(index).then(|| self.buffer.value(index)), - self.nullability(), - ) - .unwrap() - .into()) - } -} - -impl FillForwardFn for BoolArray { - fn fill_forward(&self) -> VortexResult { - if self.validity().is_none() { - return Ok(Arc::new(self.clone())); - } - - let validity = self.validity().unwrap().to_bool_array(); - let bools = self.buffer(); - let mut last_value = false; - let filled = bools - .iter() - .zip(validity.buffer().iter()) - .map(|(v, valid)| { - if valid { - last_value = v; - } - last_value - }) - .collect::>(); - Ok(BoolArray::from(filled).into_array()) - } -} - -impl SliceFn for BoolArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(BoolArray::new( - self.buffer.slice(start, stop - start), - self.validity - .as_view() - .map(|v| v.slice(start, stop)) - .transpose()?, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use crate::array::bool::BoolArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::compute; - use crate::validity::OwnedValidity; - - #[test] - fn fill_forward() { - let barr = BoolArray::from_iter(vec![None, Some(false), None, Some(true), None]); - let filled = compute::fill::fill_forward(&barr).unwrap(); - let filled_bool = filled.as_bool(); - assert_eq!( - filled_bool.buffer().iter().collect::>(), - vec![false, false, false, true, true] - ); - assert!(filled_bool.validity().is_none()); - } -} diff --git a/vortex-array2/src/array/bool/compute/scalar_at.rs b/vortex-array/src/array/bool/compute/scalar_at.rs similarity index 87% rename from vortex-array2/src/array/bool/compute/scalar_at.rs rename to vortex-array/src/array/bool/compute/scalar_at.rs index 1897bdfc94..fd711acbd6 100644 --- a/vortex-array2/src/array/bool/compute/scalar_at.rs +++ b/vortex-array/src/array/bool/compute/scalar_at.rs @@ -1,9 +1,10 @@ -use vortex::scalar::{BoolScalar, Scalar}; use vortex_error::VortexResult; use crate::array::bool::BoolArray; use crate::compute::scalar_at::ScalarAtFn; +use crate::scalar::{BoolScalar, Scalar}; use crate::validity::ArrayValidity; +use crate::ArrayDType; impl ScalarAtFn for BoolArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { diff --git a/vortex-array/src/array/bool/compute/slice.rs b/vortex-array/src/array/bool/compute/slice.rs new file mode 100644 index 0000000000..675563b30e --- /dev/null +++ b/vortex-array/src/array/bool/compute/slice.rs @@ -0,0 +1,15 @@ +use vortex_error::VortexResult; + +use crate::array::bool::BoolArray; +use crate::compute::slice::SliceFn; +use crate::{IntoArray, OwnedArray}; + +impl SliceFn for BoolArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + BoolArray::try_new( + self.boolean_buffer().slice(start, stop - start), + self.validity().slice(start, stop)?, + ) + .map(|a| a.into_array()) + } +} diff --git a/vortex-array/src/array/bool/compute/take.rs b/vortex-array/src/array/bool/compute/take.rs index 6bd6fca41a..feea250e60 100644 --- a/vortex-array/src/array/bool/compute/take.rs +++ b/vortex-array/src/array/bool/compute/take.rs @@ -3,20 +3,20 @@ use num_traits::AsPrimitive; use vortex_error::VortexResult; use crate::array::bool::BoolArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::flatten::flatten_primitive; use crate::compute::take::TakeFn; use crate::match_each_integer_ptype; -use crate::validity::OwnedValidity; +use crate::AsArray; +use crate::IntoArray; +use crate::{Array, OwnedArray}; -impl TakeFn for BoolArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - let validity = self.validity().map(|v| v.take(indices)).transpose()?; - let indices = flatten_primitive(indices)?; +impl TakeFn for BoolArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + let validity = self.validity(); + let indices = indices.clone().flatten_primitive()?; match_each_integer_ptype!(indices.ptype(), |$I| { - Ok(BoolArray::from_nullable( - take_bool(self.buffer(), indices.typed_data::<$I>()), - validity, + Ok(BoolArray::from_vec( + take_bool(&self.boolean_buffer(), indices.typed_data::<$I>()), + validity.take(indices.as_array_ref())?, ).into_array()) }) } @@ -29,9 +29,9 @@ fn take_bool>(bools: &BooleanBuffer, indices: &[I]) -> Vec #[cfg(test)] mod test { use crate::array::bool::BoolArray; - use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::compute::take::take; + use crate::IntoArray; #[test] fn take_nullable() { @@ -41,11 +41,20 @@ mod test { Some(false), None, Some(false), - ]); - let res = take(&reference, &PrimitiveArray::from(vec![0, 3, 4])).unwrap(); + ]) + .into_array(); + + let b = BoolArray::try_from( + take( + &reference, + &PrimitiveArray::from(vec![0, 3, 4]).into_array(), + ) + .unwrap(), + ) + .unwrap(); assert_eq!( - res.as_bool().buffer(), - BoolArray::from_iter(vec![Some(false), None, Some(false)]).buffer() + b.boolean_buffer(), + BoolArray::from_iter(vec![Some(false), None, Some(false)]).boolean_buffer() ); } } diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs index 14b6e9dab2..946fb3bea0 100644 --- a/vortex-array/src/array/bool/mod.rs +++ b/vortex-array/src/array/bool/mod.rs @@ -1,178 +1,76 @@ -use std::sync::{Arc, RwLock}; - -use arrow_buffer::buffer::BooleanBuffer; -use linkme::distributed_slice; +use arrow_buffer::BooleanBuffer; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; use vortex_error::VortexResult; -use vortex_schema::{DType, Nullability}; - -use super::{Array, ArrayRef}; -use crate::array::IntoArray; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stat, Stats, StatsSet}; -use crate::validity::OwnedValidity; -use crate::validity::{Validity, ValidityView}; -use crate::view::AsView; -use crate::{impl_array, ArrayWalker}; + +use crate::buffer::Buffer; +use crate::validity::{ArrayValidity, ValidityMetadata}; +use crate::validity::{LogicalValidity, Validity}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{impl_encoding, ArrayFlatten}; mod compute; -mod serde; mod stats; -#[derive(Debug, Clone)] -pub struct BoolArray { - buffer: BooleanBuffer, - stats: Arc>, - validity: Option, -} - -impl BoolArray { - pub fn new(buffer: BooleanBuffer, validity: Option) -> Self { - Self::try_new(buffer, validity).unwrap() - } - - pub fn try_new(buffer: BooleanBuffer, validity: Option) -> VortexResult { - if let Some(v) = &validity { - assert_eq!(v.as_view().len(), buffer.len()); - } - Ok(Self { - buffer, - stats: Arc::new(RwLock::new(StatsSet::new())), - validity, - }) - } - - /// Create an all-null boolean array. - pub fn null(n: usize) -> Self { - BoolArray::new( - BooleanBuffer::from(vec![false; n]), - Some(Validity::Invalid(n)), - ) - } - - pub fn from_nullable(values: Vec, validity: Option) -> Self { - BoolArray::new(BooleanBuffer::from(values), validity) - } - - #[inline] - pub fn buffer(&self) -> &BooleanBuffer { - &self.buffer - } +impl_encoding!("vortex.bool", Bool); - pub fn into_buffer(self) -> BooleanBuffer { - self.buffer - } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct BoolMetadata { + validity: ValidityMetadata, + length: usize, } -impl Array for BoolArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.buffer.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.buffer.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - if self.validity().is_some() { - &DType::Bool(Nullability::Nullable) - } else { - &DType::Bool(Nullability::NonNullable) - } - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &BoolEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - (self.len() + 7) / 8 - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) +impl BoolArray<'_> { + pub fn buffer(&self) -> &Buffer { + self.array().buffer().expect("missing buffer") } - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) + pub fn boolean_buffer(&self) -> BooleanBuffer { + BooleanBuffer::new(BoolArray::buffer(self).clone().into(), 0, self.len()) } - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - if let Some(v) = self.validity() { - // FIXME(ngates): Validity to implement Array? - walker.visit_child(&v.to_array())?; - } - walker.visit_buffer(self.buffer.inner()) + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(0, &Validity::DTYPE)) } } -impl OwnedValidity for BoolArray { - fn validity(&self) -> Option { - self.validity.as_view() +impl BoolArray<'_> { + pub fn try_new(buffer: BooleanBuffer, validity: Validity) -> VortexResult { + Ok(Self { + typed: TypedArray::try_from_parts( + DType::Bool(validity.nullability()), + BoolMetadata { + validity: validity.to_metadata(buffer.len())?, + length: buffer.len(), + }, + Some(Buffer::Owned(buffer.into_inner())), + validity.into_array_data().into_iter().collect_vec().into(), + HashMap::default(), + )?, + }) } -} -impl ArrayDisplay for BoolArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - let true_count = self.stats().get_or_compute_or(0usize, &Stat::TrueCount); - let false_count = self.len() - true_count; - f.property("n_true", true_count)?; - f.property("n_false", false_count)?; - f.validity(self.validity()) + pub fn from_vec(bools: Vec, validity: Validity) -> Self { + let buffer = BooleanBuffer::from(bools); + Self::try_new(buffer, validity).unwrap() } } -#[derive(Debug)] -pub struct BoolEncoding; - -impl BoolEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.bool"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_BOOL: EncodingRef = &BoolEncoding; - -impl Encoding for BoolEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl From for OwnedBoolArray { + fn from(value: BooleanBuffer) -> Self { + BoolArray::try_new(value, Validity::NonNullable).unwrap() } } -impl From> for BoolArray { +impl From> for OwnedBoolArray { fn from(value: Vec) -> Self { - BoolArray::new(BooleanBuffer::from(value), None) - } -} - -impl IntoArray for Vec { - fn into_array(self) -> ArrayRef { - Arc::new(BoolArray::from(self)) + BoolArray::from_vec(value, Validity::NonNullable) } } -impl FromIterator> for BoolArray { +impl FromIterator> for OwnedBoolArray { fn from_iter>>(iter: I) -> Self { let iter = iter.into_iter(); let (lower, _) = iter.size_hint(); @@ -185,35 +83,54 @@ impl FromIterator> for BoolArray { }) .collect::>(); - if validity.is_empty() { - BoolArray::from(values) - } else { - BoolArray::new(BooleanBuffer::from(values), Some(validity.into())) - } + BoolArray::try_new(BooleanBuffer::from(values), Validity::from(validity)).unwrap() + } +} + +impl ArrayTrait for BoolArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } +impl ArrayFlatten for BoolArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + Ok(Flattened::Bool(self)) + } +} + +impl ArrayValidity for BoolArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) + } + + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) + } +} + +impl AcceptArrayVisitor for BoolArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_buffer(self.buffer())?; + visitor.visit_validity(&self.validity()) + } +} + +impl EncodingCompression for BoolEncoding {} + #[cfg(test)] -mod test { +mod tests { use crate::array::bool::BoolArray; - use crate::array::Array; use crate::compute::scalar_at::scalar_at; - use crate::compute::slice::slice; - - #[test] - fn slice_array() { - let arr = slice(&BoolArray::from(vec![true, true, false, false, true]), 1, 4).unwrap(); - assert_eq!(arr.len(), 3); - assert_eq!(scalar_at(&arr, 0).unwrap(), true.into()); - assert_eq!(scalar_at(&arr, 1).unwrap(), false.into()); - assert_eq!(scalar_at(&arr, 2).unwrap(), false.into()); - } + use crate::IntoArray; #[test] - fn nbytes() { - assert_eq!( - BoolArray::from(vec![true, true, false, false, true]).nbytes(), - 1 - ); + fn bool_array() { + let arr = BoolArray::from(vec![true, false, true]).into_array(); + let scalar: bool = scalar_at(&arr, 0).unwrap().try_into().unwrap(); + assert!(scalar); } } diff --git a/vortex-array/src/array/bool/serde.rs b/vortex-array/src/array/bool/serde.rs deleted file mode 100644 index 2e22a086a8..0000000000 --- a/vortex-array/src/array/bool/serde.rs +++ /dev/null @@ -1,47 +0,0 @@ -use arrow_buffer::buffer::BooleanBuffer; -use vortex_error::VortexResult; - -use crate::array::bool::{BoolArray, BoolEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, ArrayView, BytesSerde, EncodingSerde, ReadCtx, WriteCtx}; -use crate::validity::OwnedValidity; - -impl ArraySerde for BoolArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_validity(self.validity())?; - ctx.write_buffer(self.len(), &self.buffer().sliced()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(Some(self.len().serialize())) - } -} - -impl EncodingSerde for BoolEncoding { - fn len(&self, view: &ArrayView) -> usize { - usize::deserialize(view.metadata().unwrap()).unwrap() - } - - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let validity = ctx.read_validity()?; - let (logical_len, buf) = ctx.read_buffer(|len| (len + 7) / 8)?; - Ok(BoolArray::new(BooleanBuffer::new(buf, 0, logical_len), validity).into_array()) - } -} - -#[cfg(test)] -mod test { - use crate::array::bool::BoolArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::serde::test::roundtrip_array; - use crate::validity::ArrayValidity; - - #[test] - fn roundtrip() { - let arr = BoolArray::from_iter(vec![Some(false), None, Some(true), Some(false)]); - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!(arr.buffer().values(), read_arr.as_bool().buffer().values()); - assert_eq!(arr.logical_validity(), read_arr.logical_validity()); - } -} diff --git a/vortex-array/src/array/bool/stats.rs b/vortex-array/src/array/bool/stats.rs index 3c43424e72..35fe73275e 100644 --- a/vortex-array/src/array/bool/stats.rs +++ b/vortex-array/src/array/bool/stats.rs @@ -3,22 +3,23 @@ use std::collections::HashMap; use vortex_error::VortexResult; use crate::array::bool::BoolArray; -use crate::array::Array; -use crate::stats::{Stat, StatsCompute, StatsSet}; +use crate::scalar::Scalar; +use crate::stats::{ArrayStatisticsCompute, Stat}; +use crate::ArrayTrait; -impl StatsCompute for BoolArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - if self.len() == 0 { - return Ok(StatsSet::from(HashMap::from([ +impl ArrayStatisticsCompute for BoolArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { + if self.is_empty() { + return Ok(HashMap::from([ (Stat::TrueCount, 0.into()), (Stat::RunCount, 0.into()), - ]))); + ])); } - let mut prev_bit = self.buffer().value(0); + let mut prev_bit = self.boolean_buffer().value(0); let mut true_count: usize = if prev_bit { 1 } else { 0 }; let mut run_count: usize = 0; - for bit in self.buffer().iter().skip(1) { + for bit in self.boolean_buffer().iter().skip(1) { if bit { true_count += 1 } @@ -29,7 +30,7 @@ impl StatsCompute for BoolArray { } run_count += 1; - Ok(StatsSet::from(HashMap::from([ + Ok(HashMap::from([ (Stat::Min, (true_count == self.len()).into()), (Stat::Max, (true_count > 0).into()), ( @@ -38,6 +39,6 @@ impl StatsCompute for BoolArray { ), (Stat::RunCount, run_count.into()), (Stat::TrueCount, true_count.into()), - ]))) + ])) } } diff --git a/vortex-array/src/array/chunked/compute/mod.rs b/vortex-array/src/array/chunked/compute/mod.rs index ecac4aad93..13ecc1889e 100644 --- a/vortex-array/src/array/chunked/compute/mod.rs +++ b/vortex-array/src/array/chunked/compute/mod.rs @@ -1,92 +1,45 @@ -use itertools::Itertools; use vortex_error::VortexResult; use crate::array::chunked::ChunkedArray; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::{Array, ArrayRef}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::flatten::{FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::slice::{slice, SliceFn}; use crate::compute::take::TakeFn; use crate::compute::ArrayCompute; use crate::scalar::Scalar; +use crate::{Array, OwnedArray, ToStatic}; mod take; -impl ArrayCompute for ChunkedArray { +impl ArrayCompute for ChunkedArray<'_> { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - fn take(&self) -> Option<&dyn TakeFn> { Some(self) } } -impl AsContiguousFn for ChunkedArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { +impl AsContiguousFn for ChunkedArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { // Combine all the chunks into one, then call as_contiguous again. - let chunks = arrays - .iter() - .flat_map(|a| a.as_chunked().chunks().iter()) - .cloned() - .collect_vec(); + let mut chunks = Vec::with_capacity(self.nchunks()); + for array in arrays { + for chunk in ChunkedArray::try_from(array).unwrap().chunks() { + chunks.push(chunk.to_static()); + } + } as_contiguous(&chunks) } } -impl FlattenFn for ChunkedArray { - fn flatten(&self) -> VortexResult { - Ok(FlattenedArray::Chunked(self.clone())) - } -} - -impl ScalarAtFn for ChunkedArray { +impl ScalarAtFn for ChunkedArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { let (chunk_index, chunk_offset) = self.find_chunk_idx(index); - scalar_at(self.chunks[chunk_index].as_ref(), chunk_offset) - } -} - -impl SliceFn for ChunkedArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - let (offset_chunk, offset_in_first_chunk) = self.find_chunk_idx(start); - let (length_chunk, length_in_last_chunk) = self.find_chunk_idx(stop); - - if length_chunk == offset_chunk { - if let Some(chunk) = self.chunks.get(offset_chunk) { - return Ok(ChunkedArray::new( - vec![slice(chunk, offset_in_first_chunk, length_in_last_chunk)?], - self.dtype.clone(), - ) - .into_array()); - } - } - - let mut chunks = self.chunks.clone()[offset_chunk..length_chunk + 1].to_vec(); - if let Some(c) = chunks.first_mut() { - *c = slice(c, offset_in_first_chunk, c.len())?; - } - - if length_in_last_chunk == 0 { - chunks.pop(); - } else if let Some(c) = chunks.last_mut() { - *c = slice(c, 0, length_in_last_chunk)?; - } - - Ok(ChunkedArray::new(chunks, self.dtype.clone()).into_array()) + scalar_at(&self.chunk(chunk_index).unwrap(), chunk_offset) } } diff --git a/vortex-array/src/array/chunked/compute/take.rs b/vortex-array/src/array/chunked/compute/take.rs index 3a069ebbff..f3a5b67766 100644 --- a/vortex-array/src/array/chunked/compute/take.rs +++ b/vortex-array/src/array/chunked/compute/take.rs @@ -1,19 +1,19 @@ use vortex_error::VortexResult; use crate::array::chunked::ChunkedArray; -use crate::array::{Array, ArrayRef, IntoArray}; use crate::compute::cast::cast; -use crate::compute::flatten::flatten_primitive; use crate::compute::take::{take, TakeFn}; use crate::ptype::PType; +use crate::{Array, IntoArray, OwnedArray, ToArray, ToStatic}; +use crate::{ArrayDType, ArrayTrait}; -impl TakeFn for ChunkedArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for ChunkedArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { if self.len() == indices.len() { - return Ok(self.to_array()); + return Ok(self.to_array().to_static()); } - let indices = flatten_primitive(cast(indices, PType::U64.into())?.as_ref())?; + let indices = cast(indices, PType::U64.into())?.flatten_primitive()?; // While the chunk idx remains the same, accumulate a list of chunk indices. let mut chunks = Vec::new(); @@ -28,7 +28,7 @@ impl TakeFn for ChunkedArray { // Start a new chunk let indices_in_chunk_array = indices_in_chunk.clone().into_array(); chunks.push(take( - &self.chunks()[prev_chunk_idx], + &self.chunk(prev_chunk_idx).unwrap(), &indices_in_chunk_array, )?); indices_in_chunk = Vec::new(); @@ -41,30 +41,41 @@ impl TakeFn for ChunkedArray { if !indices_in_chunk.is_empty() { let indices_in_chunk_array = indices_in_chunk.into_array(); chunks.push(take( - &self.chunks()[prev_chunk_idx], + &self.chunk(prev_chunk_idx).unwrap(), &indices_in_chunk_array, )?); } - Ok(ChunkedArray::new(chunks, self.dtype().clone()).into_array()) + Ok(ChunkedArray::try_new(chunks, self.dtype().clone())?.into_array()) } } #[cfg(test)] mod test { + use itertools::Itertools; + use crate::array::chunked::ChunkedArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::IntoArray; use crate::compute::as_contiguous::as_contiguous; use crate::compute::take::take; + use crate::{ArrayDType, ArrayTrait, AsArray, IntoArray}; #[test] fn test_take() { let a = vec![1i32, 2, 3].into_array(); - let arr = ChunkedArray::new(vec![a.clone(), a.clone(), a.clone()], a.dtype().clone()); + let arr = ChunkedArray::try_new(vec![a.clone(), a.clone(), a.clone()], a.dtype().clone()) + .unwrap(); + assert_eq!(arr.nchunks(), 3); + assert_eq!(arr.len(), 9); let indices = vec![0, 0, 6, 4].into_array(); - let result = as_contiguous(take(&arr, &indices).unwrap().as_chunked().chunks()).unwrap(); - assert_eq!(result.as_primitive().typed_data::(), &[1, 1, 1, 2]); + let result = as_contiguous( + &ChunkedArray::try_from(take(arr.as_array_ref(), &indices).unwrap()) + .unwrap() + .chunks() + .collect_vec(), + ) + .unwrap() + .into_primitive(); + assert_eq!(result.typed_data::(), &[1, 1, 1, 2]); } } diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index efc2edc439..26d269a609 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -1,199 +1,159 @@ -use std::sync::{Arc, RwLock}; - use itertools::Itertools; -use linkme::distributed_slice; +use serde::{Deserialize, Serialize}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; - -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsSet}; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::{impl_array, ArrayWalker}; +use vortex_schema::{IntWidth, Nullability, Signedness}; + +use crate::array::primitive::PrimitiveArray; +use crate::compute::scalar_at::scalar_at; +use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; +use crate::validity::Validity::NonNullable; +use crate::validity::{ArrayValidity, LogicalValidity}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{impl_encoding, ArrayDType, ArrayFlatten, IntoArrayData, OwnedArray, ToArrayData}; mod compute; -mod serde; mod stats; -#[derive(Debug, Clone)] -pub struct ChunkedArray { - chunks: Vec, - chunk_ends: Vec, - dtype: DType, - stats: Arc>, -} +impl_encoding!("vortex.chunked", Chunked); -impl ChunkedArray { - pub fn new(chunks: Vec, dtype: DType) -> Self { - Self::try_new(chunks, dtype).unwrap() - } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChunkedMetadata; + +impl ChunkedArray<'_> { + const ENDS_DTYPE: DType = DType::Int( + IntWidth::_64, + Signedness::Unsigned, + Nullability::NonNullable, + ); - pub fn try_new(chunks: Vec, dtype: DType) -> VortexResult { + pub fn try_new(chunks: Vec, dtype: DType) -> VortexResult { for chunk in &chunks { if chunk.dtype() != &dtype { vortex_bail!(MismatchedTypes: dtype, chunk.dtype()); } } - let chunk_ends = [0u64] - .into_iter() - .chain(chunks.iter().map(|c| c.len() as u64)) - .scan(0, |acc, c| { - *acc += c; - Some(*acc) - }) - .collect_vec(); - Ok(Self { - chunks, - chunk_ends, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - #[inline] - pub fn chunks(&self) -> &[ArrayRef] { - &self.chunks - } - - #[inline] - pub fn chunk_ends(&self) -> &[u64] { - &self.chunk_ends - } - - pub fn find_chunk_idx(&self, index: usize) -> (usize, usize) { - assert!(index <= self.len(), "Index out of bounds of the array"); - let index_chunk = self - .chunk_ends - .binary_search(&(index as u64)) - // Since chunk ends start with 0 whenever value falls in between two ends it's in the chunk that starts the END - .unwrap_or_else(|o| o - 1); - let index_in_chunk = index - self.chunk_ends[index_chunk] as usize; - (index_chunk, index_in_chunk) - } -} - -impl Array for ChunkedArray { - impl_array!(); + let chunk_ends = PrimitiveArray::from_vec( + [0u64] + .into_iter() + .chain(chunks.iter().map(|c| c.len() as u64)) + .scan(0, |acc, c| { + *acc += c; + Some(*acc) + }) + .collect_vec(), + NonNullable, + ); - fn len(&self) -> usize { - self.chunk_ends.last().map(|&i| i as usize).unwrap_or(0) - } + let mut children = vec![chunk_ends.into_array_data()]; + children.extend(chunks.iter().map(|a| a.to_array_data())); - #[inline] - fn is_empty(&self) -> bool { - self.chunks.is_empty() || self.len() == 0 + Self::try_from_parts(dtype, ChunkedMetadata, children.into(), HashMap::default()) } #[inline] - fn dtype(&self) -> &DType { - &self.dtype + pub fn chunk(&self, idx: usize) -> Option { + // Offset the index since chunk_ends is child 0. + self.array().child(idx + 1, self.array().dtype()) } - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) + pub fn nchunks(&self) -> usize { + self.chunk_ends().len() - 1 } #[inline] - fn encoding(&self) -> EncodingRef { - &ChunkedEncoding + pub fn chunk_ends(&self) -> Array { + self.array() + .child(0, &Self::ENDS_DTYPE) + .expect("missing chunk ends") } - fn nbytes(&self) -> usize { - self.chunks().iter().map(|arr| arr.nbytes()).sum() - } + pub fn find_chunk_idx(&self, index: usize) -> (usize, usize) { + assert!(index <= self.len(), "Index out of bounds of the array"); - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } + // TODO(ngates): migrate to the new search_sorted API to subtract 1 if not exact match. + let mut index_chunk = search_sorted(&self.chunk_ends(), index, SearchSortedSide::Left) + .unwrap() + .to_index(); + let mut chunk_start = + usize::try_from(scalar_at(&self.chunk_ends(), index_chunk).unwrap()).unwrap(); + + if chunk_start != index { + index_chunk -= 1; + chunk_start = + usize::try_from(scalar_at(&self.chunk_ends(), index_chunk).unwrap()).unwrap(); + } - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) + let index_in_chunk = index - chunk_start; + (index_chunk, index_in_chunk) } +} - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - for chunk in self.chunks() { - walker.visit_child(chunk)?; - } - Ok(()) +impl<'a> ChunkedArray<'a> { + pub fn chunks(&'a self) -> impl Iterator> { + (0..self.nchunks()).map(|c| self.chunk(c).unwrap()) } } -impl FromIterator for ChunkedArray { - fn from_iter>(iter: T) -> Self { - let chunks: Vec = iter.into_iter().collect(); +impl FromIterator for OwnedChunkedArray { + fn from_iter>(iter: T) -> Self { + let chunks: Vec = iter.into_iter().collect(); let dtype = chunks .first() .map(|c| c.dtype().clone()) .expect("Cannot create a chunked array from an empty iterator"); - Self::new(chunks, dtype) + Self::try_new(chunks, dtype).unwrap() } } -impl ArrayValidity for ChunkedArray { - fn logical_validity(&self) -> Validity { - if !self.dtype.is_nullable() { - return Validity::Valid(self.len()); - } - Validity::from_iter(self.chunks.iter().map(|chunk| chunk.logical_validity())) - } - - fn is_valid(&self, _index: usize) -> bool { - todo!() +impl ArrayFlatten for ChunkedArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + Ok(Flattened::Chunked(self)) } } -impl ArrayDisplay for ChunkedArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - for (i, c) in self.chunks().iter().enumerate() { - f.new_total_size(c.nbytes(), |f| f.child(&format!("[{}]", i), c.as_ref()))?; +impl AcceptArrayVisitor for ChunkedArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("chunk_ends", &self.chunk_ends())?; + for (idx, chunk) in self.chunks().enumerate() { + visitor.visit_child(format!("[{}]", idx).as_str(), &chunk)?; } Ok(()) } } -#[derive(Debug)] -pub struct ChunkedEncoding; - -impl ChunkedEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.chunked"); +impl ArrayTrait for ChunkedArray<'_> { + fn len(&self) -> usize { + usize::try_from(scalar_at(&self.chunk_ends(), self.nchunks()).unwrap()).unwrap() + } } -#[distributed_slice(ENCODINGS)] -static ENCODINGS_CHUNKED: EncodingRef = &ChunkedEncoding; - -impl Encoding for ChunkedEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl ArrayValidity for ChunkedArray<'_> { + fn is_valid(&self, _index: usize) -> bool { + todo!() } - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) + fn logical_validity(&self) -> LogicalValidity { + todo!() } } +impl EncodingCompression for ChunkedEncoding {} + #[cfg(test)] mod test { use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - use crate::array::chunked::ChunkedArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::IntoArray; - use crate::array::{Array, ArrayRef}; - use crate::compute::flatten::flatten_primitive; - use crate::compute::slice::slice; + use crate::array::chunked::{ChunkedArray, OwnedChunkedArray}; use crate::ptype::NativePType; + use crate::{Array, IntoArray}; - fn chunked_array() -> ChunkedArray { - ChunkedArray::new( + #[allow(dead_code)] + fn chunked_array() -> OwnedChunkedArray { + ChunkedArray::try_new( vec![ vec![1u64, 2, 3].into_array(), vec![4u64, 5, 6].into_array(), @@ -205,43 +165,43 @@ mod test { Nullability::NonNullable, ), ) + .unwrap() } - fn assert_equal_slices(arr: ArrayRef, slice: &[T]) { + #[allow(dead_code)] + fn assert_equal_slices(arr: Array, slice: &[T]) { let mut values = Vec::with_capacity(arr.len()); - arr.as_chunked() + ChunkedArray::try_from(arr) + .unwrap() .chunks() - .iter() - .map(|a| flatten_primitive(a.as_ref()).unwrap()) + .map(|a| a.flatten_primitive().unwrap()) .for_each(|a| values.extend_from_slice(a.typed_data::())); assert_eq!(values, slice); } - #[test] - pub fn slice_middle() { - assert_equal_slices(slice(&chunked_array(), 2, 5).unwrap(), &[3u64, 4, 5]) - } - - #[test] - pub fn slice_begin() { - assert_equal_slices(slice(&chunked_array(), 1, 3).unwrap(), &[2u64, 3]); - } - - #[test] - pub fn slice_aligned() { - assert_equal_slices(slice(&chunked_array(), 3, 6).unwrap(), &[4u64, 5, 6]); - } - - #[test] - pub fn slice_many_aligned() { - assert_equal_slices( - slice(&chunked_array(), 0, 6).unwrap(), - &[1u64, 2, 3, 4, 5, 6], - ); - } - - #[test] - pub fn slice_end() { - assert_equal_slices(slice(&chunked_array(), 7, 8).unwrap(), &[8u64]); - } + // FIXME(ngates): bring back when slicing is a compute function. + // #[test] + // pub fn slice_middle() { + // assert_equal_slices(chunked_array().slice(2, 5).unwrap(), &[3u64, 4, 5]) + // } + // + // #[test] + // pub fn slice_begin() { + // assert_equal_slices(chunked_array().slice(1, 3).unwrap(), &[2u64, 3]); + // } + // + // #[test] + // pub fn slice_aligned() { + // assert_equal_slices(chunked_array().slice(3, 6).unwrap(), &[4u64, 5, 6]); + // } + // + // #[test] + // pub fn slice_many_aligned() { + // assert_equal_slices(chunked_array().slice(0, 6).unwrap(), &[1u64, 2, 3, 4, 5, 6]); + // } + // + // #[test] + // pub fn slice_end() { + // assert_equal_slices(chunked_array().slice(7, 8).unwrap(), &[8u64]); + // } } diff --git a/vortex-array/src/array/chunked/serde.rs b/vortex-array/src/array/chunked/serde.rs deleted file mode 100644 index 6461f05a5a..0000000000 --- a/vortex-array/src/array/chunked/serde.rs +++ /dev/null @@ -1,80 +0,0 @@ -use flexbuffers::Builder; -use vortex_error::VortexResult; - -use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for ChunkedArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.chunks().len())?; - for c in self.chunks() { - ctx.write(c.as_ref())?; - } - Ok(()) - } - - fn metadata(&self) -> VortexResult>> { - // TODO(ngates) #163 - the chunk lengths should probably themselves be an array? - let mut builder = Builder::default(); - let mut vec = builder.start_vector(); - for end in self.chunk_ends() { - vec.push(*end); - } - vec.end_vector(); - Ok(Some(builder.take_buffer())) - } -} - -impl EncodingSerde for ChunkedEncoding { - fn len(&self, view: &ArrayView) -> usize { - (0..view.nchildren()) - .map(|c| view.child(c, view.dtype()).unwrap()) - .map(|v| v.len()) - .sum() - } - - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let chunk_len = ctx.read_usize()?; - let mut chunks = Vec::::with_capacity(chunk_len); - // TODO(robert): Use read_vectored - for _ in 0..chunk_len { - chunks.push(ctx.read()?); - } - Ok(ChunkedArray::new(chunks, ctx.schema().clone()).into_array()) - } -} - -#[cfg(test)] -mod test { - use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - - use crate::array::chunked::ChunkedArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::primitive::PrimitiveArray; - use crate::array::Array; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = ChunkedArray::new( - vec![ - PrimitiveArray::from_iter(vec![Some(0), None, Some(2), Some(42)]).into_array(), - PrimitiveArray::from_iter(vec![Some(5), None, Some(7), Some(42)]).into_array(), - ], - DType::Int(IntWidth::_32, Signedness::Signed, Nullability::Nullable), - ); - - let read_arr = roundtrip_array(&arr).unwrap(); - - for (i, chunk) in arr.chunks().iter().enumerate() { - assert_eq!( - chunk.as_primitive().buffer().typed_data::(), - read_arr.as_chunked().chunks()[i] - .as_primitive() - .buffer() - .typed_data::() - ); - } - } -} diff --git a/vortex-array/src/array/chunked/stats.rs b/vortex-array/src/array/chunked/stats.rs index 96bb1a6192..c1acf0701e 100644 --- a/vortex-array/src/array/chunked/stats.rs +++ b/vortex-array/src/array/chunked/stats.rs @@ -1,22 +1,13 @@ +use std::collections::HashMap; + use vortex_error::VortexResult; use crate::array::chunked::ChunkedArray; -use crate::stats::{Stat, StatsCompute, StatsSet}; +use crate::scalar::Scalar; +use crate::stats::{ArrayStatisticsCompute, Stat}; -impl StatsCompute for ChunkedArray { - fn compute(&self, stat: &Stat) -> VortexResult { - Ok(self - .chunks() - .iter() - .map(|c| { - let s = c.stats(); - // HACK(robert): This will compute all stats but we could just compute one - s.get_or_compute(stat); - s.get_all() - }) - .fold(StatsSet::new(), |mut acc, x| { - acc.merge(&x); - acc - })) +impl ArrayStatisticsCompute for ChunkedArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { + todo!() } } diff --git a/vortex-array/src/array/composite/array.rs b/vortex-array/src/array/composite/array.rs index 4bc2860058..7013ef41be 100644 --- a/vortex-array/src/array/composite/array.rs +++ b/vortex-array/src/array/composite/array.rs @@ -1,169 +1,183 @@ -use std::fmt::{Debug, Display}; -use std::sync::{Arc, RwLock}; - -use linkme::distributed_slice; -use vortex_error::VortexResult; -use vortex_schema::{CompositeID, DType}; +use flatbuffers::root; +use vortex_error::{vortex_err, VortexResult}; +use vortex_flatbuffers::{FlatBufferToBytes, ReadFlatBuffer}; +use vortex_schema::flatbuffers as fb; +use vortex_schema::{CompositeID, DTypeSerdeContext}; use crate::array::composite::{find_extension, CompositeExtensionRef, TypedCompositeArray}; -use crate::array::{Array, ArrayRef}; -use crate::compress::EncodingCompression; use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, BytesSerde, EncodingSerde}; -use crate::stats::{Stats, StatsCompute, StatsSet}; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::{impl_array, ArrayWalker}; - -pub trait CompositeMetadata: - 'static + Debug + Display + Send + Sync + Sized + Clone + BytesSerde +use crate::scalar::AsBytes; +use crate::stats::ArrayStatisticsCompute; +use crate::validity::{ArrayValidity, LogicalValidity}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{ + impl_encoding, ArrayDType, ArrayFlatten, IntoArrayData, TryDeserializeArrayMetadata, + TrySerializeArrayMetadata, +}; + +pub trait UnderlyingMetadata: + 'static + Send + Sync + Debug + TrySerializeArrayMetadata + for<'m> TryDeserializeArrayMetadata<'m> { fn id(&self) -> CompositeID; } +impl_encoding!("vortex.composite", Composite); + #[derive(Debug, Clone)] -pub struct CompositeArray { - extension: CompositeExtensionRef, - metadata: Arc>, - underlying: ArrayRef, - dtype: DType, - stats: Arc>, +pub struct CompositeMetadata { + ext: CompositeExtensionRef, + underlying_dtype: DType, + underlying_metadata: Arc<[u8]>, } -impl CompositeArray { - pub fn new(id: CompositeID, metadata: Arc>, underlying: ArrayRef) -> Self { - let dtype = DType::Composite(id, underlying.dtype().is_nullable().into()); - let extension = find_extension(id.0).expect("Unrecognized composite extension"); - Self { - extension, - metadata, - underlying, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), +impl TrySerializeArrayMetadata for CompositeMetadata { + fn try_serialize_metadata(&self) -> VortexResult> { + let mut fb = flexbuffers::Builder::default(); + { + let mut elems = fb.start_vector(); + elems.push(self.ext.id().0); + self.underlying_dtype + .with_flatbuffer_bytes(|b| elems.push(flexbuffers::Blob(b))); + elems.push(flexbuffers::Blob(self.underlying_metadata.as_ref())); } + Ok(fb.take_buffer().into()) } +} - #[inline] - pub fn id(&self) -> CompositeID { - self.extension.id() - } - - #[inline] - pub fn extension(&self) -> CompositeExtensionRef { - self.extension - } - - pub fn metadata(&self) -> Arc> { - self.metadata.clone() - } - - #[inline] - pub fn underlying(&self) -> &ArrayRef { - &self.underlying +impl TryDeserializeArrayMetadata<'_> for CompositeMetadata { + fn try_deserialize_metadata(metadata: Option<&[u8]>) -> VortexResult { + let reader = flexbuffers::Reader::get_root(metadata.expect("missing metadata"))?; + let elems = reader.as_vector(); + + let ext_id = elems.index(0).expect("missing composite id").as_str(); + let ext = find_extension(ext_id) + .ok_or_else(|| vortex_err!("Unrecognized composite extension: {}", ext_id))?; + + let dtype_blob = elems.index(1).expect("missing dtype").as_blob(); + let ctx = DTypeSerdeContext::new(vec![]); // FIXME: composite_ids + let underlying_dtype = DType::read_flatbuffer( + &ctx, + &root::(dtype_blob.0).expect("invalid dtype"), + )?; + + let underlying_metadata: Arc<[u8]> = elems + .index(2) + .expect("missing underlying metadata") + .as_blob() + .0 + .to_vec() + .into(); + + Ok(CompositeMetadata { + ext, + underlying_dtype, + underlying_metadata, + }) } +} - pub fn as_typed(&self) -> TypedCompositeArray { - TypedCompositeArray::new( - M::deserialize(self.metadata().as_slice()).unwrap(), - self.underlying().clone(), +impl<'a> CompositeArray<'a> { + pub fn new(id: CompositeID, metadata: Arc<[u8]>, underlying: Array<'a>) -> Self { + let dtype = DType::Composite(id, underlying.dtype().is_nullable().into()); + let ext = find_extension(id.0).expect("Unrecognized composite extension"); + Self::try_from_parts( + dtype, + CompositeMetadata { + ext, + underlying_dtype: underlying.dtype().clone(), + underlying_metadata: metadata, + }, + vec![underlying.into_array_data()].into(), + HashMap::default(), ) - } - - pub fn as_typed_compute(&self) -> Box { - self.extension.as_typed_compute(self) + .unwrap() } } -impl Array for CompositeArray { - impl_array!(); - +impl CompositeArray<'_> { #[inline] - fn len(&self) -> usize { - self.underlying.len() + pub fn id(&self) -> CompositeID { + self.metadata().ext.id() } #[inline] - fn is_empty(&self) -> bool { - self.underlying.is_empty() + pub fn extension(&self) -> CompositeExtensionRef { + find_extension(self.id().0).expect("Unrecognized composite extension") } - #[inline] - fn dtype(&self) -> &DType { - &self.dtype + pub fn underlying_metadata(&self) -> &Arc<[u8]> { + &self.metadata().underlying_metadata } - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) + pub fn underlying_dtype(&self) -> &DType { + &self.metadata().underlying_dtype } #[inline] - fn encoding(&self) -> EncodingRef { - &CompositeEncoding + pub fn underlying(&self) -> Array { + self.array() + .child(0, self.underlying_dtype()) + .expect("CompositeArray must have an underlying array") } - fn nbytes(&self) -> usize { - self.underlying.nbytes() - } + pub fn with_compute(&self, mut f: F) -> R + where + F: FnMut(&dyn ArrayCompute) -> R, + { + let mut result = None; - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } + self.extension() + .with_compute(self, &mut |c| { + result = Some(f(c)); + Ok(()) + }) + .unwrap(); - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) + // Now we unwrap the optional, which we know to be populated by the closure. + result.unwrap() } +} - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.underlying()) +impl<'a> CompositeArray<'a> { + pub fn as_typed(&'a self) -> VortexResult> { + Ok(TypedCompositeArray::new( + M::try_deserialize_metadata(Some(self.underlying_metadata().as_bytes()))?, + self.underlying().clone(), + )) } } -impl StatsCompute for CompositeArray {} - -impl ArrayDisplay for CompositeArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("metadata", format!("{:#?}", self.metadata().as_slice()))?; - f.child("underlying", self.underlying.as_ref()) +impl ArrayTrait for CompositeArray<'_> { + fn len(&self) -> usize { + self.underlying().len() } } -impl ArrayValidity for CompositeArray { - fn logical_validity(&self) -> Validity { - self.underlying().logical_validity() +impl ArrayFlatten for CompositeArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + Ok(Flattened::Composite(self)) } +} +impl ArrayValidity for CompositeArray<'_> { fn is_valid(&self, index: usize) -> bool { - self.underlying().is_valid(index) + self.underlying().with_dyn(|a| a.is_valid(index)) } -} - -#[derive(Debug)] -pub struct CompositeEncoding; -impl CompositeEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.composite"); + fn logical_validity(&self) -> LogicalValidity { + self.underlying().with_dyn(|a| a.logical_validity()) + } } -#[distributed_slice(ENCODINGS)] -static ENCODINGS_COMPOSITE: EncodingRef = &CompositeEncoding; - -impl Encoding for CompositeEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl AcceptArrayVisitor for CompositeArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("underlying", &self.underlying()) } +} - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } +impl ArrayStatisticsCompute for CompositeArray<'_> {} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} +impl EncodingCompression for CompositeEncoding {} diff --git a/vortex-array/src/array/composite/compute.rs b/vortex-array/src/array/composite/compute.rs index 5249117a49..2b371c4740 100644 --- a/vortex-array/src/array/composite/compute.rs +++ b/vortex-array/src/array/composite/compute.rs @@ -3,18 +3,16 @@ use itertools::Itertools; use vortex_error::{vortex_err, VortexResult}; use crate::array::composite::array::CompositeArray; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::{Array, ArrayRef}; use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::flatten::{FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::slice::{slice, SliceFn}; use crate::compute::take::{take, TakeFn}; use crate::compute::ArrayCompute; use crate::scalar::Scalar; +use crate::{Array, ArrayDType, IntoArray, OwnedArray}; -impl ArrayCompute for CompositeArray { +impl ArrayCompute for CompositeArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } @@ -23,10 +21,6 @@ impl ArrayCompute for CompositeArray { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -40,69 +34,61 @@ impl ArrayCompute for CompositeArray { } } -impl AsArrowArray for CompositeArray { +impl AsArrowArray for CompositeArray<'_> { fn as_arrow(&self) -> VortexResult { - self.extension() - .as_typed_compute(self) - .as_arrow() - .map(|a| a.as_arrow()) - .unwrap_or_else(|| { + self.with_compute(|c| { + c.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { Err(vortex_err!( NotImplemented: "as_arrow", format!("composite extension {}", self.id()) )) }) + }) } } -impl AsContiguousFn for CompositeArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { +impl AsContiguousFn for CompositeArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { let composites = arrays .iter() - .map(|array| array.as_composite().underlying()) - .cloned() + .map(|array| CompositeArray::try_from(array).unwrap()) .collect_vec(); + let underlyings = composites.iter().map(|c| c.underlying()).collect_vec(); Ok(CompositeArray::new( self.id(), - self.metadata().clone(), - as_contiguous(&composites)?, + self.underlying_metadata().clone(), + as_contiguous(&underlyings)?, ) .into_array()) } } -impl FlattenFn for CompositeArray { - fn flatten(&self) -> VortexResult { - Ok(FlattenedArray::Composite(self.clone())) - } -} - -impl ScalarAtFn for CompositeArray { +impl ScalarAtFn for CompositeArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { // TODO(ngates): this seems wrong... I don't think we just cast scalars like this. // e.g. how do we know what a datetime is in? - let underlying = scalar_at(self.underlying(), index)?; + let underlying = scalar_at(&self.underlying(), index)?; underlying.cast(self.dtype()) } } -impl TakeFn for CompositeArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for CompositeArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { Ok(CompositeArray::new( self.id(), - self.metadata().clone(), - take(self.underlying(), indices)?, + self.underlying_metadata().clone(), + take(&self.underlying(), indices)?, ) .into_array()) } } -impl SliceFn for CompositeArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for CompositeArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(CompositeArray::new( self.id(), - self.metadata().clone(), - slice(self.underlying(), start, stop)?, + self.underlying_metadata().clone(), + slice(&self.underlying(), start, stop)?, ) .into_array()) } diff --git a/vortex-array/src/array/composite/mod.rs b/vortex-array/src/array/composite/mod.rs index 38f2b8dbc3..5146392881 100644 --- a/vortex-array/src/array/composite/mod.rs +++ b/vortex-array/src/array/composite/mod.rs @@ -4,16 +4,14 @@ pub use typed::*; use vortex_schema::CompositeID; mod array; -mod compress; mod compute; -mod serde; mod typed; #[distributed_slice] -pub static COMPOSITE_EXTENSIONS: [&'static dyn CompositeExtension] = [..]; +pub static VORTEX_COMPOSITE_EXTENSIONS: [&'static dyn CompositeExtension] = [..]; pub fn find_extension(id: &str) -> Option<&'static dyn CompositeExtension> { - COMPOSITE_EXTENSIONS + VORTEX_COMPOSITE_EXTENSIONS .iter() .find(|ext| ext.id().0 == id) .copied() diff --git a/vortex-array/src/array/composite/serde.rs b/vortex-array/src/array/composite/serde.rs deleted file mode 100644 index 5e8c88fe35..0000000000 --- a/vortex-array/src/array/composite/serde.rs +++ /dev/null @@ -1,39 +0,0 @@ -use std::sync::Arc; - -use flatbuffers::FlatBufferBuilder; -use vortex_error::VortexResult; -use vortex_flatbuffers::WriteFlatBuffer; -use vortex_schema::DType; - -use crate::array::composite::{CompositeArray, CompositeEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for CompositeArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_slice(self.metadata().as_slice())?; - let underlying = self.underlying(); - ctx.dtype(underlying.dtype())?; - ctx.write(self.underlying()) - } - - fn metadata(&self) -> VortexResult>> { - let mut fbb = FlatBufferBuilder::new(); - let dtype = self.underlying().dtype().write_flatbuffer(&mut fbb); - fbb.finish_minimal(dtype); - Ok(Some(fbb.finished_data().to_vec())) - } -} - -impl EncodingSerde for CompositeEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let DType::Composite(id, _) = *ctx.schema() else { - panic!("Expected composite schema, found {}", ctx.schema()) - }; - let metadata = ctx.read_slice()?; - let underling_dtype = ctx.dtype()?; - let underlying = ctx.with_schema(&underling_dtype).read()?; - - Ok(CompositeArray::new(id, Arc::new(metadata), underlying).into_array()) - } -} diff --git a/vortex-array/src/array/composite/typed.rs b/vortex-array/src/array/composite/typed.rs index 745f5c041b..985240f616 100644 --- a/vortex-array/src/array/composite/typed.rs +++ b/vortex-array/src/array/composite/typed.rs @@ -1,31 +1,35 @@ use std::fmt::Debug; -use std::sync::Arc; +use vortex_error::{VortexError, VortexResult}; use vortex_schema::CompositeID; use vortex_schema::DType; use crate::array::composite::array::CompositeArray; -use crate::array::composite::CompositeMetadata; -use crate::array::{Array, ArrayRef}; +use crate::array::composite::UnderlyingMetadata; use crate::compute::ArrayCompute; +use crate::{Array, ArrayDType}; pub trait CompositeExtension: Debug + Send + Sync + 'static { fn id(&self) -> CompositeID; - fn as_typed_compute(&self, array: &CompositeArray) -> Box; + fn with_compute<'a>( + &self, + array: &'a CompositeArray<'a>, + f: &mut dyn for<'b> FnMut(&'b (dyn ArrayCompute + 'a)) -> VortexResult<()>, + ) -> VortexResult<()>; } pub type CompositeExtensionRef = &'static dyn CompositeExtension; #[derive(Debug, Clone)] -pub struct TypedCompositeArray { +pub struct TypedCompositeArray<'a, M: UnderlyingMetadata> { metadata: M, - underlying: ArrayRef, + underlying: Array<'a>, dtype: DType, } -impl TypedCompositeArray { - pub fn new(metadata: M, underlying: ArrayRef) -> Self { +impl<'a, M: UnderlyingMetadata> TypedCompositeArray<'a, M> { + pub fn new(metadata: M, underlying: Array<'a>) -> Self { let dtype = DType::Composite(metadata.id(), underlying.dtype().is_nullable().into()); Self { metadata, @@ -35,12 +39,12 @@ impl TypedCompositeArray { } #[inline] - pub fn metadata(&self) -> &M { + pub fn underlying_metadata(&self) -> &M { &self.metadata } #[inline] - pub fn underlying(&self) -> &ArrayRef { + pub fn underlying(&self) -> &Array<'a> { &self.underlying } @@ -49,25 +53,35 @@ impl TypedCompositeArray { &self.dtype } - pub fn as_composite(&self) -> CompositeArray { - CompositeArray::new( - self.metadata().id(), - Arc::new(self.metadata().serialize()), + pub fn as_composite(&self) -> VortexResult> { + Ok(CompositeArray::new( + self.underlying_metadata().id(), + self.underlying_metadata().try_serialize_metadata()?, self.underlying().clone(), - ) + )) } } -macro_rules! composite_impl { +impl<'a, M: UnderlyingMetadata> TryFrom<&'a CompositeArray<'a>> for TypedCompositeArray<'a, M> { + type Error = VortexError; + + fn try_from(value: &'a CompositeArray<'a>) -> Result { + value.as_typed::() + } +} + +#[macro_export] +macro_rules! impl_composite { ($id:expr, $T:ty) => { use linkme::distributed_slice; use paste::paste; - use vortex_schema::{DType, Nullability}; - - use crate::array::composite::{ - CompositeArray, CompositeExtension, CompositeMetadata, COMPOSITE_EXTENSIONS, + use vortex_schema::{CompositeID, DType, Nullability}; + use $crate::array::composite::{ + CompositeArray, CompositeExtension, TypedCompositeArray, UnderlyingMetadata, + VORTEX_COMPOSITE_EXTENSIONS, }; - use crate::compute::ArrayCompute; + use $crate::compute::ArrayCompute; + use $crate::TryDeserializeArrayMetadata; paste! { #[derive(Debug)] @@ -86,24 +100,32 @@ macro_rules! composite_impl { Self::ID } - fn as_typed_compute(&self, array: &CompositeArray) -> Box { + fn with_compute<'a>( + &self, + array: &'a CompositeArray<'a>, + f: &mut dyn for<'b> FnMut(&'b (dyn ArrayCompute + 'a)) -> VortexResult<()>, + ) -> VortexResult<()> { if array.id() != Self::ID { panic!("Incorrect CompositeID"); } - Box::new(array.as_typed::<$T>()) + let typed = TypedCompositeArray::new( + $T::try_deserialize_metadata(Some(array.underlying_metadata().as_ref()))?, + array.underlying().clone(), + ); + f(&typed) } } - impl CompositeMetadata for $T { + impl UnderlyingMetadata for $T { fn id(&self) -> CompositeID { [<$T Extension>]::ID } } - #[distributed_slice(COMPOSITE_EXTENSIONS)] + #[distributed_slice(VORTEX_COMPOSITE_EXTENSIONS)] static ENCODINGS_COMPOSITE_EXT: &'static dyn CompositeExtension = &[<$T Extension>]; } }; } -pub(crate) use composite_impl; +pub use impl_composite; diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index 30d85b2027..09790b1bf1 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -1,47 +1,35 @@ use itertools::Itertools; use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::Nullability; -use crate::array::bool::BoolArray; use crate::array::constant::ConstantArray; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::PrimitiveArray; -use crate::array::{Array, ArrayRef}; use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::flatten::{FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; use crate::compute::ArrayCompute; -use crate::match_each_native_ptype; use crate::scalar::Scalar; -use crate::validity::Validity; +use crate::{Array, ArrayTrait, IntoArray, OwnedArray}; -impl ArrayCompute for ConstantArray { +impl ArrayCompute for ConstantArray<'_> { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - fn take(&self) -> Option<&dyn TakeFn> { Some(self) } } -impl AsContiguousFn for ConstantArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { - let chunks = arrays.iter().map(|a| a.as_constant().clone()).collect_vec(); +impl AsContiguousFn for ConstantArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { + let chunks = arrays + .iter() + .map(|a| ConstantArray::try_from(a).unwrap()) + .collect_vec(); + if chunks.iter().map(|c| c.scalar()).all_equal() { Ok(ConstantArray::new( chunks.first().unwrap().scalar().clone(), @@ -57,48 +45,14 @@ impl AsContiguousFn for ConstantArray { } } -impl FlattenFn for ConstantArray { - fn flatten(&self) -> VortexResult { - let validity = match self.nullability() { - Nullability::NonNullable => None, - Nullability::Nullable => Some(match self.scalar().is_null() { - true => Validity::Invalid(self.len()), - false => Validity::Valid(self.len()), - }), - }; - - Ok(match self.scalar() { - Scalar::Bool(b) => FlattenedArray::Bool(BoolArray::from_nullable( - vec![b.value().copied().unwrap_or_default(); self.len()], - validity, - )), - Scalar::Primitive(p) => { - match_each_native_ptype!(p.ptype(), |$P| { - FlattenedArray::Primitive(PrimitiveArray::from_nullable::<$P>( - vec![$P::try_from(self.scalar())?; self.len()], - validity, - )) - }) - } - _ => panic!("Unsupported scalar type {}", self.dtype()), - }) - } -} - -impl ScalarAtFn for ConstantArray { +impl ScalarAtFn for ConstantArray<'_> { fn scalar_at(&self, _index: usize) -> VortexResult { Ok(self.scalar().clone()) } } -impl TakeFn for ConstantArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for ConstantArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { Ok(ConstantArray::new(self.scalar().clone(), indices.len()).into_array()) } } - -impl SliceFn for ConstantArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(ConstantArray::new(self.scalar.clone(), stop - start).into_array()) - } -} diff --git a/vortex-array2/src/array/constant/flatten.rs b/vortex-array/src/array/constant/flatten.rs similarity index 93% rename from vortex-array2/src/array/constant/flatten.rs rename to vortex-array/src/array/constant/flatten.rs index b9f4205fba..cc4d8c61d2 100644 --- a/vortex-array2/src/array/constant/flatten.rs +++ b/vortex-array/src/array/constant/flatten.rs @@ -1,12 +1,12 @@ -use vortex::match_each_native_ptype; -use vortex::scalar::Scalar; use vortex_error::VortexResult; use vortex_schema::Nullability; use crate::array::bool::BoolArray; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; +use crate::scalar::Scalar; use crate::validity::Validity; +use crate::{match_each_native_ptype, ArrayDType, ArrayTrait}; use crate::{ArrayFlatten, Flattened}; impl ArrayFlatten for ConstantArray<'_> { diff --git a/vortex-array/src/array/constant/mod.rs b/vortex-array/src/array/constant/mod.rs index 2512d911ed..c5519bab4f 100644 --- a/vortex-array/src/array/constant/mod.rs +++ b/vortex-array/src/array/constant/mod.rs @@ -1,146 +1,75 @@ -use std::sync::{Arc, RwLock}; - -use linkme::distributed_slice; +use serde::{Deserialize, Serialize}; use vortex_error::VortexResult; -use vortex_schema::DType; -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::scalar::Scalar; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stat, Stats, StatsSet}; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::{impl_array, ArrayWalker}; +use crate::impl_encoding; +use crate::validity::{ArrayValidity, LogicalValidity}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; mod compute; -mod serde; +mod flatten; mod stats; -#[derive(Debug, Clone)] -pub struct ConstantArray { +impl_encoding!("vortex.constant", Constant); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConstantMetadata { scalar: Scalar, length: usize, - stats: Arc>, } -impl ConstantArray { +impl ConstantArray<'_> { pub fn new(scalar: S, length: usize) -> Self where Scalar: From, { let scalar: Scalar = scalar.into(); - let stats = StatsSet::from( - [ - (Stat::Max, scalar.clone()), - (Stat::Min, scalar.clone()), - (Stat::IsConstant, true.into()), - (Stat::IsSorted, true.into()), - (Stat::RunCount, 1.into()), - ] - .into(), - ); - Self { - scalar, - length, - stats: Arc::new(RwLock::new(stats)), - } + let stats = HashMap::from([ + (Stat::Max, scalar.clone()), + (Stat::Min, scalar.clone()), + (Stat::IsConstant, true.into()), + (Stat::IsSorted, true.into()), + (Stat::RunCount, 1.into()), + ]); + Self::try_from_parts( + scalar.dtype().clone(), + ConstantMetadata { scalar, length }, + vec![].into(), + stats, + ) + .unwrap() } pub fn scalar(&self) -> &Scalar { - &self.scalar - } -} - -impl Array for ConstantArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.length - } - - #[inline] - fn is_empty(&self) -> bool { - self.length == 0 - } - - #[inline] - fn dtype(&self) -> &DType { - self.scalar.dtype() - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &ConstantEncoding - } - - fn nbytes(&self) -> usize { - self.scalar.nbytes() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - Ok(()) + &self.metadata().scalar } } -impl ArrayDisplay for ConstantArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("scalar", self.scalar()) - } -} - -impl ArrayValidity for ConstantArray { - fn logical_validity(&self) -> Validity { - match self.scalar().is_null() { - true => Validity::Invalid(self.len()), - false => Validity::Valid(self.len()), - } - } - +impl ArrayValidity for ConstantArray<'_> { fn is_valid(&self, _index: usize) -> bool { - match self.scalar.dtype().is_nullable() { + match self.metadata().scalar.dtype().is_nullable() { true => !self.scalar().is_null(), false => true, } } -} - -#[derive(Debug)] -pub struct ConstantEncoding; -impl ConstantEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.constant"); + fn logical_validity(&self) -> LogicalValidity { + match self.scalar().is_null() { + true => LogicalValidity::AllInvalid(self.len()), + false => LogicalValidity::AllValid(self.len()), + } + } } -#[distributed_slice(ENCODINGS)] -static ENCODINGS_CONSTANT: EncodingRef = &ConstantEncoding; - -impl Encoding for ConstantEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl AcceptArrayVisitor for ConstantArray<'_> { + fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + Ok(()) } +} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for ConstantArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } + +impl EncodingCompression for ConstantEncoding {} diff --git a/vortex-array/src/array/constant/serde.rs b/vortex-array/src/array/constant/serde.rs deleted file mode 100644 index e2e70e2f3f..0000000000 --- a/vortex-array/src/array/constant/serde.rs +++ /dev/null @@ -1,46 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::constant::{ConstantArray, ConstantEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for ConstantArray { - fn write(&self, ctx: &mut WriteCtx<'_>) -> VortexResult<()> { - ctx.write_usize(self.len())?; - ctx.scalar(self.scalar()) - } - - fn metadata(&self) -> VortexResult>> { - // FIXME(ngates): use flatbuffer / serde. - let mut vec = Vec::new(); - let mut ctx = WriteCtx::new(&mut vec); - ctx.write_usize(self.len())?; - ctx.scalar(self.scalar())?; - Ok(Some(vec)) - } -} - -impl EncodingSerde for ConstantEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let scalar = ctx.scalar()?; - Ok(ConstantArray::new(scalar, len).into_array()) - } -} - -#[cfg(test)] -mod test { - use crate::array::constant::ConstantArray; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::Array; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = ConstantArray::new(42i32, 100); - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!(arr.scalar(), read_arr.as_constant().scalar()); - assert_eq!(arr.len(), read_arr.len()); - } -} diff --git a/vortex-array/src/array/constant/stats.rs b/vortex-array/src/array/constant/stats.rs index f5a31f9b0f..f5f5923523 100644 --- a/vortex-array/src/array/constant/stats.rs +++ b/vortex-array/src/array/constant/stats.rs @@ -1,26 +1,25 @@ +use std::collections::HashMap; + use vortex_error::VortexResult; use vortex_schema::DType; use crate::array::constant::ConstantArray; -use crate::array::Array; use crate::scalar::Scalar; -use crate::stats::{Stat, StatsCompute, StatsSet}; +use crate::stats::{ArrayStatisticsCompute, Stat}; +use crate::{ArrayDType, ArrayTrait}; -impl StatsCompute for ConstantArray { - fn compute(&self, _stat: &Stat) -> VortexResult { +impl ArrayStatisticsCompute for ConstantArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { if matches!(self.dtype(), &DType::Bool(_)) { let Scalar::Bool(b) = self.scalar() else { unreachable!("Got bool dtype without bool scalar") }; - return Ok(StatsSet::from( - [( - Stat::TrueCount, - (self.len() as u64 * b.value().cloned().map(|v| v as u64).unwrap_or(0)).into(), - )] - .into(), - )); + return Ok([( + Stat::TrueCount, + (self.len() as u64 * b.value().cloned().map(|v| v as u64).unwrap_or(0)).into(), + )] + .into()); } - - Ok(StatsSet::default()) + Ok(HashMap::default()) } } diff --git a/vortex-array/src/datetime/README.md b/vortex-array/src/array/datetime/README.md similarity index 100% rename from vortex-array/src/datetime/README.md rename to vortex-array/src/array/datetime/README.md diff --git a/vortex-array2/src/array/datetime/localdatetime.rs b/vortex-array/src/array/datetime/localdatetime.rs similarity index 88% rename from vortex-array2/src/array/datetime/localdatetime.rs rename to vortex-array/src/array/datetime/localdatetime.rs index 7146c709ed..ccdd828efe 100644 --- a/vortex-array2/src/array/datetime/localdatetime.rs +++ b/vortex-array/src/array/datetime/localdatetime.rs @@ -5,13 +5,13 @@ use arrow_array::{ TimestampNanosecondArray, TimestampSecondArray, }; use serde::{Deserialize, Serialize}; -use vortex::ptype::PType; use vortex_error::VortexResult; use crate::array::datetime::TimeUnit; use crate::compute::as_arrow::AsArrowArray; use crate::compute::cast::cast; use crate::impl_composite; +use crate::ptype::PType; use crate::validity::ArrayValidity; impl_composite!("vortex.localdatetime", LocalDateTime); @@ -23,6 +23,17 @@ pub struct LocalDateTime { time_unit: TimeUnit, } +impl LocalDateTime { + pub fn new(time_unit: TimeUnit) -> Self { + Self { time_unit } + } + + #[inline] + pub fn time_unit(&self) -> TimeUnit { + self.time_unit + } +} + impl ArrayCompute for LocalDateTimeArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) diff --git a/vortex-array2/src/array/datetime/mod.rs b/vortex-array/src/array/datetime/mod.rs similarity index 100% rename from vortex-array2/src/array/datetime/mod.rs rename to vortex-array/src/array/datetime/mod.rs diff --git a/vortex-array/src/array/downcast.rs b/vortex-array/src/array/downcast.rs deleted file mode 100644 index 892cb6d767..0000000000 --- a/vortex-array/src/array/downcast.rs +++ /dev/null @@ -1,150 +0,0 @@ -use crate::array::bool::BoolArray; -use crate::array::chunked::ChunkedArray; -use crate::array::composite::CompositeArray; -use crate::array::constant::ConstantArray; -use crate::array::primitive::PrimitiveArray; -use crate::array::sparse::SparseArray; -use crate::array::struct_::StructArray; -use crate::array::varbin::VarBinArray; -use crate::array::varbinview::VarBinViewArray; -use crate::array::{Array, ArrayRef}; - -mod private { - pub trait Sealed {} -} - -pub trait DowncastArrayBuiltin: private::Sealed { - fn maybe_primitive(&self) -> Option<&PrimitiveArray>; - - fn as_primitive(&self) -> &PrimitiveArray { - self.maybe_primitive().unwrap() - } - - fn maybe_bool(&self) -> Option<&BoolArray>; - - fn as_bool(&self) -> &BoolArray { - self.maybe_bool().unwrap() - } - - fn maybe_varbin(&self) -> Option<&VarBinArray>; - - fn as_varbin(&self) -> &VarBinArray { - self.maybe_varbin().unwrap() - } - - fn maybe_varbinview(&self) -> Option<&VarBinViewArray>; - - fn as_varbinview(&self) -> &VarBinViewArray { - self.maybe_varbinview().unwrap() - } - - fn maybe_composite(&self) -> Option<&CompositeArray>; - - fn as_composite(&self) -> &CompositeArray { - self.maybe_composite().unwrap() - } - - fn maybe_struct(&self) -> Option<&StructArray>; - - fn as_struct(&self) -> &StructArray { - self.maybe_struct().unwrap() - } - - fn maybe_sparse(&self) -> Option<&SparseArray>; - - fn as_sparse(&self) -> &SparseArray { - self.maybe_sparse().unwrap() - } - - fn maybe_constant(&self) -> Option<&ConstantArray>; - - fn as_constant(&self) -> &ConstantArray { - self.maybe_constant().unwrap() - } - - fn maybe_chunked(&self) -> Option<&ChunkedArray>; - - fn as_chunked(&self) -> &ChunkedArray { - self.maybe_chunked().unwrap() - } -} - -impl private::Sealed for dyn Array + '_ {} - -impl DowncastArrayBuiltin for dyn Array + '_ { - fn maybe_primitive(&self) -> Option<&PrimitiveArray> { - self.as_any().downcast_ref() - } - - fn maybe_bool(&self) -> Option<&BoolArray> { - self.as_any().downcast_ref() - } - - fn maybe_varbin(&self) -> Option<&VarBinArray> { - self.as_any().downcast_ref() - } - - fn maybe_varbinview(&self) -> Option<&VarBinViewArray> { - self.as_any().downcast_ref() - } - - fn maybe_composite(&self) -> Option<&CompositeArray> { - self.as_any().downcast_ref() - } - - fn maybe_struct(&self) -> Option<&StructArray> { - self.as_any().downcast_ref() - } - - fn maybe_sparse(&self) -> Option<&SparseArray> { - self.as_any().downcast_ref() - } - - fn maybe_constant(&self) -> Option<&ConstantArray> { - self.as_any().downcast_ref() - } - - fn maybe_chunked(&self) -> Option<&ChunkedArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastArrayBuiltin for ArrayRef { - fn maybe_primitive(&self) -> Option<&PrimitiveArray> { - self.as_ref().maybe_primitive() - } - - fn maybe_bool(&self) -> Option<&BoolArray> { - self.as_ref().maybe_bool() - } - - fn maybe_varbin(&self) -> Option<&VarBinArray> { - self.as_any().downcast_ref() - } - - fn maybe_varbinview(&self) -> Option<&VarBinViewArray> { - self.as_any().downcast_ref() - } - - fn maybe_composite(&self) -> Option<&CompositeArray> { - self.as_any().downcast_ref() - } - - fn maybe_struct(&self) -> Option<&StructArray> { - self.as_any().downcast_ref() - } - - fn maybe_sparse(&self) -> Option<&SparseArray> { - self.as_any().downcast_ref() - } - - fn maybe_constant(&self) -> Option<&ConstantArray> { - self.as_any().downcast_ref() - } - - fn maybe_chunked(&self) -> Option<&ChunkedArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-array/src/array/mod.rs b/vortex-array/src/array/mod.rs index 788336e2c1..4ba6ea80f2 100644 --- a/vortex-array/src/array/mod.rs +++ b/vortex-array/src/array/mod.rs @@ -1,272 +1,10 @@ -use std::any::Any; -use std::fmt::{Debug, Display, Formatter}; -use std::sync::Arc; - -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, Nullability}; - -use crate::array::bool::{BoolArray, BoolEncoding}; -use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; -use crate::array::composite::{CompositeArray, CompositeEncoding}; -use crate::array::constant::{ConstantArray, ConstantEncoding}; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use crate::array::sparse::{SparseArray, SparseEncoding}; -use crate::array::struct_::{StructArray, StructEncoding}; -use crate::array::varbin::{VarBinArray, VarBinEncoding}; -use crate::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; -use crate::compute::ArrayCompute; -use crate::encoding::EncodingRef; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::ArraySerde; -use crate::stats::Stats; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::ArrayWalker; - pub mod bool; pub mod chunked; pub mod composite; pub mod constant; -pub mod downcast; +pub mod datetime; pub mod primitive; pub mod sparse; -pub mod struct_; +pub mod r#struct; pub mod varbin; pub mod varbinview; - -pub type ArrayRef = Arc; - -/// A Vortex Array is the base object representing all arrays in enc. -/// -/// Arrays have a dtype and an encoding. DTypes represent the logical type of the -/// values stored in a vortex array. Encodings represent the physical layout of the -/// array. -/// -/// This differs from Apache Arrow where logical and physical are combined in -/// the data type, e.g. LargeString, RunEndEncoded. -pub trait Array: ArrayValidity + ArrayDisplay + Debug + Send + Sync { - /// Converts itself to a reference of [`Any`], which enables downcasting to concrete types. - fn as_any(&self) -> &dyn Any; - fn into_any(self: Arc) -> Arc; - fn to_array(&self) -> ArrayRef; - fn into_array(self) -> ArrayRef; - - /// Get the length of the array - fn len(&self) -> usize; - /// Check whether the array is empty - fn is_empty(&self) -> bool; - /// Get the dtype of the array - fn dtype(&self) -> &DType; - /// Get the nullability of the array - fn nullability(&self) -> Nullability { - self.dtype().nullability() - } - - /// Get statistics for the array - /// TODO(ngates): this is interesting. What type do we return from this? - /// Maybe we actually need to model stats more like compute? - fn stats(&self) -> Stats; - - /// Encoding kind of the array - fn encoding(&self) -> EncodingRef; - /// Approximate size in bytes of the array. Only takes into account variable size portion of the array - - fn nbytes(&self) -> usize; - - fn with_compute_mut( - &self, - _f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()>; - - fn serde(&self) -> Option<&dyn ArraySerde> { - None - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()>; -} - -pub trait WithArrayCompute { - fn with_compute VortexResult>(&self, f: F) - -> VortexResult; -} - -impl WithArrayCompute for dyn Array + '_ { - #[inline] - fn with_compute VortexResult>( - &self, - f: F, - ) -> VortexResult { - let mut result: Option = None; - self.with_compute_mut(&mut |compute| { - result = Some(f(compute)?); - Ok(()) - })?; - Ok(result.unwrap()) - } -} - -pub trait IntoArray { - fn into_array(self) -> ArrayRef; -} - -#[macro_export] -macro_rules! impl_array { - () => { - #[inline] - fn as_any(&self) -> &dyn std::any::Any { - self - } - - #[inline] - fn into_any(self: std::sync::Arc) -> std::sync::Arc { - self - } - - #[inline] - fn to_array(&self) -> ArrayRef { - self.clone().into_array() - } - - #[inline] - fn into_array(self) -> ArrayRef { - std::sync::Arc::new(self) - } - }; -} -pub use impl_array; - -impl Array for ArrayRef { - fn as_any(&self) -> &dyn Any { - self.as_ref().as_any() - } - - fn into_any(self: Arc) -> Arc { - self - } - - fn to_array(&self) -> ArrayRef { - self.as_ref().to_array() - } - - fn into_array(self) -> ArrayRef { - self - } - - fn len(&self) -> usize { - self.as_ref().len() - } - - fn is_empty(&self) -> bool { - self.as_ref().is_empty() - } - - fn dtype(&self) -> &DType { - self.as_ref().dtype() - } - - fn stats(&self) -> Stats { - self.as_ref().stats() - } - - fn encoding(&self) -> EncodingRef { - self.as_ref().encoding() - } - - fn nbytes(&self) -> usize { - self.as_ref().nbytes() - } - - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - self.as_ref().with_compute_mut(f) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - self.as_ref().serde() - } - - #[allow(unused_variables)] - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - self.as_ref().walk(walker) - } -} - -impl ArrayValidity for ArrayRef { - fn logical_validity(&self) -> Validity { - self.as_ref().logical_validity() - } - - fn is_valid(&self, index: usize) -> bool { - self.as_ref().is_valid(index) - } -} - -impl ArrayDisplay for ArrayRef { - fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { - ArrayDisplay::fmt(self.as_ref(), fmt) - } -} - -pub fn check_validity_buffer(validity: Option<&ArrayRef>, expected_len: usize) -> VortexResult<()> { - if let Some(v) = validity { - if !matches!(v.dtype(), DType::Bool(Nullability::NonNullable)) { - vortex_bail!(MismatchedTypes: DType::Bool(Nullability::NonNullable), v.dtype()); - } - if v.len() != expected_len { - vortex_bail!( - "Validity buffer {} has incorrect length {}, expected {}", - v, - v.len(), - expected_len - ); - } - } - - Ok(()) -} - -#[derive(Debug, Clone)] -pub enum ArrayKind<'a> { - Bool(&'a BoolArray), - Chunked(&'a ChunkedArray), - Composite(&'a CompositeArray), - Constant(&'a ConstantArray), - Primitive(&'a PrimitiveArray), - Sparse(&'a SparseArray), - Struct(&'a StructArray), - VarBin(&'a VarBinArray), - VarBinView(&'a VarBinViewArray), - Other(&'a dyn Array), -} - -impl<'a> From<&'a dyn Array> for ArrayKind<'a> { - fn from(value: &'a dyn Array) -> Self { - match value.encoding().id() { - BoolEncoding::ID => ArrayKind::Bool(value.as_bool()), - ChunkedEncoding::ID => ArrayKind::Chunked(value.as_chunked()), - CompositeEncoding::ID => ArrayKind::Composite(value.as_composite()), - ConstantEncoding::ID => ArrayKind::Constant(value.as_constant()), - PrimitiveEncoding::ID => ArrayKind::Primitive(value.as_primitive()), - SparseEncoding::ID => ArrayKind::Sparse(value.as_sparse()), - StructEncoding::ID => ArrayKind::Struct(value.as_struct()), - VarBinEncoding::ID => ArrayKind::VarBin(value.as_varbin()), - VarBinViewEncoding::ID => ArrayKind::VarBinView(value.as_varbinview()), - _ => ArrayKind::Other(value), - } - } -} - -impl Display for dyn Array + '_ { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}({}, len={})", - self.encoding().id(), - self.dtype(), - self.len() - ) - } -} diff --git a/vortex-array/src/array/primitive/accessor.rs b/vortex-array/src/array/primitive/accessor.rs new file mode 100644 index 0000000000..4e43cbf6c2 --- /dev/null +++ b/vortex-array/src/array/primitive/accessor.rs @@ -0,0 +1,28 @@ +use vortex_error::VortexResult; + +use crate::accessor::ArrayAccessor; +use crate::array::primitive::PrimitiveArray; +use crate::ptype::NativePType; +use crate::validity::ArrayValidity; + +impl ArrayAccessor for PrimitiveArray<'_> { + fn with_iterator(&self, f: F) -> VortexResult + where + F: for<'a> FnOnce(&mut (dyn Iterator>)) -> R, + { + match self.logical_validity().to_null_buffer()? { + None => { + let mut iter = self.typed_data::().iter().map(Some); + Ok(f(&mut iter)) + } + Some(nulls) => { + let mut iter = self + .typed_data::() + .iter() + .zip(nulls.iter()) + .map(|(value, valid)| if valid { Some(value) } else { None }); + Ok(f(&mut iter)) + } + } + } +} diff --git a/vortex-array/src/array/primitive/compute/as_arrow.rs b/vortex-array/src/array/primitive/compute/as_arrow.rs index 2773dfb5f4..9893a70ae4 100644 --- a/vortex-array/src/array/primitive/compute/as_arrow.rs +++ b/vortex-array/src/array/primitive/compute/as_arrow.rs @@ -6,20 +6,36 @@ use arrow_array::{ use arrow_buffer::ScalarBuffer; use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; -use crate::arrow::wrappers::as_nulls; +use crate::array::primitive::PrimitiveArray; use crate::compute::as_arrow::AsArrowArray; -use crate::ptype::{AsArrowPrimitiveType, NativePType}; +use crate::ptype::PType; +use crate::validity::ArrayValidity; +use crate::ArrayTrait; -impl AsArrowArray for &dyn PrimitiveTrait { +impl AsArrowArray for PrimitiveArray<'_> { fn as_arrow(&self) -> VortexResult { - Ok(Arc::new( - ArrowPrimitiveArray::<::ArrowType>::new( - ScalarBuffer::< - <::ArrowType as ArrowPrimitiveType>::Native, - >::new(self.buffer().clone(), 0, self.len()), - as_nulls(self.logical_validity())?, - ), - )) + use arrow_array::types::*; + Ok(match self.ptype() { + PType::U8 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::U64 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I8 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::I64 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F16 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F32 => Arc::new(as_arrow_array_primitive::(self)?), + PType::F64 => Arc::new(as_arrow_array_primitive::(self)?), + }) } } + +fn as_arrow_array_primitive( + array: &PrimitiveArray, +) -> VortexResult> { + Ok(ArrowPrimitiveArray::new( + ScalarBuffer::::new(array.buffer().clone().into(), 0, array.len()), + array.logical_validity().to_null_buffer()?, + )) +} diff --git a/vortex-array/src/array/primitive/compute/as_contiguous.rs b/vortex-array/src/array/primitive/compute/as_contiguous.rs index 1a33515d06..a1ab4cfee5 100644 --- a/vortex-array/src/array/primitive/compute/as_contiguous.rs +++ b/vortex-array/src/array/primitive/compute/as_contiguous.rs @@ -1,50 +1,30 @@ -use itertools::Itertools; -use vortex_alloc::{AlignedVec, ALIGNED_ALLOCATOR}; -use vortex_error::{vortex_bail, VortexResult}; +use arrow_buffer::{MutableBuffer, ScalarBuffer}; +use vortex_error::VortexResult; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::compute::PrimitiveTrait; use crate::array::primitive::PrimitiveArray; -use crate::array::{Array, ArrayRef}; use crate::compute::as_contiguous::AsContiguousFn; -use crate::ptype::NativePType; -use crate::validity::ArrayValidity; use crate::validity::Validity; +use crate::{match_each_native_ptype, ArrayDType}; +use crate::{Array, IntoArray, OwnedArray}; -impl AsContiguousFn for &dyn PrimitiveTrait { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { - if !arrays - .iter() - .map(|chunk| chunk.as_primitive().ptype()) - .all_equal() - { - vortex_bail!(ComputeError: "Chunks have differing ptypes"); - } - +impl AsContiguousFn for PrimitiveArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { let validity = if self.dtype().is_nullable() { - Some(Validity::from_iter( - arrays.iter().map(|v| v.logical_validity()), - )) + Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) } else { - None + Validity::NonNullable }; - Ok(PrimitiveArray::from_nullable_in( - native_primitive_as_contiguous( - arrays - .iter() - .map(|a| a.as_primitive().typed_data::()) - .collect(), - ), - validity, - ) - .into_array()) + let mut buffer = MutableBuffer::with_capacity( + arrays.iter().map(|a| a.len()).sum::() * self.ptype().byte_width(), + ); + for array in arrays { + buffer.extend_from_slice(array.as_primitive().buffer().as_slice()) + } + match_each_native_ptype!(self.ptype(), |$T| { + Ok(PrimitiveArray::try_new(ScalarBuffer::<$T>::from(buffer), validity) + .unwrap() + .into_array()) + }) } } - -fn native_primitive_as_contiguous(arrays: Vec<&[P]>) -> AlignedVec

{ - let len = arrays.iter().map(|a| a.len()).sum(); - let mut result = AlignedVec::with_capacity_in(len, ALIGNED_ALLOCATOR); - arrays.iter().for_each(|arr| result.extend_from_slice(arr)); - result -} diff --git a/vortex-array/src/array/primitive/compute/cast.rs b/vortex-array/src/array/primitive/compute/cast.rs index 46a4bcca3f..c607b67698 100644 --- a/vortex-array/src/array/primitive/compute/cast.rs +++ b/vortex-array/src/array/primitive/compute/cast.rs @@ -1,66 +1,75 @@ use vortex_error::{vortex_err, VortexResult}; use vortex_schema::DType; -use crate::array::primitive::compute::PrimitiveTrait; use crate::array::primitive::PrimitiveArray; -use crate::array::Validity; -use crate::array::{Array, ArrayRef}; use crate::compute::cast::CastFn; -use crate::match_each_native_ptype; use crate::ptype::{NativePType, PType}; +use crate::validity::Validity; +use crate::{match_each_native_ptype, ArrayDType}; +use crate::{IntoArray, OwnedArray}; -impl CastFn for &dyn PrimitiveTrait { - fn cast(&self, dtype: &DType) -> VortexResult { - // TODO(ngates): check validity - let into_ptype = PType::try_from(dtype)?; - if into_ptype == self.ptype() { - Ok(self.to_array()) - } else { - match_each_native_ptype!(into_ptype, |$P| { - Ok(PrimitiveArray::from_nullable( - cast::(self.typed_data())?, - Validity::try_from_logical(self.logical_validity(), self.nullability())?, - ).into_array()) +impl CastFn for PrimitiveArray<'_> { + fn cast(&self, dtype: &DType) -> VortexResult { + let ptype = PType::try_from(dtype)?; + + // Short-cut if we can just change the nullability + if self.ptype() == ptype && !self.dtype().is_nullable() && dtype.is_nullable() { + match_each_native_ptype!(self.ptype(), |$T| { + return Ok( + PrimitiveArray::try_new(self.scalar_buffer::<$T>(), Validity::AllValid)? + .into_array(), + ); }) } + + // FIXME(ngates): #260 - check validity and nullability + match_each_native_ptype!(ptype, |$T| { + Ok(PrimitiveArray::from_vec( + cast::<$T>(self)?, + self.validity().clone(), + ).into_array()) + }) } } -fn cast(array: &[P]) -> VortexResult> { - array - .iter() - // TODO(ngates): allow configurable checked/unchecked casting - .map(|&v| { - T::from(v) - .ok_or_else(|| vortex_err!(ComputeError: "Failed to cast {} to {}", v, T::PTYPE)) - }) - .collect() +fn cast(array: &PrimitiveArray) -> VortexResult> { + match_each_native_ptype!(array.ptype(), |$E| { + array + .typed_data::<$E>() + .iter() + // TODO(ngates): allow configurable checked/unchecked casting + .map(|&v| { + T::from(v).ok_or_else(|| { + vortex_err!(ComputeError: "Failed to cast {} to {:?}", v, T::PTYPE) + }) + }) + .collect() + }) } #[cfg(test)] mod test { use vortex_error::VortexError; - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::IntoArray; - use crate::compute; use crate::ptype::PType; + use crate::{compute, IntoArray}; #[test] fn cast_u32_u8() { let arr = vec![0u32, 10, 200].into_array(); - let u8arr = compute::cast::cast(&arr, PType::U8.into()).unwrap(); - assert_eq!(u8arr.as_primitive().typed_data::(), vec![0u8, 10, 200]); + let p = compute::cast::cast(&arr, PType::U8.into()) + .unwrap() + .into_primitive(); + assert_eq!(p.typed_data::(), vec![0u8, 10, 200]); } #[test] fn cast_u32_f32() { let arr = vec![0u32, 10, 200].into_array(); - let u8arr = compute::cast::cast(&arr, PType::F32.into()).unwrap(); - assert_eq!( - u8arr.as_primitive().typed_data::(), - vec![0.0f32, 10., 200.] - ); + let u8arr = compute::cast::cast(&arr, PType::F32.into()) + .unwrap() + .into_primitive(); + assert_eq!(u8arr.typed_data::(), vec![0.0f32, 10., 200.]); } #[test] @@ -70,6 +79,6 @@ mod test { let VortexError::ComputeError(s, _) = error else { unreachable!() }; - assert_eq!(s.to_string(), "Failed to cast -1 to u32"); + assert_eq!(s.to_string(), "Failed to cast -1 to U32"); } } diff --git a/vortex-array/src/array/primitive/compute/fill.rs b/vortex-array/src/array/primitive/compute/fill.rs index cbf2357884..abacba2ded 100644 --- a/vortex-array/src/array/primitive/compute/fill.rs +++ b/vortex-array/src/array/primitive/compute/fill.rs @@ -1,72 +1,71 @@ use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; use crate::array::primitive::PrimitiveArray; -use crate::array::{Array, ArrayRef, IntoArray}; use crate::compute::fill::FillForwardFn; -use crate::ptype::NativePType; +use crate::match_each_native_ptype; +use crate::validity::ArrayValidity; +use crate::{IntoArray, OwnedArray, ToArrayData}; -impl FillForwardFn for &dyn PrimitiveTrait { - fn fill_forward(&self) -> VortexResult { - if self.validity().is_none() { - return Ok(self.to_array()); - } - - let validity = self.validity().unwrap(); - if validity.all_valid() { - return Ok(PrimitiveArray::new(self.ptype(), self.buffer().clone(), None).into_array()); - } - - let mut last_value = T::zero(); - let filled = self - .typed_data() - .iter() - .zip(validity.to_bool_array().into_buffer().iter()) - .map(|(v, valid)| { - if valid { - last_value = *v; - } - last_value - }) - .collect::>(); - Ok(filled.into_array()) +impl FillForwardFn for PrimitiveArray<'_> { + fn fill_forward(&self) -> VortexResult { + let validity = self.logical_validity(); + let Some(nulls) = validity.to_null_buffer()? else { + return Ok(self.to_array_data().into_array()); + }; + match_each_native_ptype!(self.ptype(), |$T| { + let typed_data = self.typed_data::<$T>(); + let mut last_value = $T::default(); + let filled = typed_data + .iter() + .zip(nulls.into_iter()) + .map(|(v, valid)| { + if valid { + last_value = *v; + } + last_value + }) + .collect::>(); + Ok(filled.into_array()) + }) } } #[cfg(test)] mod test { - use crate::array::downcast::DowncastArrayBuiltin; + use crate::array::bool::BoolArray; use crate::array::primitive::PrimitiveArray; - use crate::compute; - use crate::validity::OwnedValidity; + use crate::validity::{ArrayValidity, Validity}; + use crate::{compute, IntoArray}; #[test] fn leading_none() { - let arr = PrimitiveArray::from_iter(vec![None, Some(8u8), None, Some(10), None]); - let filled = compute::fill::fill_forward(&arr).unwrap(); - let filled_primitive = filled.as_primitive(); - assert_eq!(filled_primitive.typed_data::(), vec![0, 8, 8, 10, 10]); - assert!(filled_primitive.validity().is_none()); + let arr = PrimitiveArray::from_nullable_vec(vec![None, Some(8u8), None, Some(10), None]) + .into_array(); + let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + assert_eq!(p.typed_data::(), vec![0, 8, 8, 10, 10]); + assert!(p.logical_validity().is_all_valid()); } #[test] fn all_none() { - let arr = PrimitiveArray::from_iter(vec![Option::::None, None, None, None, None]); - let filled = compute::fill::fill_forward(&arr).unwrap(); - let filled_primitive = filled.as_primitive(); - assert_eq!(filled_primitive.typed_data::(), vec![0, 0, 0, 0, 0]); - assert!(filled_primitive.validity().is_none()); + let arr = + PrimitiveArray::from_nullable_vec(vec![Option::::None, None, None, None, None]) + .into_array(); + + let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + assert_eq!(p.typed_data::(), vec![0, 0, 0, 0, 0]); + assert!(p.logical_validity().is_all_valid()); } #[test] fn nullable_non_null() { - let arr = PrimitiveArray::from_nullable( + let arr = PrimitiveArray::from_vec( vec![8u8, 10u8, 12u8, 14u8, 16u8], - Some(vec![true, true, true, true, true].into()), - ); - let filled = compute::fill::fill_forward(&arr).unwrap(); - let filled_primitive = filled.as_primitive(); - assert_eq!(filled_primitive.typed_data::(), vec![8, 10, 12, 14, 16]); - assert!(filled_primitive.validity().is_none()); + Validity::Array(BoolArray::from(vec![true, true, true, true, true]).into_array()), + ) + .into_array(); + let p = compute::fill::fill_forward(&arr).unwrap().into_primitive(); + assert_eq!(p.typed_data::(), vec![8, 10, 12, 14, 16]); + assert!(p.logical_validity().is_all_valid()); } } diff --git a/vortex-array/src/array/primitive/compute/flatten.rs b/vortex-array/src/array/primitive/compute/flatten.rs deleted file mode 100644 index 4d4b13e66f..0000000000 --- a/vortex-array/src/array/primitive/compute/flatten.rs +++ /dev/null @@ -1,11 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::primitive::compute::PrimitiveTrait; -use crate::compute::flatten::{FlattenFn, FlattenedArray}; -use crate::ptype::NativePType; - -impl FlattenFn for &dyn PrimitiveTrait { - fn flatten(&self) -> VortexResult { - Ok(FlattenedArray::Primitive(self.to_primitive())) - } -} diff --git a/vortex-array/src/array/primitive/compute/mod.rs b/vortex-array/src/array/primitive/compute/mod.rs index a20050c532..180cc99601 100644 --- a/vortex-array/src/array/primitive/compute/mod.rs +++ b/vortex-array/src/array/primitive/compute/mod.rs @@ -1,49 +1,24 @@ -use std::fmt::Debug; - -use arrow_buffer::Buffer; - use crate::array::primitive::PrimitiveArray; -use crate::array::Array; use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::AsContiguousFn; use crate::compute::cast::CastFn; use crate::compute::fill::FillForwardFn; -use crate::compute::flatten::FlattenFn; -use crate::compute::patch::PatchFn; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::search_sorted::SearchSortedFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; use crate::compute::ArrayCompute; -use crate::ptype::{AsArrowPrimitiveType, NativePType, PType}; -use crate::validity::OwnedValidity; mod as_arrow; mod as_contiguous; mod cast; mod fill; -mod flatten; -mod patch; mod scalar_at; mod search_sorted; mod slice; mod take; -pub(crate) trait PrimitiveTrait: - OwnedValidity + Array + Debug + Send + Sync -{ - fn ptype(&self) -> PType; - - fn buffer(&self) -> &Buffer; - - fn to_primitive(&self) -> PrimitiveArray; - - fn typed_data(&self) -> &[T] { - self.buffer().typed_data::() - } -} - -impl ArrayCompute for &dyn PrimitiveTrait { +impl ArrayCompute for PrimitiveArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } @@ -56,18 +31,10 @@ impl ArrayCompute for &dyn PrimitiveTrait Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { Some(self) } - fn patch(&self) -> Option<&dyn PatchFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } diff --git a/vortex-array/src/array/primitive/compute/patch.rs b/vortex-array/src/array/primitive/compute/patch.rs deleted file mode 100644 index bea2bcdcd5..0000000000 --- a/vortex-array/src/array/primitive/compute/patch.rs +++ /dev/null @@ -1,49 +0,0 @@ -use itertools::Itertools; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::compute::PrimitiveTrait; -use crate::array::primitive::PrimitiveArray; -use crate::array::sparse::{SparseArray, SparseEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::compute; -use crate::compute::patch::PatchFn; -use crate::ptype::NativePType; -use crate::view::ToOwnedView; - -impl PatchFn for &dyn PrimitiveTrait { - fn patch(&self, patch: &dyn Array) -> VortexResult { - match patch.encoding().id() { - SparseEncoding::ID => patch_with_sparse(*self, patch.as_sparse()), - // TODO(ngates): support a default implementation based on iter_arrow? - _ => Err(vortex_err!(NotImplemented: "patch", patch.encoding().id().name())), - } - } -} - -fn patch_with_sparse( - array: &dyn PrimitiveTrait, - patch: &SparseArray, -) -> VortexResult { - let patch_indices = patch.resolved_indices(); - let mut values = Vec::from(array.typed_data()); - let patch_values = compute::flatten::flatten_primitive(patch.values())?; - - if array.ptype() != patch_values.ptype() { - vortex_bail!(MismatchedTypes: array.dtype(), Array::dtype(&patch_values)) - } - - for (idx, value) in patch_indices - .iter() - .zip_eq(patch_values.typed_data().iter()) - { - values[*idx] = *value; - } - - Ok(PrimitiveArray::from_nullable( - values, - // TODO(ngates): if patch values has null, we need to patch into the validity buffer - array.validity().map(|v| v.to_owned_view()), - ) - .into_array()) -} diff --git a/vortex-array/src/array/primitive/compute/scalar_at.rs b/vortex-array/src/array/primitive/compute/scalar_at.rs index 05c4db5f83..a7b02b73c1 100644 --- a/vortex-array/src/array/primitive/compute/scalar_at.rs +++ b/vortex-array/src/array/primitive/compute/scalar_at.rs @@ -1,19 +1,22 @@ use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; +use crate::array::primitive::PrimitiveArray; use crate::compute::scalar_at::ScalarAtFn; -use crate::ptype::NativePType; -use crate::scalar::{PrimitiveScalar, Scalar}; +use crate::match_each_native_ptype; +use crate::scalar::PrimitiveScalar; +use crate::scalar::Scalar; +use crate::validity::ArrayValidity; +use crate::ArrayDType; -impl ScalarAtFn for &dyn PrimitiveTrait { +impl ScalarAtFn for PrimitiveArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - Ok(PrimitiveScalar::try_new( - self.validity() - .map(|v| v.is_valid(index)) - .unwrap_or(true) - .then(|| self.typed_data()[index]), - self.dtype().nullability(), - )? - .into()) + match_each_native_ptype!(self.ptype(), |$T| { + Ok(PrimitiveScalar::try_new( + self.is_valid(index) + .then(|| self.typed_data::<$T>()[index]), + self.dtype().nullability(), + )? + .into()) + }) } } diff --git a/vortex-array/src/array/primitive/compute/search_sorted.rs b/vortex-array/src/array/primitive/compute/search_sorted.rs index b2095e3854..eb7e20d82f 100644 --- a/vortex-array/src/array/primitive/compute/search_sorted.rs +++ b/vortex-array/src/array/primitive/compute/search_sorted.rs @@ -1,51 +1,45 @@ use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; +use crate::array::primitive::PrimitiveArray; use crate::compute::search_sorted::{SearchResult, SearchSorted}; use crate::compute::search_sorted::{SearchSortedFn, SearchSortedSide}; -use crate::ptype::NativePType; +use crate::match_each_native_ptype; use crate::scalar::Scalar; -impl SearchSortedFn for &dyn PrimitiveTrait { +impl SearchSortedFn for PrimitiveArray<'_> { fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult { - let pvalue: T = value.try_into()?; - Ok(self.typed_data().search_sorted(&pvalue, side)) + match_each_native_ptype!(self.ptype(), |$T| { + let pvalue: $T = value.try_into()?; + Ok(self.typed_data::<$T>().search_sorted(&pvalue, side)) + }) } } #[cfg(test)] mod test { use super::*; - use crate::array::IntoArray; use crate::compute::search_sorted::search_sorted; + use crate::IntoArray; #[test] fn test_searchsorted_primitive() { let values = vec![1u16, 2, 3].into_array(); assert_eq!( - search_sorted(&values, 0, SearchSortedSide::Left) - .unwrap() - .to_index(), - 0 + search_sorted(&values, 0, SearchSortedSide::Left).unwrap(), + SearchResult::NotFound(0) ); assert_eq!( - search_sorted(&values, 1, SearchSortedSide::Left) - .unwrap() - .to_index(), - 0 + search_sorted(&values, 1, SearchSortedSide::Left).unwrap(), + SearchResult::Found(0) ); assert_eq!( - search_sorted(&values, 1, SearchSortedSide::Right) - .unwrap() - .to_index(), - 1 + search_sorted(&values, 1, SearchSortedSide::Right).unwrap(), + SearchResult::Found(1) ); assert_eq!( - search_sorted(&values, 4, SearchSortedSide::Left) - .unwrap() - .to_index(), - 3 + search_sorted(&values, 4, SearchSortedSide::Left).unwrap(), + SearchResult::NotFound(3) ); } } diff --git a/vortex-array/src/array/primitive/compute/slice.rs b/vortex-array/src/array/primitive/compute/slice.rs index a68e3683f9..68644dfbfd 100644 --- a/vortex-array/src/array/primitive/compute/slice.rs +++ b/vortex-array/src/array/primitive/compute/slice.rs @@ -1,21 +1,19 @@ use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; use crate::array::primitive::PrimitiveArray; -use crate::array::{Array, ArrayRef}; use crate::compute::slice::SliceFn; -use crate::ptype::NativePType; +use crate::match_each_native_ptype; +use crate::IntoArray; +use crate::OwnedArray; -impl SliceFn for &dyn PrimitiveTrait { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - let byte_start = start * self.ptype().byte_width(); - let byte_length = (stop - start) * self.ptype().byte_width(); - - Ok(PrimitiveArray::new( - self.ptype(), - self.buffer().slice_with_length(byte_start, byte_length), - self.validity().map(|v| v.slice(start, stop)).transpose()?, - ) - .into_array()) +impl SliceFn for PrimitiveArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + match_each_native_ptype!(self.ptype(), |$T| { + Ok(PrimitiveArray::try_new( + self.scalar_buffer::<$T>().slice(start, stop - start), + self.validity().slice(start, stop)?, + )? + .into_array()) + }) } } diff --git a/vortex-array/src/array/primitive/compute/take.rs b/vortex-array/src/array/primitive/compute/take.rs index f0afdf4ad5..70206fa1b1 100644 --- a/vortex-array/src/array/primitive/compute/take.rs +++ b/vortex-array/src/array/primitive/compute/take.rs @@ -1,23 +1,24 @@ use num_traits::PrimInt; use vortex_error::VortexResult; -use crate::array::primitive::compute::PrimitiveTrait; use crate::array::primitive::PrimitiveArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::flatten::flatten_primitive; use crate::compute::take::TakeFn; -use crate::match_each_integer_ptype; use crate::ptype::NativePType; +use crate::IntoArray; +use crate::{match_each_integer_ptype, match_each_native_ptype}; +use crate::{Array, OwnedArray}; -impl TakeFn for &dyn PrimitiveTrait { - fn take(&self, indices: &dyn Array) -> VortexResult { - let validity = self.validity().map(|v| v.take(indices)).transpose()?; - let indices = flatten_primitive(indices)?; - match_each_integer_ptype!(indices.ptype(), |$I| { - Ok(PrimitiveArray::from_nullable( - take_primitive(self.typed_data(), indices.typed_data::<$I>()), - validity, - ).into_array()) +impl TakeFn for PrimitiveArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + let validity = self.validity(); + let indices = indices.clone().flatten_primitive()?; + match_each_native_ptype!(self.ptype(), |$T| { + match_each_integer_ptype!(indices.ptype(), |$I| { + Ok(PrimitiveArray::from_vec( + take_primitive(self.typed_data::<$T>(), indices.typed_data::<$I>()), + validity.take(indices.array())?, + ).into_array()) + }) }) } } diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index cff70808ce..cd78fe09da 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -1,190 +1,76 @@ -use core::cmp::min; -use std::iter; -use std::mem::size_of; -use std::panic::RefUnwindSafe; -use std::ptr::NonNull; -use std::sync::{Arc, RwLock}; - -use allocator_api2::alloc::Allocator; -use arrow_buffer::buffer::{Buffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, ScalarBuffer}; use itertools::Itertools; -use linkme::distributed_slice; use num_traits::AsPrimitive; -pub use view::*; +use serde::{Deserialize, Serialize}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, Nullability}; -use crate::accessor::ArrayAccessor; -use crate::array::primitive::compute::PrimitiveTrait; -use crate::array::IntoArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::iterator::ArrayIter; -use crate::ptype::{match_each_native_ptype, NativePType, PType}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsSet}; -use crate::validity::{ArrayValidity, OwnedValidity}; -use crate::validity::{Validity, ValidityView}; -use crate::view::{AsView, ToOwnedView}; -use crate::{impl_array, ArrayWalker}; +use crate::buffer::Buffer; +use crate::ptype::{NativePType, PType}; +use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{impl_encoding, ArrayDType, OwnedArray}; +use crate::{match_each_native_ptype, ArrayFlatten}; +mod accessor; mod compute; -mod serde; mod stats; -mod view; -#[derive(Debug, Clone)] -pub struct PrimitiveArray { - buffer: Buffer, - ptype: PType, - dtype: DType, - validity: Option, - stats: Arc>, -} +impl_encoding!("vortex.primitive", Primitive); -impl PrimitiveArray { - pub fn new(ptype: PType, buffer: Buffer, validity: Option) -> Self { - Self::try_new(ptype, buffer, validity).unwrap() - } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PrimitiveMetadata { + validity: ValidityMetadata, +} - pub fn try_new(ptype: PType, buffer: Buffer, validity: Option) -> VortexResult { - if let Some(v) = validity.as_view() { - if v.len() != buffer.len() / ptype.byte_width() { - vortex_bail!("Validity length does not match buffer length"); - } - } - let dtype = DType::from(ptype).with_nullability(validity.is_some().into()); +impl PrimitiveArray<'_> { + pub fn try_new( + buffer: ScalarBuffer, + validity: Validity, + ) -> VortexResult { Ok(Self { - buffer, - ptype, - dtype, - validity, - stats: Arc::new(RwLock::new(StatsSet::new())), + typed: TypedArray::try_from_parts( + DType::from(T::PTYPE).with_nullability(validity.nullability()), + PrimitiveMetadata { + validity: validity.to_metadata(buffer.len())?, + }, + Some(Buffer::Owned(buffer.into_inner())), + validity.into_array_data().into_iter().collect_vec().into(), + HashMap::default(), + )?, }) } - /// Allocate buffer from allocator-api2 vector. This would be easier when arrow gets https://github.com/apache/arrow-rs/issues/3960 - #[inline] - pub fn from_vec_in( - values: allocator_api2::vec::Vec, - ) -> Self { - Self::from_nullable_in(values, None) + pub fn from_vec(values: Vec, validity: Validity) -> Self { + Self::try_new(ScalarBuffer::from(values), validity).unwrap() } - pub fn from_nullable_in< - T: NativePType, - A: Allocator + RefUnwindSafe + Send + Sync + 'static, - >( - values: allocator_api2::vec::Vec, - validity: Option, - ) -> Self { - let ptr = values.as_ptr(); - let buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new(ptr as _).unwrap(), - values.len() * size_of::(), - Arc::new(values), - ) - }; - Self::new(T::PTYPE, buffer, validity) + pub fn from_nullable_vec(values: Vec>) -> Self { + let elems: Vec = values.iter().map(|v| v.unwrap_or_default()).collect(); + let validity = Validity::from(values.iter().map(|v| v.is_some()).collect::>()); + Self::from_vec(elems, validity) } - pub fn from_nullable(values: Vec, validity: Option) -> Self { - let buffer = Buffer::from_vec::(values); - Self::new(T::PTYPE, buffer, validity) + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(0, &Validity::DTYPE)) } - pub fn from_value(value: T, n: usize) -> Self { - PrimitiveArray::from(iter::repeat(value).take(n).collect::>()) - } - - pub fn null(n: usize) -> Self { - PrimitiveArray::from_nullable( - iter::repeat(T::zero()).take(n).collect::>(), - Some(Validity::Invalid(n)), - ) - } - - pub fn into_nullable(self, nullability: Nullability) -> Self { - let dtype = self.dtype().with_nullability(nullability); - if self.validity().is_some() && nullability == Nullability::NonNullable { - panic!("Cannot convert nullable array to non-nullable array") - } - let len = self.len(); - let validity = if nullability == Nullability::Nullable { - Some( - self.validity() - .to_owned_view() - .unwrap_or_else(|| Validity::Valid(len)), - ) - } else { - None - }; - Self { - buffer: self.buffer, - ptype: self.ptype, - dtype, - validity, - stats: self.stats, - } - } - - #[inline] pub fn ptype(&self) -> PType { - self.ptype + // TODO(ngates): we can't really cache this anywhere? + self.dtype().try_into().unwrap() } - #[inline] pub fn buffer(&self) -> &Buffer { - &self.buffer - } - - #[inline] - pub fn into_buffer(self) -> Buffer { - self.buffer + self.array().buffer().expect("missing buffer") } pub fn scalar_buffer(&self) -> ScalarBuffer { - ScalarBuffer::from(self.buffer().clone()) + ScalarBuffer::new(self.buffer().clone().into(), 0, self.len()) } pub fn typed_data(&self) -> &[T] { - if self.ptype() != T::PTYPE { - panic!( - "Invalid PType! Expected {}, got self.ptype {}", - T::PTYPE, - self.ptype() - ); - } - self.buffer().typed_data() - } - - pub fn patch, T: NativePType>( - mut self, - positions: &[P], - values: &[T], - ) -> VortexResult { - if self.ptype() != T::PTYPE { - vortex_bail!(MismatchedTypes: self.dtype, T::PTYPE) - } - - let mut own_values = self - .buffer - .into_vec::() - .unwrap_or_else(|b| Vec::from(b.typed_data::())); - // TODO(robert): Also patch validity - for (idx, value) in positions.iter().zip_eq(values.iter()) { - own_values[(*idx).as_()] = *value; - } - self.buffer = Buffer::from_vec::(own_values); - Ok(self) - } - - pub(crate) fn as_trait(&self) -> &dyn PrimitiveTrait { - assert_eq!(self.ptype, T::PTYPE); - self + self.buffer().typed_data::() } pub fn reinterpret_cast(&self, ptype: PType) -> Self { @@ -198,204 +84,96 @@ impl PrimitiveArray { "can't reinterpret cast between integers of two different widths" ); - PrimitiveArray::new( - ptype, - self.buffer().clone(), - self.validity().to_owned_view(), - ) - } -} - -impl Array for PrimitiveArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.buffer.len() / self.ptype.byte_width() - } - - #[inline] - fn is_empty(&self) -> bool { - self.buffer.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &PrimitiveEncoding - } - - fn nbytes(&self) -> usize { - self.buffer.len() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - match_each_native_ptype!(self.ptype(), |$P| { - f(&self.as_trait::<$P>()) + match_each_native_ptype!(ptype, |$P| { + PrimitiveArray::try_new( + ScalarBuffer::<$P>::new(self.buffer().clone().into(), 0, self.len()), + self.validity(), + ) + .unwrap() }) } - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - if let Some(v) = self.validity() { - // FIXME(ngates): should validity implement Array? - walker.visit_child(&v.to_array())?; + pub fn patch, T: NativePType>( + self, + positions: &[P], + values: &[T], + ) -> VortexResult { + if self.ptype() != T::PTYPE { + vortex_bail!(MismatchedTypes: self.dtype(), T::PTYPE) } - walker.visit_buffer(self.buffer()) - } -} - -impl OwnedValidity for PrimitiveArray { - fn validity(&self) -> Option { - self.validity.as_view() - } -} -impl PrimitiveTrait for PrimitiveArray { - fn ptype(&self) -> PType { - self.ptype - } - - fn buffer(&self) -> &Buffer { - &self.buffer - } - - fn to_primitive(&self) -> PrimitiveArray { - self.clone() - } -} + let validity = self.validity().to_static(); -impl ArrayAccessor<'_, T> for PrimitiveArray { - fn value(&self, index: usize) -> Option { - if self.is_valid(index) { - Some(self.typed_data::()[index]) - } else { - None + let mut own_values = self + .into_buffer() + .into_vec::() + .unwrap_or_else(|b| Vec::from(b.typed_data::())); + // TODO(robert): Also patch validity + for (idx, value) in positions.iter().zip_eq(values.iter()) { + own_values[(*idx).as_()] = *value; } + Self::try_new(ScalarBuffer::from(own_values), validity) } } -impl PrimitiveArray { - pub fn iter(&self) -> ArrayIter { - ArrayIter::new(self) +impl<'a> PrimitiveArray<'a> { + pub fn into_buffer(self) -> Buffer<'a> { + self.into_array().into_buffer().unwrap() } } -#[derive(Debug)] -pub struct PrimitiveEncoding; - -impl PrimitiveEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.primitive"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_PRIMITIVE: EncodingRef = &PrimitiveEncoding; - -impl Encoding for PrimitiveEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl From> for PrimitiveArray<'_> { + fn from(values: Vec) -> Self { + PrimitiveArray::from_vec(values, Validity::NonNullable) } +} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl IntoArray<'static> for Vec { + fn into_array(self) -> OwnedArray { + PrimitiveArray::from(self).into_array() } } -impl From> for PrimitiveArray { - fn from(values: Vec) -> Self { - Self::from_nullable(values, None) +impl ArrayFlatten for PrimitiveArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + Ok(Flattened::Primitive(self)) } } -impl IntoArray for Vec { - fn into_array(self) -> ArrayRef { - PrimitiveArray::from(self).into_array() +impl ArrayTrait for PrimitiveArray<'_> { + fn len(&self) -> usize { + self.buffer().len() / self.ptype().byte_width() } } -impl FromIterator> for PrimitiveArray { - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (lower, _) = iter.size_hint(); - - let mut validity: Vec = Vec::with_capacity(lower); - let values: Vec = iter - .map(|i| { - validity.push(i.is_some()); - i.unwrap_or_default() - }) - .collect::>(); +impl ArrayValidity for PrimitiveArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) + } - PrimitiveArray::from_nullable( - values, - if !validity.is_empty() { - Some(validity.into()) - } else { - None - }, - ) + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) } } -impl ArrayDisplay for PrimitiveArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - match_each_native_ptype!(self.ptype(), |$P| { - f.property("values", format!("{:?}{}", - &self.buffer().typed_data::<$P>()[..min(10, Array::len(self))], - if Array::len(self) > 10 { "..." } else { "" })) - })?; - f.validity(self.validity()) +impl AcceptArrayVisitor for PrimitiveArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_buffer(self.buffer())?; + visitor.visit_validity(&self.validity()) } } -#[cfg(test)] -mod test { - use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - - use crate::array::primitive::PrimitiveArray; - use crate::array::Array; - use crate::compute::scalar_at::scalar_at; - use crate::compute::slice::slice; - use crate::ptype::PType; - - #[test] - fn from_arrow() { - let arr = PrimitiveArray::from(vec![1, 2, 3]); - assert_eq!(arr.len(), 3); - assert_eq!(arr.ptype, PType::I32); - assert_eq!( - arr.dtype(), - &DType::Int(IntWidth::_32, Signedness::Signed, Nullability::NonNullable) - ); - - // Ensure we can fetch the scalar at the given index. - assert_eq!(scalar_at(&arr, 0).unwrap(), 1.into()); - assert_eq!(scalar_at(&arr, 1).unwrap(), 2.into()); - assert_eq!(scalar_at(&arr, 2).unwrap(), 3.into()); +impl<'a> Array<'a> { + pub fn into_primitive(self) -> PrimitiveArray<'a> { + PrimitiveArray::try_from(self).expect("expected primitive array") } - #[test] - fn slice_array() { - let arr = slice(&PrimitiveArray::from(vec![1, 2, 3, 4, 5]), 1, 4).unwrap(); - assert_eq!(arr.len(), 3); - assert_eq!(scalar_at(&arr, 0).unwrap(), 2.into()); - assert_eq!(scalar_at(&arr, 1).unwrap(), 3.into()); - assert_eq!(scalar_at(&arr, 2).unwrap(), 4.into()); + pub fn as_primitive(&self) -> PrimitiveArray { + PrimitiveArray::try_from(self).expect("expected primitive array") } } + +impl EncodingCompression for PrimitiveEncoding {} diff --git a/vortex-array/src/array/primitive/serde.rs b/vortex-array/src/array/primitive/serde.rs deleted file mode 100644 index 845b690216..0000000000 --- a/vortex-array/src/array/primitive/serde.rs +++ /dev/null @@ -1,59 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding, PrimitiveView}; -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::match_each_native_ptype; -use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; -use crate::validity::OwnedValidity; - -impl ArraySerde for PrimitiveArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.ptype(self.ptype())?; - ctx.write_validity(self.validity())?; - ctx.write_buffer(self.len(), self.buffer()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for PrimitiveEncoding { - fn with_view_compute<'view>( - &self, - view: &'view ArrayView, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - let view = PrimitiveView::try_new(view)?; - match_each_native_ptype!(view.ptype(), |$T| { - f(&view.as_trait::<$T>()) - }) - } - - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let ptype = ctx.ptype()?; - let validity = ctx.read_validity()?; - let (_, buf) = ctx.read_buffer(|len| len * ptype.byte_width())?; - Ok(PrimitiveArray::new(ptype, buf, validity).into_array()) - } -} - -#[cfg(test)] -mod test { - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::primitive::PrimitiveArray; - use crate::serde::test::roundtrip_array; - use crate::validity::ArrayValidity; - - #[test] - fn roundtrip() { - let arr = PrimitiveArray::from_iter(vec![Some(0), None, Some(2), Some(42)]); - let read_arr = roundtrip_array(&arr).unwrap(); - assert_eq!( - arr.buffer().typed_data::(), - read_arr.as_primitive().buffer().typed_data::() - ); - assert_eq!(arr.logical_validity(), read_arr.logical_validity()); - } -} diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs index ecc85713fc..6bade15197 100644 --- a/vortex-array/src/array/primitive/stats.rs +++ b/vortex-array/src/array/primitive/stats.rs @@ -5,41 +5,44 @@ use arrow_buffer::buffer::BooleanBuffer; use vortex_error::VortexResult; use crate::array::primitive::PrimitiveArray; -use crate::array::ArrayValidity; -use crate::compute::flatten::flatten_bool; use crate::match_each_native_ptype; use crate::ptype::NativePType; +use crate::scalar::Scalar; use crate::scalar::{ListScalarVec, PScalar}; -use crate::stats::{Stat, StatsCompute, StatsSet}; -use crate::validity::Validity; +use crate::stats::{ArrayStatisticsCompute, Stat}; +use crate::validity::ArrayValidity; +use crate::validity::LogicalValidity; +use crate::IntoArray; -impl StatsCompute for PrimitiveArray { - fn compute(&self, stat: &Stat) -> VortexResult { +impl ArrayStatisticsCompute for PrimitiveArray<'_> { + fn compute_statistics(&self, stat: Stat) -> VortexResult> { match_each_native_ptype!(self.ptype(), |$P| { match self.logical_validity() { - Validity::Valid(_) => self.typed_data::<$P>().compute(stat), - Validity::Invalid(v) => all_null_stats::<$P>(v), - Validity::Array(a) => { - NullableValues(self.typed_data::<$P>(), flatten_bool(&a)?.buffer()).compute(stat) - } + LogicalValidity::AllValid(_) => self.typed_data::<$P>().compute_statistics(stat), + LogicalValidity::AllInvalid(v) => all_null_stats::<$P>(v), + LogicalValidity::Array(a) => NullableValues( + self.typed_data::<$P>(), + &a.into_array().flatten_bool()?.boolean_buffer(), + ) + .compute_statistics(stat), } }) } } -impl StatsCompute for &[T] { - fn compute(&self, _stat: &Stat) -> VortexResult { +impl ArrayStatisticsCompute for &[T] { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { if self.is_empty() { - return Ok(StatsSet::default()); + return Ok(HashMap::default()); } let mut stats = StatsAccumulator::new(self[0]); self.iter().skip(1).for_each(|next| stats.next(*next)); - Ok(stats.into_set()) + Ok(stats.into_map()) } } -fn all_null_stats(len: usize) -> VortexResult { - Ok(StatsSet::from(HashMap::from([ +fn all_null_stats(len: usize) -> VortexResult> { + Ok(HashMap::from([ (Stat::Min, Option::::None.into()), (Stat::Max, Option::::None.into()), (Stat::IsConstant, true.into()), @@ -55,16 +58,16 @@ fn all_null_stats(len: usize) -> VortexResult { Stat::TrailingZeroFreq, ListScalarVec(vec![size_of::() * 8; size_of::() * 8 + 1]).into(), ), - ]))) + ])) } struct NullableValues<'a, T: NativePType>(&'a [T], &'a BooleanBuffer); -impl<'a, T: NativePType> StatsCompute for NullableValues<'a, T> { - fn compute(&self, _stat: &Stat) -> VortexResult { +impl<'a, T: NativePType> ArrayStatisticsCompute for NullableValues<'a, T> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { let values = self.0; if values.is_empty() { - return Ok(StatsSet::default()); + return Ok(HashMap::default()); } let first_non_null = self @@ -83,7 +86,7 @@ impl<'a, T: NativePType> StatsCompute for NullableValues<'a, T> { .skip(1) .map(|(next, valid)| valid.then_some(*next)) .for_each(|next| stats.nullable_next(next)); - Ok(stats.into_set()) + Ok(stats.into_map()) } } @@ -190,8 +193,8 @@ impl StatsAccumulator { self.prev = next; } - pub fn into_set(self) -> StatsSet { - StatsSet::from(HashMap::from([ + pub fn into_map(self) -> HashMap { + HashMap::from([ (Stat::Min, self.min.into()), (Stat::Max, self.max.into()), (Stat::NullCount, self.null_count.into()), @@ -207,39 +210,35 @@ impl StatsAccumulator { (self.is_sorted && self.is_strict_sorted).into(), ), (Stat::RunCount, self.run_count.into()), - ])) + ]) } } #[cfg(test)] mod test { use crate::array::primitive::PrimitiveArray; - use crate::array::Array; use crate::scalar::ListScalarVec; - use crate::stats::Stat; + use crate::stats::{ArrayStatistics, Stat}; #[test] fn stats() { let arr = PrimitiveArray::from(vec![1, 2, 3, 4, 5]); - let min: i32 = arr.stats().get_or_compute_as(&Stat::Min).unwrap(); - let max: i32 = arr.stats().get_or_compute_as(&Stat::Max).unwrap(); - let is_sorted: bool = arr.stats().get_or_compute_as(&Stat::IsSorted).unwrap(); - let is_strict_sorted: bool = arr - .stats() - .get_or_compute_as(&Stat::IsStrictSorted) - .unwrap(); - let is_constant: bool = arr.stats().get_or_compute_as(&Stat::IsConstant).unwrap(); + let min: i32 = arr.statistics().compute_as(Stat::Min).unwrap(); + let max: i32 = arr.statistics().compute_as(Stat::Max).unwrap(); + let is_sorted: bool = arr.statistics().compute_as(Stat::IsSorted).unwrap(); + let is_strict_sorted: bool = arr.statistics().compute_as(Stat::IsStrictSorted).unwrap(); + let is_constant: bool = arr.statistics().compute_as(Stat::IsConstant).unwrap(); let bit_width_freq: Vec = arr - .stats() - .get_or_compute_as::>(&Stat::BitWidthFreq) + .statistics() + .compute_as::>(Stat::BitWidthFreq) .unwrap() .0; let trailing_zeros_freq: Vec = arr - .stats() - .get_or_compute_as::>(&Stat::TrailingZeroFreq) + .statistics() + .compute_as::>(Stat::TrailingZeroFreq) .unwrap() .0; - let run_count: u64 = arr.stats().get_or_compute_as(&Stat::RunCount).unwrap(); + let run_count: u64 = arr.statistics().compute_as(Stat::RunCount).unwrap(); assert_eq!(min, 1); assert_eq!(max, 5); assert!(is_sorted); @@ -267,26 +266,26 @@ mod test { #[test] fn stats_u8() { let arr = PrimitiveArray::from(vec![1u8, 2, 3, 4, 5]); - let min: u8 = arr.stats().get_or_compute_as(&Stat::Min).unwrap(); - let max: u8 = arr.stats().get_or_compute_as(&Stat::Max).unwrap(); + let min: u8 = arr.statistics().compute_as(Stat::Min).unwrap(); + let max: u8 = arr.statistics().compute_as(Stat::Max).unwrap(); assert_eq!(min, 1); assert_eq!(max, 5); } #[test] fn nullable_stats_u8() { - let arr = PrimitiveArray::from_iter(vec![None, Some(1i32), None, Some(2)]); - let min: Option = arr.stats().get_or_compute_as(&Stat::Min); - let max: Option = arr.stats().get_or_compute_as(&Stat::Max); + let arr = PrimitiveArray::from_nullable_vec(vec![None, Some(1i32), None, Some(2)]); + let min: Option = arr.statistics().compute_as(Stat::Min); + let max: Option = arr.statistics().compute_as(Stat::Max); assert_eq!(min, Some(1)); assert_eq!(max, Some(2)); } #[test] fn all_null() { - let arr = PrimitiveArray::from_iter(vec![Option::::None, None, None]); - let min: Option = arr.stats().get_or_compute_as(&Stat::Min); - let max: Option = arr.stats().get_or_compute_as(&Stat::Max); + let arr = PrimitiveArray::from_nullable_vec(vec![Option::::None, None, None]); + let min: Option = arr.statistics().compute_as(Stat::Min); + let max: Option = arr.statistics().compute_as(Stat::Max); assert_eq!(min, None); assert_eq!(max, None); } diff --git a/vortex-array/src/array/primitive/view.rs b/vortex-array/src/array/primitive/view.rs deleted file mode 100644 index 8f628fd721..0000000000 --- a/vortex-array/src/array/primitive/view.rs +++ /dev/null @@ -1,136 +0,0 @@ -use std::any::Any; -use std::sync::Arc; - -use arrow_buffer::Buffer; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::DType; - -use crate::array::primitive::compute::PrimitiveTrait; -use crate::array::{Array, ArrayRef, PrimitiveArray}; -use crate::compute::ArrayCompute; -use crate::encoding::EncodingRef; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::ptype::{NativePType, PType}; -use crate::serde::ArrayView; -use crate::stats::Stats; -use crate::validity::OwnedValidity; -use crate::validity::{Validity, ValidityView}; -use crate::view::ToOwnedView; -use crate::ArrayWalker; - -#[derive(Debug)] -pub struct PrimitiveView<'a> { - ptype: PType, - buffer: &'a Buffer, - validity: Option>, -} - -impl<'a> PrimitiveView<'a> { - pub fn try_new(view: &'a ArrayView<'a>) -> VortexResult { - // TODO(ngates): validate the number of buffers / children. We could even extract them? - let ptype = PType::try_from(view.dtype())?; - let buffer = view - .buffers() - .first() - .ok_or_else(|| vortex_err!(InvalidSerde: "Missing primitive buffer"))?; - let validity = view.child(0, &Validity::DTYPE).map(ValidityView::from); - - Ok(Self { - ptype, - buffer, - validity, - }) - } - - pub fn ptype(&self) -> PType { - self.ptype - } - - pub(crate) fn as_trait(&self) -> &dyn PrimitiveTrait { - assert_eq!(self.ptype, T::PTYPE); - self - } -} - -impl<'a, T: NativePType> PrimitiveTrait for PrimitiveView<'a> { - fn ptype(&self) -> PType { - self.ptype - } - - fn buffer(&self) -> &Buffer { - self.buffer - } - - fn to_primitive(&self) -> PrimitiveArray { - PrimitiveArray::new( - self.ptype(), - self.buffer.clone(), - self.validity.to_owned_view(), - ) - } -} - -impl<'a> OwnedValidity for PrimitiveView<'a> { - fn validity(&self) -> Option> { - self.validity.clone() - } -} - -impl Array for PrimitiveView<'_> { - fn as_any(&self) -> &dyn Any { - todo!() - } - - fn into_any(self: Arc) -> Arc { - todo!() - } - - fn to_array(&self) -> ArrayRef { - todo!() - } - - fn into_array(self) -> ArrayRef { - todo!() - } - - fn len(&self) -> usize { - todo!() - } - - fn is_empty(&self) -> bool { - todo!() - } - - fn dtype(&self) -> &DType { - todo!() - } - - fn stats(&self) -> Stats { - todo!() - } - - fn encoding(&self) -> EncodingRef { - todo!() - } - - fn nbytes(&self) -> usize { - todo!() - } - - fn with_compute_mut( - &self, - _f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - todo!() - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - todo!() - } -} - -impl ArrayDisplay for PrimitiveView<'_> { - fn fmt(&self, _fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { - todo!() - } -} diff --git a/vortex-array/src/array/sparse/compute/mod.rs b/vortex-array/src/array/sparse/compute/mod.rs index 62f84dc2b9..ecb06566b3 100644 --- a/vortex-array/src/array/sparse/compute/mod.rs +++ b/vortex-array/src/array/sparse/compute/mod.rs @@ -1,34 +1,25 @@ use std::collections::HashMap; -use arrow_buffer::BooleanBufferBuilder; use itertools::Itertools; use vortex_error::{vortex_bail, VortexResult}; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::PrimitiveArray; +use crate::array::primitive::{OwnedPrimitiveArray, PrimitiveArray}; use crate::array::sparse::SparseArray; -use crate::array::{Array, ArrayRef}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::flatten::{flatten_primitive, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::slice::SliceFn; use crate::compute::take::{take, TakeFn}; use crate::compute::ArrayCompute; -use crate::ptype::NativePType; use crate::scalar::Scalar; -use crate::{match_each_integer_ptype, match_each_native_ptype}; +use crate::{match_each_integer_ptype, Array, ArrayDType, ArrayTrait, IntoArray, OwnedArray}; mod slice; -impl ArrayCompute for SparseArray { +impl ArrayCompute for SparseArray<'_> { fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -42,98 +33,39 @@ impl ArrayCompute for SparseArray { } } -impl AsContiguousFn for SparseArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { - let all_fill_types_are_equal = arrays +impl AsContiguousFn for SparseArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { + let sparse = arrays .iter() - .map(|a| a.as_sparse().fill_value()) - .all_equal(); - if !all_fill_types_are_equal { + .map(|a| SparseArray::try_from(a).unwrap()) + .collect_vec(); + + if !sparse.iter().map(|a| a.fill_value()).all_equal() { vortex_bail!("Cannot concatenate SparseArrays with differing fill values"); } Ok(SparseArray::new( - as_contiguous( - &arrays - .iter() - .map(|a| a.as_sparse().indices()) - .cloned() - .collect_vec(), - )?, - as_contiguous( - &arrays - .iter() - .map(|a| a.as_sparse().values()) - .cloned() - .collect_vec(), - )?, - arrays.iter().map(|a| a.len()).sum(), + as_contiguous(&sparse.iter().map(|a| a.indices()).collect_vec())?, + as_contiguous(&sparse.iter().map(|a| a.values()).collect_vec())?, + sparse.iter().map(|a| a.len()).sum(), self.fill_value().clone(), ) .into_array()) } } -impl FlattenFn for SparseArray { - fn flatten(&self) -> VortexResult { - // Resolve our indices into a vector of usize applying the offset - let indices = self.resolved_indices(); - - let mut validity = BooleanBufferBuilder::new(self.len()); - validity.append_n(self.len(), false); - let values = flatten_primitive(self.values())?; - match_each_native_ptype!(values.ptype(), |$P| { - flatten_sparse_values( - values.typed_data::<$P>(), - &indices, - self.len(), - self.fill_value(), - validity - ) - }) - } -} - -fn flatten_sparse_values( - values: &[T], - indices: &[usize], - len: usize, - fill_value: &Scalar, - mut validity: BooleanBufferBuilder, -) -> VortexResult { - let primitive_fill = if fill_value.is_null() { - T::default() - } else { - fill_value.try_into()? - }; - let mut result = vec![primitive_fill; len]; - - for (v, idx) in values.iter().zip_eq(indices) { - result[*idx] = *v; - validity.set_bit(*idx, true); - } - - let validity = validity.finish(); - let array = if fill_value.is_null() { - PrimitiveArray::from_nullable(result, Some(validity.into())) - } else { - PrimitiveArray::from(result) - }; - Ok(FlattenedArray::Primitive(array)) -} - -impl ScalarAtFn for SparseArray { +impl ScalarAtFn for SparseArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { match self.find_index(index)? { None => self.fill_value().clone().cast(self.dtype()), - Some(idx) => scalar_at(self.values(), idx)?.cast(self.dtype()), + Some(idx) => scalar_at(&self.values(), idx)?.cast(self.dtype()), } } } -impl TakeFn for SparseArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - let flat_indices = flatten_primitive(indices)?; +impl TakeFn for SparseArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + let flat_indices = indices.clone().flatten_primitive()?; // if we are taking a lot of values we should build a hashmap let (positions, physical_take_indices) = if indices.len() > 128 { take_map(self, &flat_indices)? @@ -141,7 +73,7 @@ impl TakeFn for SparseArray { take_search_sorted(self, &flat_indices)? }; - let taken_values = take(self.values(), &physical_take_indices)?; + let taken_values = take(&self.values(), &physical_take_indices.into_array())?; Ok(SparseArray::new( positions.into_array(), @@ -156,7 +88,7 @@ impl TakeFn for SparseArray { fn take_map( array: &SparseArray, indices: &PrimitiveArray, -) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { +) -> VortexResult<(OwnedPrimitiveArray, OwnedPrimitiveArray)> { let indices_map: HashMap = array .resolved_indices() .iter() @@ -180,7 +112,7 @@ fn take_map( fn take_search_sorted( array: &SparseArray, indices: &PrimitiveArray, -) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { +) -> VortexResult<(OwnedPrimitiveArray, OwnedPrimitiveArray)> { let resolved = match_each_integer_ptype!(indices.ptype(), |$P| { indices .typed_data::<$P>() @@ -207,43 +139,39 @@ mod test { use itertools::Itertools; use vortex_schema::{DType, FloatWidth, Nullability}; - use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::compute::take_map; use crate::array::sparse::SparseArray; - use crate::array::Array; use crate::compute::as_contiguous::as_contiguous; use crate::compute::slice::slice; use crate::compute::take::take; use crate::scalar::Scalar; + use crate::validity::Validity; + use crate::{ArrayTrait, IntoArray, OwnedArray}; - fn sparse_array() -> SparseArray { + fn sparse_array() -> OwnedArray { SparseArray::new( PrimitiveArray::from(vec![0u64, 37, 47, 99]).into_array(), - PrimitiveArray::from(vec![1.23f64, 0.47, 9.99, 3.5]).into_array(), + PrimitiveArray::from_vec(vec![1.23f64, 0.47, 9.99, 3.5], Validity::AllValid) + .into_array(), 100, Scalar::null(&DType::Float(FloatWidth::_64, Nullability::Nullable)), ) + .into_array() } #[test] fn sparse_take() { let sparse = sparse_array(); - let taken = take(&sparse, &PrimitiveArray::from(vec![0, 47, 47, 0, 99])).unwrap(); + let taken = + SparseArray::try_from(take(&sparse, &vec![0, 47, 47, 0, 99].into_array()).unwrap()) + .unwrap(); assert_eq!( - taken - .as_sparse() - .indices() - .as_primitive() - .typed_data::(), + taken.indices().into_primitive().typed_data::(), [0, 1, 2, 3, 4] ); assert_eq!( - taken - .as_sparse() - .values() - .as_primitive() - .typed_data::(), + taken.values().into_primitive().typed_data::(), [1.23f64, 9.99, 9.99, 1.23, 3.5] ); } @@ -251,43 +179,27 @@ mod test { #[test] fn nonexistent_take() { let sparse = sparse_array(); - let taken = take(&sparse, &PrimitiveArray::from(vec![69])).unwrap(); - assert_eq!( - taken - .as_sparse() - .indices() - .as_primitive() - .typed_data::(), - [] - ); - assert_eq!( - taken - .as_sparse() - .values() - .as_primitive() - .typed_data::(), - [] - ); + let taken = SparseArray::try_from(take(&sparse, &vec![69].into_array()).unwrap()).unwrap(); + assert!(taken + .indices() + .into_primitive() + .typed_data::() + .is_empty()); + assert!(taken + .values() + .into_primitive() + .typed_data::() + .is_empty()); } #[test] fn ordered_take() { let sparse = sparse_array(); - let taken = take(&sparse, &PrimitiveArray::from(vec![69, 37])).unwrap(); - assert_eq!( - taken - .as_sparse() - .indices() - .as_primitive() - .typed_data::(), - [1] - ); + let taken = + SparseArray::try_from(take(&sparse, &vec![69, 37].into_array()).unwrap()).unwrap(); + assert_eq!(taken.indices().into_primitive().typed_data::(), [1]); assert_eq!( - taken - .as_sparse() - .values() - .as_primitive() - .typed_data::(), + taken.values().into_primitive().typed_data::(), [0.47f64] ); assert_eq!(taken.len(), 2); @@ -296,49 +208,44 @@ mod test { #[test] fn take_slices_and_reassemble() { let sparse = sparse_array(); - let indices: PrimitiveArray = (0u64..10).collect_vec().into(); let slices = (0..10) .map(|i| slice(&sparse, i * 10, (i + 1) * 10).unwrap()) .collect_vec(); let taken = slices .iter() - .map(|s| take(s, &indices).unwrap()) + .map(|s| take(s, &(0u64..10).collect_vec().into_array()).unwrap()) .collect_vec(); for i in [1, 2, 5, 6, 7, 8] { - assert_eq!(taken[i].as_sparse().indices().len(), 0); + assert_eq!(SparseArray::try_from(&taken[i]).unwrap().indices().len(), 0); } for i in [0, 3, 4, 9] { - assert_eq!(taken[i].as_sparse().indices().len(), 1); + assert_eq!(SparseArray::try_from(&taken[i]).unwrap().indices().len(), 1); } - let contiguous = as_contiguous(&taken).unwrap(); + let contiguous = SparseArray::try_from(as_contiguous(&taken).unwrap()).unwrap(); assert_eq!( - contiguous - .as_sparse() - .indices() - .as_primitive() - .typed_data::(), + contiguous.indices().into_primitive().typed_data::(), [0u64, 7, 7, 9] // relative offsets ); assert_eq!( - contiguous - .as_sparse() + contiguous.values().into_primitive().typed_data::(), + SparseArray::try_from(sparse) + .unwrap() .values() - .as_primitive() - .typed_data::(), - sparse.values().as_primitive().typed_data() + .into_primitive() + .typed_data::() ); } #[test] fn test_take_map() { - let sparse = sparse_array(); + let sparse = SparseArray::try_from(sparse_array()).unwrap(); let indices = PrimitiveArray::from((0u64..100).collect_vec()); let (positions, patch_indices) = take_map(&sparse, &indices).unwrap(); assert_eq!( positions.typed_data::(), - sparse.indices().as_primitive().typed_data() + sparse.indices().into_primitive().typed_data::() ); assert_eq!(patch_indices.typed_data::(), [0u64, 1, 2, 3]); } diff --git a/vortex-array/src/array/sparse/compute/slice.rs b/vortex-array/src/array/sparse/compute/slice.rs index 29c2ee43a0..7c469fe568 100644 --- a/vortex-array/src/array/sparse/compute/slice.rs +++ b/vortex-array/src/array/sparse/compute/slice.rs @@ -1,21 +1,21 @@ use vortex_error::VortexResult; use crate::array::sparse::SparseArray; -use crate::array::{Array, ArrayRef}; use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; use crate::compute::slice::{slice, SliceFn}; +use crate::{IntoArray, OwnedArray}; -impl SliceFn for SparseArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for SparseArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { // Find the index of the first patch index that is greater than or equal to the offset of this array let index_start_index = - search_sorted(self.indices(), start, SearchSortedSide::Left)?.to_index(); + search_sorted(&self.indices(), start, SearchSortedSide::Left)?.to_index(); let index_end_index = - search_sorted(self.indices(), stop, SearchSortedSide::Left)?.to_index(); + search_sorted(&self.indices(), stop, SearchSortedSide::Left)?.to_index(); Ok(SparseArray::try_new_with_offset( - slice(self.indices(), index_start_index, index_end_index)?, - slice(self.values(), index_start_index, index_end_index)?, + slice(&self.indices(), index_start_index, index_end_index)?, + slice(&self.values(), index_start_index, index_end_index)?, stop - start, self.indices_offset() + start, self.fill_value().clone(), diff --git a/vortex-array/src/array/sparse/flatten.rs b/vortex-array/src/array/sparse/flatten.rs new file mode 100644 index 0000000000..5273418ed6 --- /dev/null +++ b/vortex-array/src/array/sparse/flatten.rs @@ -0,0 +1,61 @@ +use arrow_buffer::BooleanBufferBuilder; +use itertools::Itertools; +use vortex_error::VortexResult; + +use crate::array::primitive::PrimitiveArray; +use crate::array::sparse::SparseArray; +use crate::ptype::NativePType; +use crate::scalar::Scalar; +use crate::validity::Validity; +use crate::{match_each_native_ptype, ArrayFlatten, ArrayTrait, Flattened}; + +impl ArrayFlatten for SparseArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + // Resolve our indices into a vector of usize applying the offset + let indices = self.resolved_indices(); + + let mut validity = BooleanBufferBuilder::new(self.len()); + validity.append_n(self.len(), false); + let values = self.values().flatten_primitive()?; + match_each_native_ptype!(values.ptype(), |$P| { + flatten_sparse_values( + values.typed_data::<$P>(), + &indices, + self.len(), + self.fill_value(), + validity + ) + }) + } +} + +fn flatten_sparse_values( + values: &[T], + indices: &[usize], + len: usize, + fill_value: &Scalar, + mut validity: BooleanBufferBuilder, +) -> VortexResult> { + let primitive_fill = if fill_value.is_null() { + T::default() + } else { + fill_value.try_into()? + }; + let mut result = vec![primitive_fill; len]; + + for (v, idx) in values.iter().zip_eq(indices) { + result[*idx] = *v; + validity.set_bit(*idx, true); + } + + let validity = validity.finish(); + let array = if fill_value.is_null() { + PrimitiveArray::from_vec(result, Validity::from(validity)) + } else { + PrimitiveArray::from(result) + }; + Ok(Flattened::Primitive(array)) +} diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index 596b596f49..4a0e68ba9b 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -1,47 +1,36 @@ -use std::sync::{Arc, RwLock}; - -use linkme::distributed_slice; +use ::serde::{Deserialize, Serialize}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; use crate::array::constant::ConstantArray; -use crate::array::{Array, ArrayRef}; -use crate::compress::EncodingCompression; -use crate::compute::flatten::flatten_primitive; use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::scalar::Scalar; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsCompute, StatsSet}; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::{impl_array, match_each_integer_ptype, ArrayWalker}; - -mod compress; +use crate::stats::ArrayStatisticsCompute; +use crate::validity::{ArrayValidity, LogicalValidity}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{impl_encoding, match_each_integer_ptype, ArrayDType, IntoArrayData, ToArrayData}; + +// mod compress; mod compute; -mod serde; +mod flatten; -#[derive(Debug, Clone)] -pub struct SparseArray { - indices: ArrayRef, - values: ArrayRef, +impl_encoding!("vortex.sparse", Sparse); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SparseMetadata { + indices_dtype: DType, // Offset value for patch indices as a result of slicing indices_offset: usize, len: usize, - stats: Arc>, fill_value: Scalar, } -impl SparseArray { - pub fn new(indices: ArrayRef, values: ArrayRef, len: usize, fill_value: Scalar) -> Self { +impl<'a> SparseArray<'a> { + pub fn new(indices: Array<'a>, values: Array<'a>, len: usize, fill_value: Scalar) -> Self { Self::try_new(indices, values, len, fill_value).unwrap() } pub fn try_new( - indices: ArrayRef, - values: ArrayRef, + indices: Array<'a>, + values: Array<'a>, len: usize, fill_value: Scalar, ) -> VortexResult { @@ -49,8 +38,8 @@ impl SparseArray { } pub(crate) fn try_new_with_offset( - indices: ArrayRef, - values: ArrayRef, + indices: Array<'a>, + values: Array<'a>, len: usize, indices_offset: usize, fill_value: Scalar, @@ -58,42 +47,58 @@ impl SparseArray { if !matches!(indices.dtype(), &DType::IDX) { vortex_bail!("Cannot use {} as indices", indices.dtype()); } + if values.dtype() != fill_value.dtype() { + vortex_bail!( + "Mismatched fill value dtype {} and values dtype {}", + fill_value.dtype(), + values.dtype(), + ); + } - Ok(Self { - indices, - values, - indices_offset, - len, - stats: Arc::new(RwLock::new(StatsSet::new())), - fill_value, - }) + Self::try_from_parts( + values.dtype().clone(), + SparseMetadata { + indices_dtype: indices.dtype().clone(), + indices_offset, + len, + fill_value, + }, + vec![indices.to_array_data(), values.to_array_data()].into(), + HashMap::default(), + ) } +} +impl SparseArray<'_> { #[inline] pub fn indices_offset(&self) -> usize { - self.indices_offset + self.metadata().indices_offset } #[inline] - pub fn values(&self) -> &ArrayRef { - &self.values + pub fn values(&self) -> Array { + self.array() + .child(1, self.dtype()) + .expect("missing child array") } #[inline] - pub fn indices(&self) -> &ArrayRef { - &self.indices + pub fn indices(&self) -> Array { + self.array() + .child(0, &self.metadata().indices_dtype) + .expect("missing indices array") } #[inline] fn fill_value(&self) -> &Scalar { - &self.fill_value + &self.metadata().fill_value } /// Returns the position of a given index in the indices array if it exists. pub fn find_index(&self, index: usize) -> VortexResult> { search_sorted( - self.indices(), - self.indices_offset + index, + &self.indices(), + self.indices_offset() + index, SearchSortedSide::Left, ) .map(|r| r.to_found()) @@ -101,139 +106,70 @@ impl SparseArray { /// Return indices as a vector of usize with the indices_offset applied. pub fn resolved_indices(&self) -> Vec { - let flat_indices = flatten_primitive(self.indices()).unwrap(); + let flat_indices = self.indices().flatten_primitive().unwrap(); match_each_integer_ptype!(flat_indices.ptype(), |$P| { flat_indices .typed_data::<$P>() .iter() - .map(|v| (*v as usize) - self.indices_offset) + .map(|v| (*v as usize) - self.indices_offset()) .collect::>() }) } } -impl Array for SparseArray { - impl_array!(); - - #[inline] +impl ArrayTrait for SparseArray<'_> { fn len(&self) -> usize { - self.len - } - - #[inline] - fn is_empty(&self) -> bool { - self.indices.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - self.values().dtype() - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &SparseEncoding - } - - fn nbytes(&self) -> usize { - self.indices.nbytes() + self.values.nbytes() + self.metadata().len } +} - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) +impl AcceptArrayVisitor for SparseArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("indices", &self.indices())?; + visitor.visit_child("values", &self.values()) } +} - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } +impl ArrayStatisticsCompute for SparseArray<'_> {} - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.indices())?; - walker.visit_child(self.values()) +impl ArrayValidity for SparseArray<'_> { + fn is_valid(&self, index: usize) -> bool { + match self.find_index(index).unwrap() { + None => !self.fill_value().is_null(), + Some(idx) => self.values().with_dyn(|a| a.is_valid(idx)), + } } -} - -impl StatsCompute for SparseArray {} -impl ArrayValidity for SparseArray { - fn logical_validity(&self) -> Validity { + fn logical_validity(&self) -> LogicalValidity { let validity = if self.fill_value().is_null() { // If we have a null fill value, then the result is a Sparse array with a fill_value // of true, and patch values of false. SparseArray::try_new_with_offset( - self.indices.clone(), + self.indices(), ConstantArray::new(false, self.len()).into_array(), self.len(), - self.indices_offset, + self.indices_offset(), true.into(), ) } else { // If the fill_value is non-null, then the validity is based on the validity of the // existing values. SparseArray::try_new_with_offset( - self.indices.clone(), + self.indices(), self.values() - .logical_validity() - .to_bool_array() - .into_array(), + .with_dyn(|a| a.logical_validity().into_array()), self.len(), - self.indices_offset, + self.indices_offset(), true.into(), ) } .unwrap(); - Validity::Array(validity.into_array()) - } - - fn is_valid(&self, index: usize) -> bool { - match self.find_index(index).unwrap() { - None => !self.fill_value().is_null(), - Some(idx) => self.values().is_valid(idx), - } - } -} - -impl ArrayDisplay for SparseArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("offset", self.indices_offset())?; - f.child("indices", self.indices())?; - f.child("values", self.values()) + LogicalValidity::Array(validity.into_array_data()) } } -#[derive(Debug)] -pub struct SparseEncoding; - -impl SparseEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.sparse"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_SPARSE: EncodingRef = &SparseEncoding; - -impl Encoding for SparseEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} +impl EncodingCompression for SparseEncoding {} #[cfg(test)] mod test { @@ -243,37 +179,38 @@ mod test { use vortex_schema::Signedness::Signed; use vortex_schema::{DType, IntWidth}; + use crate::accessor::ArrayAccessor; use crate::array::sparse::SparseArray; - use crate::array::Array; - use crate::array::IntoArray; - use crate::compute::flatten::flatten_primitive; + use crate::compute::cast::cast; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; use crate::scalar::Scalar; + use crate::{Array, IntoArray, OwnedArray}; fn nullable_fill() -> Scalar { Scalar::null(&DType::Int(IntWidth::_32, Signed, Nullable)) } + + #[allow(dead_code)] fn non_nullable_fill() -> Scalar { Scalar::from(42i32) } - fn sparse_array(fill_value: Scalar) -> SparseArray { + fn sparse_array(fill_value: Scalar) -> OwnedArray { // merged array: [null, null, 100, null, null, 200, null, null, 300, null] - SparseArray::new( - vec![2u64, 5, 8].into_array(), - vec![100i32, 200, 300].into_array(), - 10, - fill_value, - ) + let mut values = vec![100i32, 200, 300].into_array(); + values = cast(&values, fill_value.dtype()).unwrap(); + + SparseArray::new(vec![2u64, 5, 8].into_array(), values, 10, fill_value).into_array() } - fn assert_sparse_array(sparse: &dyn Array, values: &[Option]) { - let sparse_arrow = flatten_primitive(sparse) - .unwrap() - .iter::() - .collect_vec(); - assert_eq!(sparse_arrow, values); + fn assert_sparse_array(sparse: &Array, values: &[Option]) { + let sparse_arrow = ArrayAccessor::::with_iterator( + &sparse.clone().flatten_primitive().unwrap(), + |iter| iter.map(|v| v.cloned()).collect_vec(), + ) + .unwrap(); + assert_eq!(&sparse_arrow, values); } #[test] @@ -327,7 +264,7 @@ mod test { #[test] pub fn test_find_index() { - let sparse = sparse_array(nullable_fill()); + let sparse = SparseArray::try_from(sparse_array(nullable_fill())).unwrap(); assert_eq!(sparse.find_index(0).unwrap(), None); assert_eq!(sparse.find_index(2).unwrap(), Some(0)); assert_eq!(sparse.find_index(5).unwrap(), Some(1)); diff --git a/vortex-array/src/array/sparse/serde.rs b/vortex-array/src/array/sparse/serde.rs deleted file mode 100644 index 125eb2ca7d..0000000000 --- a/vortex-array/src/array/sparse/serde.rs +++ /dev/null @@ -1,90 +0,0 @@ -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::array::sparse::{SparseArray, SparseEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::scalar::Scalar; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for SparseArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - // TODO(robert): Rewrite indices and don't store offset - ctx.write_usize(self.indices_offset())?; - ctx.write(self.indices())?; - ctx.write(self.values()) - } - - fn metadata(&self) -> VortexResult>> { - // FIXME(ngates): use flatbuffer / serde. - let mut vec = Vec::new(); - let mut ctx = WriteCtx::new(&mut vec); - ctx.write_usize(self.len())?; - // TODO(robert): Rewrite indices and don't store offset - ctx.write_usize(self.indices_offset())?; - Ok(Some(vec)) - } -} - -impl EncodingSerde for SparseEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let offset = ctx.read_usize()?; - let indices = ctx.with_schema(&DType::IDX).read()?; - let values = ctx.read()?; - let fill_type = values.dtype().clone().as_nullable(); - SparseArray::try_new_with_offset( - indices, - values, - len, - offset, - // NB: We should deserialize the fill value from the source, but currently do not, - // so everything that goes through this read path is nullable - Scalar::null(&fill_type), - ) - .map(|a| a.into_array()) - } -} - -#[cfg(test)] -mod test { - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::primitive::PrimitiveArray; - use crate::array::sparse::SparseArray; - use crate::array::Array; - use crate::array::IntoArray; - use crate::scalar::{NullScalar, Scalar}; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = SparseArray::new( - vec![7u64, 37, 71, 97].into_array(), - PrimitiveArray::from_iter(vec![Some(0), None, Some(2), Some(42)]).into_array(), - 100, - Scalar::Null(NullScalar::new()), - ); - - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!( - arr.indices().as_primitive().buffer().typed_data::(), - read_arr - .as_sparse() - .indices() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!( - arr.values().as_primitive().buffer().typed_data::(), - read_arr - .as_sparse() - .values() - .as_primitive() - .buffer() - .typed_data::() - ); - } -} diff --git a/vortex-array/src/array/struct_/compress.rs b/vortex-array/src/array/struct/compress.rs similarity index 100% rename from vortex-array/src/array/struct_/compress.rs rename to vortex-array/src/array/struct/compress.rs diff --git a/vortex-array/src/array/struct_/compute.rs b/vortex-array/src/array/struct/compute.rs similarity index 53% rename from vortex-array/src/array/struct_/compute.rs rename to vortex-array/src/array/struct/compute.rs index 6f1efbe5c8..e5233ada6a 100644 --- a/vortex-array/src/array/struct_/compute.rs +++ b/vortex-array/src/array/struct/compute.rs @@ -7,19 +7,18 @@ use arrow_schema::{Field, Fields}; use itertools::Itertools; use vortex_error::VortexResult; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::struct_::StructArray; -use crate::array::{Array, ArrayRef}; +use crate::array::r#struct::StructArray; use crate::compute::as_arrow::{as_arrow, AsArrowArray}; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::flatten::{flatten, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; use crate::compute::slice::{slice, SliceFn}; use crate::compute::take::{take, TakeFn}; use crate::compute::ArrayCompute; use crate::scalar::{Scalar, StructScalar}; +use crate::ArrayTrait; +use crate::{Array, ArrayDType, IntoArray, OwnedArray}; -impl ArrayCompute for StructArray { +impl ArrayCompute for StructArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } @@ -28,10 +27,6 @@ impl ArrayCompute for StructArray { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -45,13 +40,10 @@ impl ArrayCompute for StructArray { } } -impl AsArrowArray for StructArray { +impl AsArrowArray for StructArray<'_> { fn as_arrow(&self) -> VortexResult { - let field_arrays: Vec = self - .fields() - .iter() - .map(|f| as_arrow(f.as_ref())) - .try_collect()?; + let field_arrays: Vec = + self.children().map(|f| as_arrow(&f)).try_collect()?; let arrow_fields: Fields = self .names() @@ -62,7 +54,7 @@ impl AsArrowArray for StructArray { Field::new( name.as_str(), arrow_field.data_type().clone(), - vortex_field.dtype().is_nullable(), + vortex_field.is_nullable(), ) }) .map(Arc::new) @@ -76,74 +68,62 @@ impl AsArrowArray for StructArray { } } -impl AsContiguousFn for StructArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { +impl AsContiguousFn for StructArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { + let struct_arrays = arrays + .iter() + .map(StructArray::try_from) + .collect::>>()?; let mut fields = vec![Vec::new(); self.fields().len()]; - for array in arrays { + for array in struct_arrays.iter() { for f in 0..self.fields().len() { - fields[f].push(array.as_struct().fields()[f].clone()) + fields[f].push(array.child(f).unwrap()) } } - Ok(StructArray::new( + StructArray::try_new( self.names().clone(), fields .iter() .map(|field_arrays| as_contiguous(field_arrays)) .try_collect()?, - self.len, + self.len(), ) - .into_array()) - } -} - -impl FlattenFn for StructArray { - fn flatten(&self) -> VortexResult { - Ok(FlattenedArray::Struct(StructArray::new( - self.names().clone(), - self.fields() - .iter() - .map(|field| flatten(field.as_ref()).map(FlattenedArray::into_array)) - .try_collect()?, - self.len, - ))) + .map(|a| a.into_array()) } } -impl ScalarAtFn for StructArray { +impl ScalarAtFn for StructArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { Ok(StructScalar::new( - self.dtype.clone(), - self.fields - .iter() - .map(|field| scalar_at(field.as_ref(), index)) + self.dtype().clone(), + self.children() + .map(|field| scalar_at(&field, index)) .try_collect()?, ) .into()) } } -impl TakeFn for StructArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - Ok(StructArray::new( +impl TakeFn for StructArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + StructArray::try_new( self.names().clone(), - self.fields() - .iter() - .map(|field| take(field, indices)) + self.children() + .map(|field| take(&field, indices)) .try_collect()?, indices.len(), ) - .into_array()) + .map(|a| a.into_array()) } } -impl SliceFn for StructArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for StructArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { let fields = self - .fields - .iter() - .map(|field| slice(field, start, stop)) + .children() + .map(|field| slice(&field, start, stop)) .try_collect()?; - Ok(StructArray::new(self.names().clone(), fields, stop - start).into_array()) + StructArray::try_new(self.names().clone(), fields, stop - start).map(|a| a.into_array()) } } diff --git a/vortex-array2/src/array/struct/mod.rs b/vortex-array/src/array/struct/mod.rs similarity index 70% rename from vortex-array2/src/array/struct/mod.rs rename to vortex-array/src/array/struct/mod.rs index e17df95edf..d5261f6119 100644 --- a/vortex-array2/src/array/struct/mod.rs +++ b/vortex-array/src/array/struct/mod.rs @@ -1,16 +1,14 @@ -use std::collections::HashMap; - use serde::{Deserialize, Serialize}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, FieldNames}; +use vortex_schema::FieldNames; -use crate::compute::ArrayCompute; use crate::stats::ArrayStatisticsCompute; use crate::validity::{ArrayValidity, LogicalValidity}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::ArrayData; -use crate::ArrayFlatten; -use crate::{impl_encoding, ToArray}; +use crate::{impl_encoding, ArrayDType}; +use crate::{ArrayFlatten, IntoArrayData}; + +mod compute; impl_encoding!("vortex.struct", Struct); @@ -47,16 +45,19 @@ impl StructArray<'_> { } } +impl<'a> StructArray<'a> { + pub fn children(&'a self) -> impl Iterator> { + (0..self.nfields()).map(move |idx| self.child(idx).unwrap()) + } +} + impl StructArray<'_> { - pub fn try_new(names: FieldNames, fields: Vec, length: usize) -> VortexResult { + pub fn try_new(names: FieldNames, fields: Vec, length: usize) -> VortexResult { if names.len() != fields.len() { vortex_bail!("Got {} names and {} fields", names.len(), fields.len()); } - if fields - .iter() - .any(|a| a.to_array().with_dyn(|a| a.len()) != length) - { + if fields.iter().any(|a| a.with_dyn(|a| a.len()) != length) { vortex_bail!("Expected all struct fields to have length {}", length); } @@ -64,8 +65,7 @@ impl StructArray<'_> { Self::try_from_parts( DType::Struct(names, field_dtypes), StructMetadata { length }, - vec![].into(), - fields.into(), + fields.into_iter().map(|a| a.into_array_data()).collect(), HashMap::default(), ) } @@ -76,7 +76,18 @@ impl ArrayFlatten for StructArray<'_> { where Self: 'a, { - todo!() + Ok(Flattened::Struct(StructArray::try_new( + self.names().clone(), + (0..self.nfields()) + .map(|i| { + self.child(i) + .expect("Missing child") + .flatten() + .map(|f| f.into_array()) + }) + .collect::>>()?, + self.len(), + )?)) } } @@ -107,4 +118,5 @@ impl AcceptArrayVisitor for StructArray<'_> { } impl ArrayStatisticsCompute for StructArray<'_> {} -impl ArrayCompute for StructArray<'_> {} + +impl EncodingCompression for StructEncoding {} diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs deleted file mode 100644 index 3ec2b8e9d6..0000000000 --- a/vortex-array/src/array/struct_/mod.rs +++ /dev/null @@ -1,164 +0,0 @@ -use std::sync::{Arc, RwLock}; - -use linkme::distributed_slice; -use vortex_error::VortexResult; -use vortex_schema::{DType, FieldNames}; - -use super::{Array, ArrayRef}; -use crate::compress::EncodingCompression; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsCompute, StatsSet}; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::{impl_array, ArrayWalker}; - -mod compress; -mod compute; -mod serde; - -#[derive(Debug, Clone)] -pub struct StructArray { - fields: Vec, - dtype: DType, - len: usize, - stats: Arc>, -} - -impl StructArray { - pub fn new(names: FieldNames, fields: Vec, len: usize) -> Self { - assert!( - fields.iter().all(|v| v.len() == len), - "Fields didn't have the same length" - ); - let dtype = DType::Struct(names, fields.iter().map(|a| a.dtype().clone()).collect()); - Self { - fields, - dtype, - len, - stats: Arc::new(RwLock::new(StatsSet::new())), - } - } - - #[inline] - pub fn fields(&self) -> &[ArrayRef] { - &self.fields - } - - pub fn names(&self) -> &FieldNames { - if let DType::Struct(names, _fields) = self.dtype() { - names - } else { - panic!("dtype is not a struct") - } - } - - pub fn field_dtypes(&self) -> &[DType] { - if let DType::Struct(_names, fields) = self.dtype() { - fields - } else { - panic!("dtype is not a struct") - } - } -} - -impl Array for StructArray { - impl_array!(); - - fn len(&self) -> usize { - self.len - } - - #[inline] - fn is_empty(&self) -> bool { - self.len() == 0 - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &StructEncoding - } - - fn nbytes(&self) -> usize { - self.fields.iter().map(|arr| arr.nbytes()).sum() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - for field in self.fields() { - walker.visit_child(field)?; - } - Ok(()) - } -} - -impl StatsCompute for StructArray {} - -impl ArrayValidity for StructArray { - fn logical_validity(&self) -> Validity { - todo!() - } - - fn is_valid(&self, _index: usize) -> bool { - todo!() - } -} - -#[derive(Debug)] -pub struct StructEncoding; - -impl StructEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.struct"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_STRUCT: EncodingRef = &StructEncoding; - -impl Encoding for StructEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} - -impl ArrayDisplay for StructArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - let DType::Struct(n, _) = self.dtype() else { - unreachable!() - }; - for (name, field) in n.iter().zip(self.fields()) { - f.child(&format!("\"{}\"", name), field.as_ref())?; - } - Ok(()) - } -} diff --git a/vortex-array/src/array/struct_/serde.rs b/vortex-array/src/array/struct_/serde.rs deleted file mode 100644 index 6912f9603f..0000000000 --- a/vortex-array/src/array/struct_/serde.rs +++ /dev/null @@ -1,106 +0,0 @@ -use itertools::Itertools; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; - -use crate::array::struct_::{StructArray, StructEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; - -impl ArraySerde for StructArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - ctx.write_usize(self.fields().len())?; - for f in self.fields() { - ctx.write(f.as_ref())?; - } - Ok(()) - } - - fn metadata(&self) -> VortexResult>> { - let length = self.len() as u64; - Ok(Some(length.to_le_bytes().to_vec())) - } -} - -impl EncodingSerde for StructEncoding { - fn to_array(&self, view: &ArrayView) -> ArrayRef { - let DType::Struct(names, fields) = view.dtype() else { - panic!("Incorrect DType {}", view.dtype()) - }; - assert_eq!(fields.len(), view.nchildren()); - StructArray::new( - names.clone(), - fields - .iter() - .enumerate() - .map(|(i, field)| view.child(i, field).unwrap().into_array()) - .collect_vec(), - self.len(view), - ) - .into_array() - } - - fn len(&self, view: &ArrayView) -> usize { - let length = u64::from_le_bytes(view.metadata().unwrap().try_into().unwrap()); - length as usize - } - - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let num_fields = ctx.read_usize()?; - let mut fields = Vec::::with_capacity(num_fields); - // TODO(robert): use read_vectored - for i in 0..num_fields { - fields.push(ctx.subfield(i).read()?); - } - let DType::Struct(names, _) = ctx.schema() else { - vortex_bail!(MismatchedTypes: "any struct", ctx.schema()); - }; - Ok(StructArray::new(names.clone(), fields, len).into_array()) - } -} - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::primitive::PrimitiveArray; - use crate::array::struct_::StructArray; - use crate::array::Array; - use crate::array::IntoArray; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = StructArray::new( - vec![ - Arc::new("primes".to_string()), - Arc::new("nullable".to_string()), - ], - vec![ - vec![7u8, 37, 71, 97].into_array(), - PrimitiveArray::from_iter(vec![Some(0), None, Some(2), Some(42)]).into_array(), - ], - 4, - ); - - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!( - arr.fields()[0].as_primitive().buffer().typed_data::(), - read_arr.as_struct().fields()[0] - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!( - arr.fields()[1].as_primitive().buffer().typed_data::(), - read_arr.as_struct().fields()[1] - .as_primitive() - .buffer() - .typed_data::() - ); - } -} diff --git a/vortex-array/src/array/varbin/accessor.rs b/vortex-array/src/array/varbin/accessor.rs index ff6fd1b055..efdf4949e0 100644 --- a/vortex-array/src/array/varbin/accessor.rs +++ b/vortex-array/src/array/varbin/accessor.rs @@ -1,26 +1,47 @@ +use vortex_error::VortexResult; + use crate::accessor::ArrayAccessor; -use crate::array::downcast::DowncastArrayBuiltin; use crate::array::varbin::VarBinArray; +use crate::match_each_integer_ptype; use crate::validity::ArrayValidity; -impl<'a> ArrayAccessor<'a, &'a [u8]> for VarBinArray { - fn value(&'a self, index: usize) -> Option<&'a [u8]> { - if self.is_valid(index) { - let start = self.offset_at(index); - let end = self.offset_at(index + 1); - Some(&self.bytes().as_primitive().buffer()[start..end]) - } else { - None - } - } -} +impl ArrayAccessor<[u8]> for VarBinArray<'_> { + fn with_iterator(&self, f: F) -> VortexResult + where + F: for<'a> FnOnce(&mut (dyn Iterator>)) -> R, + { + // TODO(ngates): what happens if bytes is much larger than sliced_bytes? + let primitive = self.bytes().flatten_primitive()?; + let offsets = self.offsets().flatten_primitive()?; + let validity = self.logical_validity().to_null_buffer()?; + + match_each_integer_ptype!(offsets.ptype(), |$T| { + let offsets = offsets.typed_data::<$T>(); + let bytes = primitive.typed_data::(); -impl ArrayAccessor<'_, Vec> for VarBinArray { - fn value(&self, index: usize) -> Option> { - if self.is_valid(index) { - Some(self.bytes_at(index).unwrap()) - } else { - None - } + match validity { + None => { + let mut iter = offsets + .iter() + .zip(offsets.iter().skip(1)) + .map(|(start, end)| Some(&bytes[*start as usize..*end as usize])); + Ok(f(&mut iter)) + } + Some(validity) => { + let mut iter = offsets + .iter() + .zip(offsets.iter().skip(1)) + .zip(validity.iter()) + .map(|((start, end), valid)| { + if valid { + Some(&bytes[*start as usize..*end as usize]) + } else { + None + } + }); + Ok(f(&mut iter)) + } + } + }) } } diff --git a/vortex-array2/src/array/varbin/array.rs b/vortex-array/src/array/varbin/array.rs similarity index 93% rename from vortex-array2/src/array/varbin/array.rs rename to vortex-array/src/array/varbin/array.rs index 42ef307ac7..4a5bd2dc90 100644 --- a/vortex-array2/src/array/varbin/array.rs +++ b/vortex-array/src/array/varbin/array.rs @@ -18,7 +18,7 @@ impl ArrayValidity for VarBinArray<'_> { impl AcceptArrayVisitor for VarBinArray<'_> { fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { visitor.visit_child("offsets", &self.offsets())?; - visitor.visit_child("offsets", &self.bytes())?; + visitor.visit_child("bytes", &self.bytes())?; visitor.visit_validity(&self.validity()) } } diff --git a/vortex-array/src/array/varbin/builder.rs b/vortex-array/src/array/varbin/builder.rs index 33582696e1..0ab49595fe 100644 --- a/vortex-array/src/array/varbin/builder.rs +++ b/vortex-array/src/array/varbin/builder.rs @@ -1,11 +1,13 @@ +use std::mem; + use arrow_buffer::NullBufferBuilder; use vortex_schema::DType; use crate::array::primitive::PrimitiveArray; -use crate::array::varbin::VarBinArray; -use crate::array::Array; +use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; use crate::ptype::NativePType; use crate::validity::Validity; +use crate::IntoArray; pub struct VarBinBuilder { offsets: Vec, @@ -15,15 +17,11 @@ pub struct VarBinBuilder { impl VarBinBuilder { pub fn with_capacity(len: usize) -> Self { - Self::with_capacity_and_values_size(len, 0) - } - - pub fn with_capacity_and_values_size(len: usize, data_size: usize) -> Self { let mut offsets = Vec::with_capacity(len + 1); offsets.push(O::zero()); Self { offsets, - data: Vec::with_capacity(data_size), + data: Vec::new(), validity: NullBufferBuilder::new(len), } } @@ -50,24 +48,20 @@ impl VarBinBuilder { self.validity.append_null(); } - pub fn finish(mut self, dtype: DType) -> VarBinArray { - let offsets = PrimitiveArray::from(self.offsets); - let data = PrimitiveArray::from(self.data); + pub fn finish(&mut self, dtype: DType) -> OwnedVarBinArray { + let offsets = PrimitiveArray::from(mem::take(&mut self.offsets)); + let data = PrimitiveArray::from(mem::take(&mut self.data)); let nulls = self.validity.finish(); let validity = if dtype.is_nullable() { - Some( - nulls - .map(Validity::from) - .unwrap_or_else(|| Validity::Valid(offsets.len() - 1)), - ) + nulls.map(Validity::from).unwrap_or(Validity::AllValid) } else { assert!(nulls.is_none(), "dtype and validity mismatch"); - None + Validity::NonNullable }; - VarBinArray::new(offsets.into_array(), data.into_array(), dtype, validity) + VarBinArray::try_new(offsets.into_array(), data.into_array(), dtype, validity).unwrap() } } @@ -77,9 +71,9 @@ mod test { use vortex_schema::Nullability::Nullable; use crate::array::varbin::builder::VarBinBuilder; - use crate::array::Array; use crate::compute::scalar_at::scalar_at; use crate::scalar::Utf8Scalar; + use crate::{ArrayDType, IntoArray}; #[test] fn test_builder() { @@ -87,10 +81,10 @@ mod test { builder.push(Some(b"hello")); builder.push(None); builder.push(Some(b"world")); - let array = builder.finish(DType::Utf8(Nullable)); + let array = builder.finish(DType::Utf8(Nullable)).into_array(); assert_eq!(array.len(), 3); - assert_eq!(array.nullability(), Nullable); + assert_eq!(array.dtype().nullability(), Nullable); assert_eq!( scalar_at(&array, 0).unwrap(), Utf8Scalar::nullable("hello".to_owned()).into() diff --git a/vortex-array/src/array/varbin/compute/mod.rs b/vortex-array/src/array/varbin/compute/mod.rs index 8828407883..8783587d13 100644 --- a/vortex-array/src/array/varbin/compute/mod.rs +++ b/vortex-array/src/array/varbin/compute/mod.rs @@ -7,28 +7,25 @@ use itertools::Itertools; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; -use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::{varbin_scalar, VarBinArray}; -use crate::array::{Array, ArrayRef}; -use crate::arrow::wrappers::{as_nulls, as_offset_buffer}; +use crate::arrow::wrappers::as_offset_buffer; use crate::compute::as_arrow::AsArrowArray; use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; use crate::compute::cast::cast; -use crate::compute::flatten::{flatten, flatten_primitive, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::SliceFn; use crate::compute::take::TakeFn; use crate::compute::ArrayCompute; use crate::ptype::PType; use crate::scalar::Scalar; -use crate::validity::{ArrayValidity, OwnedValidity, Validity}; -use crate::view::ToOwnedView; +use crate::validity::{ArrayValidity, Validity}; +use crate::{Array, ArrayDType, IntoArray, OwnedArray, ToArray}; mod slice; mod take; -impl ArrayCompute for VarBinArray { +impl ArrayCompute for VarBinArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } @@ -37,10 +34,6 @@ impl ArrayCompute for VarBinArray { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -54,27 +47,25 @@ impl ArrayCompute for VarBinArray { } } -impl AsContiguousFn for VarBinArray { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult { - let bytes_chunks: Vec = arrays +impl AsContiguousFn for VarBinArray<'_> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { + let bytes_chunks: Vec = arrays .iter() - .map(|a| a.as_varbin().sliced_bytes()) + .map(|a| VarBinArray::try_from(a).unwrap().sliced_bytes()) .try_collect()?; let bytes = as_contiguous(&bytes_chunks)?; let validity = if self.dtype().is_nullable() { - Some(Validity::from_iter( - arrays.iter().map(|a| a.logical_validity()), - )) + Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) } else { - None + Validity::NonNullable }; let mut offsets = Vec::new(); offsets.push(0); - for a in arrays.iter().map(|a| a.as_varbin()) { + for a in arrays.iter().map(|a| VarBinArray::try_from(a).unwrap()) { let first_offset: u64 = a.first_offset()?; - let offsets_array = flatten_primitive(cast(a.offsets(), PType::U64.into())?.as_ref())?; + let offsets_array = cast(&a.offsets(), PType::U64.into())?.flatten_primitive()?; let shift = offsets.last().copied().unwrap_or(0); offsets.extend( offsets_array @@ -87,40 +78,39 @@ impl AsContiguousFn for VarBinArray { let offsets_array = PrimitiveArray::from(offsets).into_array(); - Ok(VarBinArray::new(offsets_array, bytes, self.dtype.clone(), validity).into_array()) + VarBinArray::try_new(offsets_array, bytes, self.dtype().clone(), validity) + .map(|a| a.into_array()) } } -impl AsArrowArray for VarBinArray { +impl AsArrowArray for VarBinArray<'_> { fn as_arrow(&self) -> VortexResult { // Ensure the offsets are either i32 or i64 - let offsets = flatten_primitive(self.offsets())?; + let offsets = self.offsets().flatten_primitive()?; let offsets = match offsets.ptype() { PType::I32 | PType::I64 => offsets, // Unless it's u64, everything else can be converted into an i32. // FIXME(ngates): do not copy offsets again - PType::U64 => { - flatten_primitive(cast(&offsets.to_array(), PType::I64.into())?.as_ref())? - } - _ => flatten_primitive(cast(&offsets.to_array(), PType::I32.into())?.as_ref())?, + PType::U64 => cast(&offsets.to_array(), PType::I64.into())?.flatten_primitive()?, + _ => cast(&offsets.to_array(), PType::I32.into())?.flatten_primitive()?, }; - let nulls = as_nulls(self.logical_validity())?; + let nulls = self.logical_validity().to_null_buffer()?; - let data = flatten_primitive(self.bytes())?; + let data = self.bytes().flatten_primitive()?; assert_eq!(data.ptype(), PType::U8); - let data = data.buffer().clone(); + let data = data.buffer(); // Switch on Arrow DType. Ok(match self.dtype() { DType::Binary(_) => match offsets.ptype() { PType::I32 => Arc::new(BinaryArray::new( as_offset_buffer::(offsets), - data, + data.into(), nulls, )), PType::I64 => Arc::new(LargeBinaryArray::new( as_offset_buffer::(offsets), - data, + data.into(), nulls, )), _ => panic!("Invalid offsets type"), @@ -128,12 +118,12 @@ impl AsArrowArray for VarBinArray { DType::Utf8(_) => match offsets.ptype() { PType::I32 => Arc::new(StringArray::new( as_offset_buffer::(offsets), - data, + data.into(), nulls, )), PType::I64 => Arc::new(LargeStringArray::new( as_offset_buffer::(offsets), - data, + data.into(), nulls, )), _ => panic!("Invalid offsets type"), @@ -143,20 +133,7 @@ impl AsArrowArray for VarBinArray { } } -impl FlattenFn for VarBinArray { - fn flatten(&self) -> VortexResult { - let bytes = flatten(self.bytes())?.into_array(); - let offsets = flatten(self.offsets())?.into_array(); - Ok(FlattenedArray::VarBin(VarBinArray::new( - offsets, - bytes, - self.dtype.clone(), - self.validity().to_owned_view(), - ))) - } -} - -impl ScalarAtFn for VarBinArray { +impl ScalarAtFn for VarBinArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { self.bytes_at(index) diff --git a/vortex-array/src/array/varbin/compute/slice.rs b/vortex-array/src/array/varbin/compute/slice.rs index 74c62c0005..5d8834cdc8 100644 --- a/vortex-array/src/array/varbin/compute/slice.rs +++ b/vortex-array/src/array/varbin/compute/slice.rs @@ -1,18 +1,17 @@ use vortex_error::VortexResult; use crate::array::varbin::VarBinArray; -use crate::array::{Array, ArrayRef}; use crate::compute::slice::{slice, SliceFn}; -use crate::validity::OwnedValidity; +use crate::{ArrayDType, IntoArray, OwnedArray}; -impl SliceFn for VarBinArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(VarBinArray::new( - slice(self.offsets(), start, stop + 1)?, +impl SliceFn for VarBinArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + VarBinArray::try_new( + slice(&self.offsets(), start, stop + 1)?, self.bytes().clone(), self.dtype().clone(), - self.validity().map(|v| v.slice(start, stop)).transpose()?, + self.validity().slice(start, stop)?, ) - .into_array()) + .map(|a| a.into_array()) } } diff --git a/vortex-array/src/array/varbin/compute/take.rs b/vortex-array/src/array/varbin/compute/take.rs index 5704a76e24..5fc52d2123 100644 --- a/vortex-array/src/array/varbin/compute/take.rs +++ b/vortex-array/src/array/varbin/compute/take.rs @@ -1,27 +1,28 @@ +use arrow_buffer::NullBuffer; use vortex_error::VortexResult; use vortex_schema::DType; use crate::array::varbin::builder::VarBinBuilder; -use crate::array::varbin::VarBinArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::flatten::flatten_primitive; +use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; use crate::compute::take::TakeFn; use crate::match_each_integer_ptype; use crate::ptype::NativePType; -use crate::validity::OwnedValidity; -use crate::validity::ValidityView; +use crate::validity::Validity; +use crate::ArrayDType; +use crate::IntoArray; +use crate::{Array, OwnedArray}; -impl TakeFn for VarBinArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for VarBinArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { // TODO(ngates): support i64 indices. assert!( indices.len() < i32::MAX as usize, "indices.len() must be less than i32::MAX" ); - let offsets = flatten_primitive(self.offsets())?; - let data = flatten_primitive(self.bytes())?; - let indices = flatten_primitive(indices)?; + let offsets = self.offsets().flatten_primitive()?; + let data = self.bytes().flatten_primitive()?; + let indices = indices.clone().flatten_primitive()?; match_each_integer_ptype!(offsets.ptype(), |$O| { match_each_integer_ptype!(indices.ptype(), |$I| { Ok(take( @@ -30,7 +31,7 @@ impl TakeFn for VarBinArray { data.typed_data::(), indices.typed_data::<$I>(), self.validity(), - ).into_array()) + )?.into_array()) }) }) } @@ -41,10 +42,11 @@ fn take( offsets: &[O], data: &[u8], indices: &[I], - validity: Option, -) -> VarBinArray { - if let Some(v) = validity { - return take_nullable(dtype, offsets, data, indices, v); + validity: Validity, +) -> VortexResult { + let logical_validity = validity.to_logical(offsets.len() - 1); + if let Some(v) = logical_validity.to_null_buffer()? { + return Ok(take_nullable(dtype, offsets, data, indices, v)); } let mut builder = VarBinBuilder::::with_capacity(indices.len()); @@ -54,7 +56,7 @@ fn take( let stop = offsets[idx + 1].to_usize().unwrap(); builder.push(Some(&data[start..stop])); } - builder.finish(dtype) + Ok(builder.finish(dtype)) } fn take_nullable( @@ -62,12 +64,12 @@ fn take_nullable( offsets: &[O], data: &[u8], indices: &[I], - validity: ValidityView, -) -> VarBinArray { + null_buffer: NullBuffer, +) -> OwnedVarBinArray { let mut builder = VarBinBuilder::::with_capacity(indices.len()); for &idx in indices { let idx = idx.to_usize().unwrap(); - if validity.is_valid(idx) { + if null_buffer.is_valid(idx) { let start = offsets[idx].to_usize().unwrap(); let stop = offsets[idx + 1].to_usize().unwrap(); builder.push(Some(&data[start..stop])); diff --git a/vortex-array2/src/array/varbin/flatten.rs b/vortex-array/src/array/varbin/flatten.rs similarity index 100% rename from vortex-array2/src/array/varbin/flatten.rs rename to vortex-array/src/array/varbin/flatten.rs diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index 99f89d9c41..d516a44e42 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -1,64 +1,41 @@ -use std::sync::{Arc, RwLock}; - -use linkme::distributed_slice; use num_traits::AsPrimitive; -pub use stats::compute_stats; -pub use stats::VarBinAccumulator; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; +use serde::{Deserialize, Serialize}; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::{IntWidth, Nullability, Signedness}; -use crate::array::downcast::DowncastArrayBuiltin; use crate::array::varbin::builder::VarBinBuilder; -use crate::array::{Array, ArrayRef}; -use crate::compress::EncodingCompression; -use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::iterator::ArrayIter; -use crate::match_each_native_ptype; use crate::ptype::NativePType; -use crate::scalar::{BinaryScalar, Scalar, Utf8Scalar}; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsSet}; -use crate::validity::OwnedValidity; -use crate::validity::{Validity, ValidityView}; -use crate::view::AsView; -use crate::{impl_array, ArrayWalker}; +use crate::scalar::{BinaryScalar, Utf8Scalar}; +use crate::validity::{Validity, ValidityMetadata}; +use crate::{impl_encoding, OwnedArray, ToArrayData}; +use crate::{match_each_native_ptype, ArrayDType}; mod accessor; +mod array; pub mod builder; -mod compress; mod compute; -mod serde; +mod flatten; mod stats; +pub use stats::compute_stats; -#[derive(Debug, Clone)] -pub struct VarBinArray { - offsets: ArrayRef, - bytes: ArrayRef, - dtype: DType, - validity: Option, - stats: Arc>, -} +use crate::array::primitive::PrimitiveArray; -impl VarBinArray { - pub fn new( - offsets: ArrayRef, - bytes: ArrayRef, - dtype: DType, - validity: Option, - ) -> Self { - Self::try_new(offsets, bytes, dtype, validity).unwrap() - } +impl_encoding!("vortex.varbin", VarBin); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VarBinMetadata { + validity: ValidityMetadata, + offsets_dtype: DType, +} +impl VarBinArray<'_> { pub fn try_new( - offsets: ArrayRef, - bytes: ArrayRef, + offsets: Array, + bytes: Array, dtype: DType, - validity: Option, + validity: Validity, ) -> VortexResult { if !matches!(offsets.dtype(), DType::Int(_, _, Nullability::NonNullable)) { vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype()); @@ -72,62 +49,71 @@ impl VarBinArray { if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) { vortex_bail!(MismatchedTypes: "utf8 or binary", dtype); } - - if let Some(v) = validity.as_view() { - assert_eq!(v.len(), offsets.len() - 1); + if dtype.is_nullable() == (validity == Validity::NonNullable) { + vortex_bail!("incorrect validity {:?}", validity); } - let dtype = if validity.is_some() && !dtype.is_nullable() { - dtype.as_nullable() - } else { - dtype + + let metadata = VarBinMetadata { + validity: validity.to_metadata(offsets.len() - 1)?, + offsets_dtype: offsets.dtype().clone(), }; - Ok(Self { - offsets, - bytes, - dtype, - validity, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) + let mut children = Vec::with_capacity(3); + children.push(offsets.to_array_data()); + children.push(bytes.to_array_data()); + if let Some(a) = validity.into_array_data() { + children.push(a) + } + + Self::try_from_parts(dtype, metadata, children.into(), HashMap::default()) } #[inline] - pub fn offsets(&self) -> &ArrayRef { - &self.offsets + pub fn offsets(&self) -> Array { + self.array() + .child(0, &self.metadata().offsets_dtype) + .expect("missing offsets") } pub fn first_offset(&self) -> VortexResult { - scalar_at(self.offsets(), 0)? + scalar_at(&self.offsets(), 0)? .cast(&DType::from(T::PTYPE))? .try_into() } #[inline] - pub fn bytes(&self) -> &ArrayRef { - &self.bytes + pub fn bytes(&self) -> Array { + self.array().child(1, &DType::BYTES).expect("missing bytes") + } + + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(2, &Validity::DTYPE)) } - pub fn sliced_bytes(&self) -> VortexResult { - let first_offset: usize = scalar_at(self.offsets(), 0)?.try_into()?; - let last_offset: usize = scalar_at(self.offsets(), self.offsets().len() - 1)?.try_into()?; - slice(self.bytes(), first_offset, last_offset) + pub fn sliced_bytes(&self) -> VortexResult { + let first_offset: usize = scalar_at(&self.offsets(), 0)?.try_into()?; + let last_offset: usize = + scalar_at(&self.offsets(), self.offsets().len() - 1)?.try_into()?; + slice(&self.bytes(), first_offset, last_offset) } pub fn from_vec>(vec: Vec, dtype: DType) -> Self { - let values_size: usize = vec.iter().map(|v| v.as_ref().len()).sum(); - if values_size < u32::MAX as usize { - Self::from_vec_sized::(vec, values_size, dtype) + let size: usize = vec.iter().map(|v| v.as_ref().len()).sum(); + if size < u32::MAX as usize { + Self::from_vec_sized::(vec, dtype) } else { - Self::from_vec_sized::(vec, values_size, dtype) + Self::from_vec_sized::(vec, dtype) } } - fn from_vec_sized(vec: Vec, values_size: usize, dtype: DType) -> Self + fn from_vec_sized(vec: Vec, dtype: DType) -> Self where K: NativePType, T: AsRef<[u8]>, { - let mut builder = VarBinBuilder::::with_capacity_and_values_size(vec.len(), values_size); + let mut builder = VarBinBuilder::::with_capacity(vec.len()); for v in vec { builder.push_value(v.as_ref()); } @@ -146,174 +132,73 @@ impl VarBinArray { builder.finish(dtype) } - pub fn iter_primitive(&self) -> VortexResult> { - self.bytes() - .maybe_primitive() - .ok_or_else(|| vortex_err!(ComputeError: "Bytes array was not a primitive array")) - .map(|_| ArrayIter::new(self)) - } - - pub fn iter(&self) -> VarBinIter> { - ArrayIter::new(self) - } - - fn offset_at(&self, index: usize) -> usize { - if let Some(parray) = self.offsets().maybe_primitive() { - match_each_native_ptype!(parray.ptype(), |$P| { - parray.typed_data::<$P>()[index].as_() + pub fn offset_at(&self, index: usize) -> usize { + PrimitiveArray::try_from(self.offsets()) + .ok() + .map(|p| { + match_each_native_ptype!(p.ptype(), |$P| { + p.typed_data::<$P>()[index].as_() + }) + }) + .unwrap_or_else(|| { + scalar_at(&self.offsets(), index) + .unwrap() + .try_into() + .unwrap() }) - } else { - scalar_at(self.offsets(), index) - .unwrap() - .try_into() - .unwrap() - } } pub fn bytes_at(&self, index: usize) -> VortexResult> { let start = self.offset_at(index); let end = self.offset_at(index + 1); - let sliced = slice(self.bytes(), start, end)?; - Ok(flatten_primitive(sliced.as_ref())? - .into_buffer() - .into_vec() - .unwrap_or_else(|buf| buf.to_vec())) - } -} - -pub type VarBinIter<'a, T> = ArrayIter<'a, VarBinArray, T>; - -impl Array for VarBinArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.offsets.len() - 1 - } - - #[inline] - fn is_empty(&self) -> bool { - self.offsets.len() <= 1 - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &VarBinEncoding - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn nbytes(&self) -> usize { - self.bytes.nbytes() + self.offsets.nbytes() - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.offsets())?; - walker.visit_child(self.bytes()) - } -} - -impl OwnedValidity for VarBinArray { - fn validity(&self) -> Option { - self.validity.as_view() + let sliced = slice(&self.bytes(), start, end)?; + Ok(sliced.flatten_primitive()?.buffer().as_slice().to_vec()) } } -#[derive(Debug)] -pub struct VarBinEncoding; - -impl VarBinEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.varbin"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_VARBIN: EncodingRef = &VarBinEncoding; - -impl Encoding for VarBinEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} - -impl ArrayDisplay for VarBinArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("offsets", self.offsets())?; - f.child("bytes", self.bytes())?; - f.validity(self.validity()) - } -} - -impl From> for VarBinArray { +impl From> for VarBinArray<'_> { fn from(value: Vec<&[u8]>) -> Self { VarBinArray::from_vec(value, DType::Binary(Nullability::NonNullable)) } } -impl From>> for VarBinArray { +impl From>> for VarBinArray<'_> { fn from(value: Vec>) -> Self { VarBinArray::from_vec(value, DType::Binary(Nullability::NonNullable)) } } -impl From> for VarBinArray { +impl From> for VarBinArray<'_> { fn from(value: Vec) -> Self { VarBinArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) } } -impl From> for VarBinArray { +impl From> for VarBinArray<'_> { fn from(value: Vec<&str>) -> Self { VarBinArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) } } -impl<'a> FromIterator> for VarBinArray { +impl<'a> FromIterator> for VarBinArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinArray::from_iter(iter, DType::Binary(Nullability::Nullable)) } } -impl FromIterator>> for VarBinArray { +impl FromIterator>> for VarBinArray<'_> { fn from_iter>>>(iter: T) -> Self { VarBinArray::from_iter(iter, DType::Binary(Nullability::Nullable)) } } -impl FromIterator> for VarBinArray { +impl FromIterator> for VarBinArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinArray::from_iter(iter, DType::Utf8(Nullability::Nullable)) } } -impl<'a> FromIterator> for VarBinArray { +impl<'a> FromIterator> for VarBinArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinArray::from_iter(iter, DType::Utf8(Nullability::Nullable)) } @@ -332,17 +217,20 @@ pub fn varbin_scalar(value: Vec, dtype: &DType) -> Scalar { } } +impl EncodingCompression for VarBinEncoding {} + #[cfg(test)] mod test { use vortex_schema::{DType, Nullability}; use crate::array::primitive::PrimitiveArray; use crate::array::varbin::VarBinArray; - use crate::array::Array; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; + use crate::validity::Validity; + use crate::{IntoArray, OwnedArray}; - fn binary_array() -> VarBinArray { + fn binary_array() -> OwnedArray { let values = PrimitiveArray::from( "hello worldhello world this is a long string" .as_bytes() @@ -350,12 +238,14 @@ mod test { ); let offsets = PrimitiveArray::from(vec![0, 11, 44]); - VarBinArray::new( + VarBinArray::try_new( offsets.into_array(), values.into_array(), DType::Utf8(Nullability::NonNullable), - None, + Validity::NonNullable, ) + .unwrap() + .into_array() } #[test] diff --git a/vortex-array/src/array/varbin/serde.rs b/vortex-array/src/array/varbin/serde.rs deleted file mode 100644 index c8bdd829d6..0000000000 --- a/vortex-array/src/array/varbin/serde.rs +++ /dev/null @@ -1,69 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::varbin::{VarBinArray, VarBinEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use crate::validity::OwnedValidity; - -impl ArraySerde for VarBinArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_validity(self.validity())?; - ctx.dtype(self.offsets().dtype())?; - ctx.write(self.offsets())?; - ctx.write(self.bytes()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for VarBinEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let validity = ctx.read_validity()?; - // TODO(robert): Stop writing this - let offsets_dtype = ctx.dtype()?; - let offsets = ctx.with_schema(&offsets_dtype).read()?; - let bytes = ctx.bytes().read()?; - Ok(VarBinArray::new(offsets, bytes, ctx.schema().clone(), validity).into_array()) - } -} - -#[cfg(test)] -mod test { - use vortex_schema::{DType, Nullability}; - - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::varbin::VarBinArray; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = VarBinArray::from_vec( - vec!["a", "def", "hello", "this", "is", "a", "test"], - DType::Utf8(Nullability::NonNullable), - ); - - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!( - arr.offsets().as_primitive().buffer().typed_data::(), - read_arr - .as_varbin() - .offsets() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!( - arr.bytes().as_primitive().buffer().typed_data::(), - read_arr - .as_varbin() - .bytes() - .as_primitive() - .buffer() - .typed_data::() - ); - } -} diff --git a/vortex-array/src/array/varbin/stats.rs b/vortex-array/src/array/varbin/stats.rs index e92e61f5c5..db8159af48 100644 --- a/vortex-array/src/array/varbin/stats.rs +++ b/vortex-array/src/array/varbin/stats.rs @@ -1,36 +1,30 @@ -use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashMap; use vortex_error::VortexResult; use vortex_schema::DType; +use crate::accessor::ArrayAccessor; use crate::array::varbin::{varbin_scalar, VarBinArray}; -use crate::array::Array; use crate::scalar::Scalar; -use crate::stats::{Stat, StatsCompute, StatsSet}; +use crate::stats::{ArrayStatisticsCompute, Stat}; +use crate::{ArrayDType, ArrayTrait}; -impl StatsCompute for VarBinArray { - fn compute(&self, _stat: &Stat) -> VortexResult { +impl ArrayStatisticsCompute for VarBinArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { if self.is_empty() { - return Ok(StatsSet::new()); + return Ok(HashMap::new()); } - - Ok(self - .iter_primitive() - .map(|prim_iter| compute_stats(&mut prim_iter.map(|s| s.map(Cow::from)), self.dtype())) - .unwrap_or_else(|_| { - compute_stats(&mut self.iter().map(|s| s.map(Cow::from)), self.dtype()) - })) + self.with_iterator(|iter| compute_stats(iter, self.dtype())) } } pub fn compute_stats( - iter: &mut dyn Iterator>>, + iter: &mut dyn Iterator>, dtype: &DType, -) -> StatsSet { +) -> HashMap { let mut leading_nulls: usize = 0; - let mut first_value: Option> = None; + let mut first_value: Option<&[u8]> = None; for v in &mut *iter { if v.is_none() { leading_nulls += 1; @@ -50,8 +44,8 @@ pub fn compute_stats( } } -fn all_null_stats(len: usize, dtype: &DType) -> StatsSet { - StatsSet::from(HashMap::from([ +fn all_null_stats(len: usize, dtype: &DType) -> HashMap { + HashMap::from([ (Stat::Min, Scalar::null(dtype)), (Stat::Max, Scalar::null(dtype)), (Stat::IsConstant, true.into()), @@ -59,26 +53,25 @@ fn all_null_stats(len: usize, dtype: &DType) -> StatsSet { (Stat::IsStrictSorted, (len < 2).into()), (Stat::RunCount, 1.into()), (Stat::NullCount, len.into()), - ])) + ]) } -#[derive(Debug, Default)] pub struct VarBinAccumulator<'a> { - min: Cow<'a, [u8]>, - max: Cow<'a, [u8]>, + min: &'a [u8], + max: &'a [u8], is_constant: bool, is_sorted: bool, is_strict_sorted: bool, - last_value: Cow<'a, [u8]>, + last_value: &'a [u8], null_count: usize, runs: usize, } impl<'a> VarBinAccumulator<'a> { - pub fn new(value: Cow<'a, [u8]>) -> Self { + pub fn new(value: &'a [u8]) -> Self { Self { - min: value.clone(), - max: value.clone(), + min: value, + max: value, is_constant: true, is_sorted: true, is_strict_sorted: true, @@ -88,7 +81,7 @@ impl<'a> VarBinAccumulator<'a> { } } - pub fn nullable_next(&mut self, val: Option>) { + pub fn nullable_next(&mut self, val: Option<&'a [u8]>) { match val { None => self.null_count += 1, Some(v) => self.next(v), @@ -99,14 +92,14 @@ impl<'a> VarBinAccumulator<'a> { self.null_count += null_count; } - pub fn next(&mut self, val: Cow<'a, [u8]>) { + pub fn next(&mut self, val: &'a [u8]) { if val < self.min { self.min.clone_from(&val); } else if val > self.max { self.max.clone_from(&val); } - match val.cmp(&self.last_value) { + match val.cmp(self.last_value) { Ordering::Less => self.is_sorted = false, Ordering::Equal => { self.is_strict_sorted = false; @@ -119,8 +112,8 @@ impl<'a> VarBinAccumulator<'a> { self.runs += 1; } - pub fn finish(&self, dtype: &DType) -> StatsSet { - StatsSet::from(HashMap::from([ + pub fn finish(&self, dtype: &DType) -> HashMap { + HashMap::from([ (Stat::Min, varbin_scalar(self.min.to_vec(), dtype)), (Stat::Max, varbin_scalar(self.max.to_vec(), dtype)), (Stat::RunCount, self.runs.into()), @@ -128,7 +121,7 @@ impl<'a> VarBinAccumulator<'a> { (Stat::IsStrictSorted, self.is_strict_sorted.into()), (Stat::IsConstant, self.is_constant.into()), (Stat::NullCount, self.null_count.into()), - ])) + ]) } } @@ -136,12 +129,10 @@ impl<'a> VarBinAccumulator<'a> { mod test { use vortex_schema::{DType, Nullability}; - use crate::array::varbin::VarBinArray; - use crate::array::Array; - use crate::scalar::Utf8Scalar; - use crate::stats::Stat; + use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; + use crate::stats::{ArrayStatistics, Stat}; - fn array(dtype: DType) -> VarBinArray { + fn array(dtype: DType) -> OwnedVarBinArray { VarBinArray::from_vec( vec!["hello world", "hello world this is a long string"], dtype, @@ -152,100 +143,47 @@ mod test { fn utf8_stats() { let arr = array(DType::Utf8(Nullability::NonNullable)); assert_eq!( - arr.stats().get_or_compute_as::(&Stat::Min).unwrap(), - "hello world".to_owned() + arr.statistics().compute_as::(Stat::Min).unwrap(), + String::from("hello world") ); assert_eq!( - arr.stats().get_or_compute_as::(&Stat::Max).unwrap(), - "hello world this is a long string".to_owned() + arr.statistics().compute_as::(Stat::Max).unwrap(), + String::from("hello world this is a long string") ); assert_eq!( - arr.stats() - .get_or_compute_as::(&Stat::RunCount) + arr.statistics() + .compute_as::(Stat::RunCount) .unwrap(), 2 ); assert!(!arr - .stats() - .get_or_compute_as::(&Stat::IsConstant) - .unwrap()); - assert!(arr - .stats() - .get_or_compute_as::(&Stat::IsSorted) + .statistics() + .compute_as::(Stat::IsConstant) .unwrap()); + assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); } #[test] fn binary_stats() { let arr = array(DType::Binary(Nullability::NonNullable)); assert_eq!( - arr.stats() - .get_or_compute_as::>(&Stat::Min) - .unwrap(), + arr.statistics().compute_as::>(Stat::Min).unwrap(), "hello world".as_bytes().to_vec() ); assert_eq!( - arr.stats() - .get_or_compute_as::>(&Stat::Max) - .unwrap(), + arr.statistics().compute_as::>(Stat::Max).unwrap(), "hello world this is a long string".as_bytes().to_vec() ); assert_eq!( - arr.stats() - .get_or_compute_as::(&Stat::RunCount) + arr.statistics() + .compute_as::(Stat::RunCount) .unwrap(), 2 ); assert!(!arr - .stats() - .get_or_compute_as::(&Stat::IsConstant) - .unwrap()); - assert!(arr - .stats() - .get_or_compute_as::(&Stat::IsSorted) + .statistics() + .compute_as::(Stat::IsConstant) .unwrap()); - } - - #[test] - fn some_nulls() { - let array = VarBinArray::from_iter( - vec![ - Some("hello world"), - None, - Some("hello world this is a long string"), - None, - ], - DType::Utf8(Nullability::Nullable), - ); - assert_eq!( - array - .stats() - .get_or_compute_as::(&Stat::Min) - .unwrap(), - "hello world".to_owned() - ); - assert_eq!( - array - .stats() - .get_or_compute_as::(&Stat::Max) - .unwrap(), - "hello world this is a long string".to_owned() - ); - } - - #[test] - fn all_nulls() { - let array = VarBinArray::from_iter( - vec![Option::<&str>::None, None, None], - DType::Utf8(Nullability::Nullable), - ); - assert_eq!( - array.stats().get_or_compute(&Stat::Min).unwrap(), - Utf8Scalar::none().into() - ); - assert_eq!( - array.stats().get_or_compute(&Stat::Max).unwrap(), - Utf8Scalar::none().into() - ); + assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); } } diff --git a/vortex-array/src/array/varbinview/accessor.rs b/vortex-array/src/array/varbinview/accessor.rs index 0b8996e15f..45ce8a70d8 100644 --- a/vortex-array/src/array/varbinview/accessor.rs +++ b/vortex-array/src/array/varbinview/accessor.rs @@ -1,31 +1,52 @@ +use vortex_error::VortexResult; + use crate::accessor::ArrayAccessor; -use crate::array::downcast::DowncastArrayBuiltin; +use crate::array::primitive::PrimitiveArray; use crate::array::varbinview::VarBinViewArray; use crate::validity::ArrayValidity; -impl<'a> ArrayAccessor<'a, &'a [u8]> for VarBinViewArray { - fn value(&'a self, index: usize) -> Option<&'a [u8]> { - if self.is_valid(index) { - let view = &self.view_slice()[index]; - if view.is_inlined() { - Some(unsafe { &view.inlined.data }) - } else { - let offset = unsafe { view._ref.offset as usize }; - let buffer_idx = unsafe { view._ref.buffer_index as usize }; - Some(&self.data()[buffer_idx].as_primitive().buffer()[offset..offset + view.size()]) - } - } else { - None - } - } -} +impl ArrayAccessor<[u8]> for VarBinViewArray<'_> { + fn with_iterator FnOnce(&mut dyn Iterator>) -> R, R>( + &self, + f: F, + ) -> VortexResult { + let views = self.view_slice(); + let bytes: Vec = (0..self.metadata().n_children) + .map(|i| self.bytes(i).flatten_primitive()) + .collect::>>()?; + let validity = self.logical_validity().to_null_buffer()?; -impl<'a> ArrayAccessor<'a, Vec> for VarBinViewArray { - fn value(&'a self, index: usize) -> Option> { - if self.is_valid(index) { - Some(self.bytes_at(index).unwrap()) - } else { - None + match validity { + None => { + let mut iter = views.iter().map(|view| { + if view.is_inlined() { + Some(unsafe { &view.inlined.data as &[u8] }) + } else { + let offset = unsafe { view._ref.offset as usize }; + let buffer_idx = unsafe { view._ref.buffer_index as usize }; + Some(&bytes[buffer_idx].typed_data::()[offset..offset + view.size()]) + } + }); + Ok(f(&mut iter)) + } + Some(validity) => { + let mut iter = views.iter().zip(validity.iter()).map(|(view, valid)| { + if valid { + if view.is_inlined() { + Some(unsafe { &view.inlined.data as &[u8] }) + } else { + let offset = unsafe { view._ref.offset as usize }; + let buffer_idx = unsafe { view._ref.buffer_index as usize }; + Some( + &bytes[buffer_idx].typed_data::()[offset..offset + view.size()], + ) + } + } else { + None + } + }); + Ok(f(&mut iter)) + } } } } diff --git a/vortex-array/src/array/varbinview/builder.rs b/vortex-array/src/array/varbinview/builder.rs index dff47f2d53..2570a47eeb 100644 --- a/vortex-array/src/array/varbinview/builder.rs +++ b/vortex-array/src/array/varbinview/builder.rs @@ -6,14 +6,16 @@ use arrow_buffer::NullBufferBuilder; use vortex_schema::DType; use crate::array::primitive::PrimitiveArray; -use crate::array::varbinview::{BinaryView, Inlined, Ref, VarBinViewArray, VIEW_SIZE}; -use crate::array::{Array, ArrayRef, IntoArray}; +use crate::array::varbinview::{ + BinaryView, Inlined, OwnedVarBinViewArray, Ref, VarBinViewArray, VIEW_SIZE, +}; use crate::validity::Validity; +use crate::{ArrayData, IntoArray, IntoArrayData, ToArray}; pub struct VarBinViewBuilder> { views: Vec, nulls: NullBufferBuilder, - completed: Vec, + completed: Vec, in_progress: Vec, block_size: u32, phantom: PhantomData, @@ -49,7 +51,8 @@ impl> VarBinViewBuilder { ); if !done.is_empty() { assert!(self.completed.len() < u32::MAX as usize); - self.completed.push(PrimitiveArray::from(done).into_array()); + self.completed + .push(PrimitiveArray::from(done).into_array_data()); } } @@ -79,22 +82,22 @@ impl> VarBinViewBuilder { self.nulls.append_null(); } - pub fn finish(mut self, dtype: DType) -> VarBinViewArray { - let mut completed = self.completed; + pub fn finish(mut self, dtype: DType) -> OwnedVarBinViewArray { + let mut completed = self + .completed + .into_iter() + .map(|d| d.into_array()) + .collect::>(); if !self.in_progress.is_empty() { completed.push(PrimitiveArray::from(self.in_progress).into_array()); } let nulls = self.nulls.finish(); let validity = if dtype.is_nullable() { - Some( - nulls - .map(Validity::from) - .unwrap_or_else(|| Validity::Valid(self.views.len())), - ) + nulls.map(Validity::from).unwrap_or(Validity::AllValid) } else { assert!(nulls.is_none(), "dtype and validity mismatch"); - None + Validity::NonNullable }; // convert Vec to Vec which can be stored as an array @@ -107,6 +110,12 @@ impl> VarBinViewBuilder { ) }; - VarBinViewArray::try_new(views_u8.into_array(), completed, dtype, validity).unwrap() + VarBinViewArray::try_new( + PrimitiveArray::from(views_u8).to_array(), + completed, + dtype, + validity, + ) + .unwrap() } } diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index f6d80d59e9..3ad2068aa6 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use arrow_array::{ArrayRef as ArrowArrayRef, BinaryViewArray, StringViewArray}; +use arrow_buffer::Buffer as ArrowBuffer; use arrow_buffer::ScalarBuffer; use itertools::Itertools; use vortex_error::{vortex_bail, VortexResult}; @@ -8,27 +9,20 @@ use vortex_schema::DType; use crate::array::varbin::varbin_scalar; use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE}; -use crate::array::{Array, ArrayRef}; -use crate::arrow::wrappers::as_nulls; use crate::compute::as_arrow::AsArrowArray; -use crate::compute::flatten::{flatten, flatten_primitive, FlattenFn, FlattenedArray}; use crate::compute::scalar_at::ScalarAtFn; use crate::compute::slice::{slice, SliceFn}; use crate::compute::ArrayCompute; use crate::ptype::PType; use crate::scalar::Scalar; -use crate::validity::{ArrayValidity, OwnedValidity}; -use crate::view::ToOwnedView; +use crate::validity::ArrayValidity; +use crate::{ArrayDType, IntoArray, IntoArrayData, OwnedArray}; -impl ArrayCompute for VarBinViewArray { +impl ArrayCompute for VarBinViewArray<'_> { fn as_arrow(&self) -> Option<&dyn AsArrowArray> { Some(self) } - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -38,7 +32,7 @@ impl ArrayCompute for VarBinViewArray { } } -impl ScalarAtFn for VarBinViewArray { +impl ScalarAtFn for VarBinViewArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { if self.is_valid(index) { self.bytes_at(index) @@ -49,35 +43,16 @@ impl ScalarAtFn for VarBinViewArray { } } -impl FlattenFn for VarBinViewArray { - fn flatten(&self) -> VortexResult { - let views = flatten(self.views())?.into_array(); - let data = self - .data() - .iter() - .map(|d| flatten(d.as_ref()).unwrap().into_array()) - .collect::>(); - Ok(FlattenedArray::VarBinView(VarBinViewArray::try_new( - views, - data, - self.dtype.clone(), - self.validity().to_owned_view(), - )?)) - } -} - -impl AsArrowArray for VarBinViewArray { +impl AsArrowArray for VarBinViewArray<'_> { fn as_arrow(&self) -> VortexResult { // Views should be buffer of u8 - let views = flatten_primitive(self.views())?; + let views = self.views().flatten_primitive()?; assert_eq!(views.ptype(), PType::U8); - let nulls = as_nulls(self.logical_validity())?; + let nulls = self.logical_validity().to_null_buffer()?; - let data = self - .data() - .iter() - .map(|d| flatten_primitive(d.as_ref()).unwrap()) - .collect::>(); + let data = (0..self.metadata().n_children) + .map(|i| self.bytes(i).flatten_primitive()) + .collect::>>()?; if !data.is_empty() { assert_eq!(data[0].ptype(), PType::U8); assert!(data.iter().map(|d| d.ptype()).all_equal()); @@ -85,18 +60,18 @@ impl AsArrowArray for VarBinViewArray { let data = data .iter() - .map(|p| p.buffer().to_owned()) + .map(|p| ArrowBuffer::from(p.buffer())) .collect::>(); // Switch on Arrow DType. Ok(match self.dtype() { DType::Binary(_) => Arc::new(BinaryViewArray::new( - ScalarBuffer::::from(views.buffer().clone()), + ScalarBuffer::::from(ArrowBuffer::from(views.buffer())), data, nulls, )), DType::Utf8(_) => Arc::new(StringViewArray::new( - ScalarBuffer::::from(views.buffer().clone()), + ScalarBuffer::::from(ArrowBuffer::from(views.buffer())), data, nulls, )), @@ -105,13 +80,17 @@ impl AsArrowArray for VarBinViewArray { } } -impl SliceFn for VarBinViewArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for VarBinViewArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(VarBinViewArray::try_new( - slice(self.views(), start * VIEW_SIZE, stop * VIEW_SIZE)?, - self.data().to_vec(), + slice(&self.views(), start * VIEW_SIZE, stop * VIEW_SIZE)? + .into_array_data() + .into_array(), + (0..self.metadata().n_children) + .map(|i| self.bytes(i)) + .collect::>(), self.dtype().clone(), - self.validity().map(|v| v.slice(start, stop)).transpose()?, + self.validity().slice(start, stop)?, )? .into_array()) } diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 30394fea1a..6ee34c976d 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -1,32 +1,21 @@ -use std::fmt::{Debug, Formatter}; -use std::sync::{Arc, RwLock}; +use std::fmt::Formatter; use std::{mem, slice}; -use linkme::distributed_slice; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; +use ::serde::{Deserialize, Serialize}; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::{IntWidth, Nullability, Signedness}; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::primitive::PrimitiveEncoding; +use crate::array::primitive::PrimitiveArray; use crate::array::varbinview::builder::VarBinViewBuilder; -use crate::array::{Array, ArrayRef}; -use crate::compute::flatten::flatten_primitive; use crate::compute::slice::slice; -use crate::compute::ArrayCompute; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::iterator::ArrayIter; -use crate::serde::{ArraySerde, EncodingSerde}; -use crate::stats::{Stats, StatsSet}; -use crate::validity::OwnedValidity; -use crate::validity::{Validity, ValidityView}; -use crate::view::AsView; -use crate::{impl_array, ArrayWalker}; +use crate::validity::Validity; +use crate::validity::{ArrayValidity, LogicalValidity, ValidityMetadata}; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{impl_encoding, ArrayDType, ArrayFlatten, ToArrayData}; mod accessor; mod builder; mod compute; -mod serde; mod stats; #[derive(Clone, Copy, Debug)] @@ -107,21 +96,20 @@ impl Debug for BinaryView { pub const VIEW_SIZE: usize = mem::size_of::(); -#[derive(Debug, Clone)] -pub struct VarBinViewArray { - views: ArrayRef, - data: Vec, - dtype: DType, - validity: Option, - stats: Arc>, +impl_encoding!("vortex.varbinview", VarBinView); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VarBinViewMetadata { + validity: ValidityMetadata, + n_children: usize, } -impl VarBinViewArray { +impl VarBinViewArray<'_> { pub fn try_new( - views: ArrayRef, - data: Vec, + views: Array, + data: Vec, dtype: DType, - validity: Option, + validity: Validity, ) -> VortexResult { if !matches!( views.dtype(), @@ -143,26 +131,33 @@ impl VarBinViewArray { vortex_bail!(MismatchedTypes: "utf8 or binary", dtype); } - let dtype = if validity.is_some() && !dtype.is_nullable() { - dtype.as_nullable() - } else { - dtype + if dtype.is_nullable() == (validity == Validity::NonNullable) { + vortex_bail!("incorrect validity {:?}", validity); + } + + let metadata = VarBinViewMetadata { + validity: validity.to_metadata(views.len() / VIEW_SIZE)?, + n_children: data.len(), }; - Ok(Self { - views, - data, - dtype, - validity, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) + let mut children = Vec::with_capacity(data.len() + 2); + children.push(views.to_array_data()); + children.extend(data.iter().map(|d| d.to_array_data())); + if let Some(a) = validity.into_array_data() { + children.push(a) + } + + Self::try_from_parts(dtype, metadata, children.into(), HashMap::default()) } fn view_slice(&self) -> &[BinaryView] { unsafe { slice::from_raw_parts( - self.views.as_primitive().typed_data::().as_ptr() as _, - self.views.len() / VIEW_SIZE, + PrimitiveArray::try_from(self.views()) + .expect("Views must be a primitive array") + .typed_data::() + .as_ptr() as _, + self.views().len() / VIEW_SIZE, ) } } @@ -172,13 +167,22 @@ impl VarBinViewArray { } #[inline] - pub fn views(&self) -> &ArrayRef { - &self.views + pub fn views(&self) -> Array { + self.array().child(0, &DType::BYTES).expect("missing views") } #[inline] - pub fn data(&self) -> &[ArrayRef] { - &self.data + pub fn bytes(&self, idx: usize) -> Array { + self.array() + .child(idx + 1, &DType::BYTES) + .expect("Missing data buffer") + } + + pub fn validity(&self) -> Validity { + self.metadata().validity.to_validity( + self.array() + .child(self.metadata().n_children + 1, &Validity::DTYPE), + ) } pub fn from_vec>(vec: Vec, dtype: DType) -> Self { @@ -201,35 +205,17 @@ impl VarBinViewArray { builder.finish(dtype) } - pub fn iter_primitive(&self) -> VortexResult> { - if self - .data() - .iter() - .all(|b| b.encoding().id() == PrimitiveEncoding::ID) - { - Ok(ArrayIter::new(self)) - } else { - Err(vortex_err!("Bytes array was not a primitive array")) - } - } - - pub fn iter(&self) -> ArrayIter<'_, VarBinViewArray, Vec> { - ArrayIter::new(self) - } - pub fn bytes_at(&self, index: usize) -> VortexResult> { let view = self.view_at(index); unsafe { if view.inlined.size > 12 { - let data_buf = flatten_primitive(&slice( - self.data.get(view._ref.buffer_index as usize).unwrap(), + let data_buf = slice( + &self.bytes(view._ref.buffer_index as usize), view._ref.offset as usize, (view._ref.size + view._ref.offset) as usize, - )?)?; - Ok(data_buf - .into_buffer() - .into_vec() - .unwrap_or_else(|buf| buf.to_vec())) + )? + .flatten_primitive()?; + Ok(data_buf.typed_data::().to_vec()) } else { Ok(view.inlined.data[..view.inlined.size as usize].to_vec()) } @@ -237,153 +223,101 @@ impl VarBinViewArray { } } -impl Array for VarBinViewArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.views.len() / std::mem::size_of::() - } - - #[inline] - fn is_empty(&self) -> bool { - self.views.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &VarBinViewEncoding - } - - fn nbytes(&self) -> usize { - self.views.nbytes() + self.data.iter().map(|arr| arr.nbytes()).sum::() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.views())?; - for data in self.data() { - walker.visit_child(data)?; - } - Ok(()) +impl ArrayFlatten for VarBinViewArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + Ok(Flattened::VarBinView(self)) } } -impl OwnedValidity for VarBinViewArray { - fn validity(&self) -> Option { - self.validity.as_view() +impl ArrayValidity for VarBinViewArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) } -} - -#[derive(Debug)] -pub struct VarBinViewEncoding; - -impl VarBinViewEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.varbinview"); -} - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_VARBINVIEW: EncodingRef = &VarBinViewEncoding; -impl Encoding for VarBinViewEncoding { - fn id(&self) -> EncodingId { - Self::ID + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) } +} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl AcceptArrayVisitor for VarBinViewArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("views", &self.views())?; + for i in 0..self.metadata().n_children { + visitor.visit_child(format!("bytes_{i}").as_str(), &self.bytes(i))?; + } + visitor.visit_validity(&self.validity()) } } -impl ArrayDisplay for VarBinViewArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("views", self.views())?; - for (i, d) in self.data().iter().enumerate() { - f.child(&format!("data_{}", i), d.as_ref())?; - } - f.validity(self.validity()) +impl ArrayTrait for VarBinViewArray<'_> { + fn len(&self) -> usize { + self.view_slice().len() } } -impl From> for VarBinViewArray { +impl From> for VarBinViewArray<'_> { fn from(value: Vec<&[u8]>) -> Self { VarBinViewArray::from_vec(value, DType::Binary(Nullability::NonNullable)) } } -impl From>> for VarBinViewArray { +impl From>> for VarBinViewArray<'_> { fn from(value: Vec>) -> Self { VarBinViewArray::from_vec(value, DType::Binary(Nullability::NonNullable)) } } -impl From> for VarBinViewArray { +impl From> for VarBinViewArray<'_> { fn from(value: Vec) -> Self { VarBinViewArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) } } -impl From> for VarBinViewArray { +impl From> for VarBinViewArray<'_> { fn from(value: Vec<&str>) -> Self { VarBinViewArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) } } -impl<'a> FromIterator> for VarBinViewArray { +impl<'a> FromIterator> for VarBinViewArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinViewArray::from_iter(iter, DType::Binary(Nullability::NonNullable)) } } -impl FromIterator>> for VarBinViewArray { +impl FromIterator>> for VarBinViewArray<'_> { fn from_iter>>>(iter: T) -> Self { VarBinViewArray::from_iter(iter, DType::Binary(Nullability::NonNullable)) } } -impl FromIterator> for VarBinViewArray { +impl FromIterator> for VarBinViewArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinViewArray::from_iter(iter, DType::Utf8(Nullability::NonNullable)) } } -impl<'a> FromIterator> for VarBinViewArray { +impl<'a> FromIterator> for VarBinViewArray<'_> { fn from_iter>>(iter: T) -> Self { VarBinViewArray::from_iter(iter, DType::Utf8(Nullability::NonNullable)) } } +impl EncodingCompression for VarBinViewEncoding {} + #[cfg(test)] mod test { use arrow_array::array::StringViewArray as ArrowStringViewArray; use crate::array::varbinview::VarBinViewArray; - use crate::array::Array; use crate::compute::as_arrow::as_arrow; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; use crate::scalar::Scalar; + use crate::{ArrayTrait, IntoArray}; #[test] pub fn varbin_view() { @@ -391,11 +325,11 @@ mod test { VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]); assert_eq!(binary_arr.len(), 2); assert_eq!( - scalar_at(&binary_arr, 0).unwrap(), + scalar_at(binary_arr.array(), 0).unwrap(), Scalar::from("hello world") ); assert_eq!( - scalar_at(&binary_arr, 1).unwrap(), + scalar_at(binary_arr.array(), 1).unwrap(), Scalar::from("hello world this is a long string") ); } @@ -403,7 +337,8 @@ mod test { #[test] pub fn slice_array() { let binary_arr = slice( - &VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]), + &VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]) + .into_array(), 1, 2, ) @@ -419,7 +354,7 @@ mod test { let binary_array = VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]); assert_eq!( - as_arrow(&binary_array) + as_arrow(binary_array.array()) .unwrap() .as_any() .downcast_ref::() diff --git a/vortex-array/src/array/varbinview/serde.rs b/vortex-array/src/array/varbinview/serde.rs deleted file mode 100644 index 40996c45d6..0000000000 --- a/vortex-array/src/array/varbinview/serde.rs +++ /dev/null @@ -1,71 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; -use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use crate::validity::OwnedValidity; - -impl ArraySerde for VarBinViewArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_validity(self.validity())?; - ctx.write(self.views())?; - ctx.write_usize(self.data().len())?; - for d in self.data() { - ctx.write(d.as_ref())?; - } - Ok(()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for VarBinViewEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let validity = ctx.read_validity()?; - let views = ctx.bytes().read()?; - let num_data = ctx.read_usize()?; - let mut data_bufs = Vec::::with_capacity(num_data); - for _ in 0..num_data { - data_bufs.push(ctx.bytes().read()?); - } - Ok( - VarBinViewArray::try_new(views, data_bufs, ctx.schema().clone(), validity) - .unwrap() - .into_array(), - ) - } -} - -#[cfg(test)] -mod test { - use crate::array::downcast::DowncastArrayBuiltin; - use crate::array::varbinview::VarBinViewArray; - use crate::serde::test::roundtrip_array; - - #[test] - fn roundtrip() { - let arr = VarBinViewArray::from(vec!["hello world", "hello world this is a long string"]); - - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!( - arr.views().as_primitive().buffer().typed_data::(), - read_arr - .as_varbinview() - .views() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!( - arr.data()[0].as_primitive().buffer().typed_data::(), - read_arr.as_varbinview().data()[0] - .as_primitive() - .buffer() - .typed_data::() - ); - } -} diff --git a/vortex-array/src/array/varbinview/stats.rs b/vortex-array/src/array/varbinview/stats.rs index b01ac64ded..5d7b8f6976 100644 --- a/vortex-array/src/array/varbinview/stats.rs +++ b/vortex-array/src/array/varbinview/stats.rs @@ -1,23 +1,19 @@ -use std::borrow::Cow; +use std::collections::HashMap; use vortex_error::VortexResult; +use crate::accessor::ArrayAccessor; use crate::array::varbin::compute_stats; use crate::array::varbinview::VarBinViewArray; -use crate::array::Array; -use crate::stats::{Stat, StatsCompute, StatsSet}; +use crate::scalar::Scalar; +use crate::stats::{ArrayStatisticsCompute, Stat}; +use crate::{ArrayDType, ArrayTrait}; -impl StatsCompute for VarBinViewArray { - fn compute(&self, _stat: &Stat) -> VortexResult { +impl ArrayStatisticsCompute for VarBinViewArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { if self.is_empty() { - return Ok(StatsSet::new()); + return Ok(HashMap::new()); } - - Ok(self - .iter_primitive() - .map(|prim_iter| compute_stats(&mut prim_iter.map(|s| s.map(Cow::from)), self.dtype())) - .unwrap_or_else(|_| { - compute_stats(&mut self.iter().map(|s| s.map(Cow::from)), self.dtype()) - })) + self.with_iterator(|iter| compute_stats(iter, self.dtype())) } } diff --git a/vortex-array2/src/arrow/array.rs b/vortex-array/src/arrow/array.rs similarity index 83% rename from vortex-array2/src/arrow/array.rs rename to vortex-array/src/arrow/array.rs index dc3d43d349..28fa0d0b1d 100644 --- a/vortex-array2/src/arrow/array.rs +++ b/vortex-array/src/arrow/array.rs @@ -21,19 +21,21 @@ use arrow_array::{BinaryViewArray, GenericByteViewArray, StringViewArray}; use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::{ArrowNativeType, Buffer, ScalarBuffer}; use arrow_schema::{DataType, TimeUnit}; -use vortex::ptype::NativePType; use vortex_schema::DType; use crate::array::bool::BoolArray; +use crate::array::constant::ConstantArray; +use crate::array::datetime::{LocalDateTime, LocalDateTimeArray}; use crate::array::primitive::PrimitiveArray; use crate::array::r#struct::StructArray; +use crate::array::varbin::VarBinArray; +use crate::array::varbinview::VarBinViewArray; +use crate::arrow::FromArrowArray; +use crate::ptype::NativePType; +use crate::scalar::NullScalar; use crate::stats::{Stat, Statistics}; use crate::validity::Validity; -use crate::{ArrayData, IntoArrayData}; - -pub trait FromArrowArray { - fn from_arrow(array: A, nullable: bool) -> Self; -} +use crate::{ArrayData, IntoArray, IntoArrayData}; impl IntoArrayData for Buffer { fn into_array_data(self) -> ArrayData { @@ -96,9 +98,18 @@ where } match T::DATA_TYPE { - DataType::Timestamp(_time_unit, _tz) => { - todo!("Port from vortex1") - } + DataType::Timestamp(time_unit, tz) => match tz { + // A timestamp with no timezone is the equivalent of an "unknown" timezone. + // Therefore, we must treat it as a LocalDateTime and not an Instant. + None => LocalDateTimeArray::new( + LocalDateTime::new((&time_unit).into()), + arr.into_array(), + ) + .as_composite() + .unwrap() + .into_array_data(), + Some(_tz) => todo!(), + }, DataType::Date32 => todo!(), DataType::Date64 => todo!(), DataType::Time32(_) => todo!(), @@ -110,25 +121,46 @@ where } } -impl FromArrowArray<&GenericByteArray> for ArrayData { - fn from_arrow(_value: &GenericByteArray, nullable: bool) -> Self { - let _dtype = match T::DATA_TYPE { +impl FromArrowArray<&GenericByteArray> for ArrayData +where + ::Offset: NativePType, +{ + fn from_arrow(value: &GenericByteArray, nullable: bool) -> Self { + let dtype = match T::DATA_TYPE { DataType::Binary | DataType::LargeBinary => DType::Binary(nullable.into()), DataType::Utf8 | DataType::LargeUtf8 => DType::Utf8(nullable.into()), _ => panic!("Invalid data type for ByteArray"), }; - todo!("PORT") + VarBinArray::try_new( + value.offsets().clone().into_array_data().into_array(), + value.values().clone().into_array_data().into_array(), + dtype, + nulls(value.nulls(), nullable), + ) + .unwrap() + .into_array_data() } } impl FromArrowArray<&GenericByteViewArray> for ArrayData { - fn from_arrow(_value: &GenericByteViewArray, nullable: bool) -> Self { - let _dtype = match T::DATA_TYPE { + fn from_arrow(value: &GenericByteViewArray, nullable: bool) -> Self { + let dtype = match T::DATA_TYPE { DataType::BinaryView => DType::Binary(nullable.into()), DataType::Utf8View => DType::Utf8(nullable.into()), _ => panic!("Invalid data type for ByteViewArray"), }; - todo!("PORT") + VarBinViewArray::try_new( + value.views().inner().clone().into_array_data().into_array(), + value + .data_buffers() + .iter() + .map(|b| b.clone().into_array_data().into_array()) + .collect::>(), + dtype, + nulls(value.nulls(), nullable), + ) + .unwrap() + .into_array_data() } } @@ -155,7 +187,9 @@ impl FromArrowArray<&ArrowStructArray> for ArrayData { .columns() .iter() .zip(value.fields()) - .map(|(c, field)| ArrayData::from_arrow(c.clone(), field.is_nullable())) + .map(|(c, field)| { + ArrayData::from_arrow(c.clone(), field.is_nullable()).into_array() + }) .collect(), value.len(), ) @@ -165,10 +199,9 @@ impl FromArrowArray<&ArrowStructArray> for ArrayData { } impl FromArrowArray<&ArrowNullArray> for ArrayData { - fn from_arrow(_value: &ArrowNullArray, nullable: bool) -> Self { + fn from_arrow(value: &ArrowNullArray, nullable: bool) -> Self { assert!(nullable); - todo!("PORT") - // ConstantArray::new(NullScalar::new(), value.len()).to_array_data() + ConstantArray::new(NullScalar::new(), value.len()).into_array_data() } } diff --git a/vortex-array/src/arrow/dtypes.rs b/vortex-array/src/arrow/dtypes.rs index fd9627628d..efad3c598e 100644 --- a/vortex-array/src/arrow/dtypes.rs +++ b/vortex-array/src/arrow/dtypes.rs @@ -6,8 +6,8 @@ use itertools::Itertools; use vortex_error::{vortex_err, VortexError, VortexResult}; use vortex_schema::{DType, FloatWidth, IntWidth, Nullability}; +use crate::array::datetime::{LocalDateTimeExtension, TimeUnit}; use crate::arrow::FromArrowType; -use crate::datetime::{LocalDateTimeExtension, TimeUnit}; use crate::ptype::PType; impl TryFrom<&DataType> for PType { diff --git a/vortex-array/src/arrow/mod.rs b/vortex-array/src/arrow/mod.rs index 0931622c66..9b5eaaa2a0 100644 --- a/vortex-array/src/arrow/mod.rs +++ b/vortex-array/src/arrow/mod.rs @@ -1,7 +1,12 @@ -pub mod dtypes; +mod array; +mod dtypes; mod recordbatch; pub mod wrappers; +pub trait FromArrowArray { + fn from_arrow(array: A, nullable: bool) -> Self; +} + pub trait FromArrowType: Sized { fn from_arrow(value: T) -> Self; } diff --git a/vortex-array/src/arrow/recordbatch.rs b/vortex-array/src/arrow/recordbatch.rs index fd103dc9de..2f9e61968a 100644 --- a/vortex-array/src/arrow/recordbatch.rs +++ b/vortex-array/src/arrow/recordbatch.rs @@ -2,13 +2,13 @@ use std::sync::Arc; use arrow_array::RecordBatch; -use crate::array::struct_::StructArray; -use crate::array::{Array, ArrayRef, IntoArray}; -use crate::encode::FromArrowArray; +use crate::array::r#struct::StructArray; +use crate::arrow::FromArrowArray; +use crate::{ArrayData, IntoArray, IntoArrayData, ToArrayData}; -impl IntoArray for &RecordBatch { - fn into_array(self) -> ArrayRef { - StructArray::new( +impl ToArrayData for RecordBatch { + fn to_array_data(&self) -> ArrayData { + StructArray::try_new( self.schema() .fields() .iter() @@ -19,10 +19,13 @@ impl IntoArray for &RecordBatch { self.columns() .iter() .zip(self.schema().fields()) - .map(|(array, field)| ArrayRef::from_arrow(array.clone(), field.is_nullable())) + .map(|(array, field)| { + ArrayData::from_arrow(array.clone(), field.is_nullable()).into_array() + }) .collect(), self.num_rows(), ) - .into_array() + .unwrap() + .into_array_data() } } diff --git a/vortex-array/src/arrow/wrappers.rs b/vortex-array/src/arrow/wrappers.rs index b7cd52056b..3621eb810a 100644 --- a/vortex-array/src/arrow/wrappers.rs +++ b/vortex-array/src/arrow/wrappers.rs @@ -1,26 +1,13 @@ -use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use vortex_error::VortexResult; +use arrow_buffer::{Buffer as ArrowBuffer, OffsetBuffer, ScalarBuffer}; use crate::array::primitive::PrimitiveArray; use crate::ptype::NativePType; -use crate::validity::Validity; -use crate::view::AsView; -pub fn as_scalar_buffer(array: PrimitiveArray) -> ScalarBuffer { +pub fn as_scalar_buffer(array: PrimitiveArray<'_>) -> ScalarBuffer { assert_eq!(array.ptype(), T::PTYPE); - ScalarBuffer::from(array.buffer().clone()) + ScalarBuffer::from(ArrowBuffer::from(array.buffer())) } -pub fn as_offset_buffer(array: PrimitiveArray) -> OffsetBuffer { +pub fn as_offset_buffer(array: PrimitiveArray<'_>) -> OffsetBuffer { OffsetBuffer::new(as_scalar_buffer(array)) } - -pub fn as_nulls(validity: Validity) -> VortexResult> { - match validity { - Validity::Valid(_) => Ok(None), - Validity::Invalid(_) => Ok(Some(NullBuffer::new_null(validity.as_view().len()))), - Validity::Array(_) => Ok(Some(NullBuffer::new( - validity.to_bool_array().buffer().clone(), - ))), - } -} diff --git a/vortex-array2/src/buffer.rs b/vortex-array/src/buffer.rs similarity index 92% rename from vortex-array2/src/buffer.rs rename to vortex-array/src/buffer.rs index 8be566938b..ceb733f00e 100644 --- a/vortex-array2/src/buffer.rs +++ b/vortex-array/src/buffer.rs @@ -1,6 +1,6 @@ use arrow_buffer::Buffer as ArrowBuffer; -use vortex::ptype::NativePType; +use crate::ptype::NativePType; use crate::ToStatic; #[derive(Debug, Clone)] @@ -87,3 +87,11 @@ impl From<&Buffer<'_>> for ArrowBuffer { } } } + +impl PartialEq for Buffer<'_> { + fn eq(&self, other: &Self) -> bool { + self.as_slice().eq(other.as_slice()) + } +} + +impl Eq for Buffer<'_> {} diff --git a/vortex-array/src/compress.rs b/vortex-array/src/compress.rs index 105a6811a1..4b29d8fce1 100644 --- a/vortex-array/src/compress.rs +++ b/vortex-array/src/compress.rs @@ -3,46 +3,47 @@ use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; use log::{debug, info, warn}; -use vortex_error::VortexResult; +use vortex_error::{vortex_bail, VortexResult}; -use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; +use crate::array::chunked::{Chunked, ChunkedArray, ChunkedEncoding}; use crate::array::composite::CompositeEncoding; -use crate::array::constant::ConstantArray; +use crate::array::constant::{Constant, ConstantArray}; +use crate::array::r#struct::{Struct, StructArray, StructEncoding}; use crate::array::sparse::SparseEncoding; -use crate::array::struct_::{StructArray, StructEncoding}; use crate::array::varbin::VarBinEncoding; -use crate::array::{Array, ArrayKind, ArrayRef}; -use crate::compute; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; -use crate::encoding::{Encoding, EncodingRef, ENCODINGS}; -use crate::formatter::display_tree; +use crate::encoding::{ArrayEncoding, EncodingRef, VORTEX_ENCODINGS}; use crate::sampling::stratified_slices; use crate::stats::Stat; -use crate::validity::{Validity, ValidityView}; -use crate::view::ToOwnedView; +use crate::validity::Validity; +use crate::{compute, Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray, ToStatic}; -pub trait EncodingCompression: Encoding { +pub trait EncodingCompression: ArrayEncoding { fn cost(&self) -> u8 { 1 } fn can_compress( &self, - array: &dyn Array, - config: &CompressConfig, - ) -> Option<&dyn EncodingCompression>; + _array: &Array, + _config: &CompressConfig, + ) -> Option<&dyn EncodingCompression> { + None + } fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, - ctx: CompressCtx, - ) -> VortexResult; + _array: &Array, + _like: Option<&Array>, + _ctx: CompressCtx, + ) -> VortexResult { + vortex_bail!(NotImplemented: "compress not implemented for {}", self.id().name()) + } // For an array returned by this encoding, give the size in bytes minus any constant overheads. - fn compressed_nbytes(&self, array: &dyn Array) -> usize { - array.nbytes() + fn compressed_nbytes(&self, array: &Array) -> usize { + array.with_dyn(|a| a.nbytes()) } } @@ -167,9 +168,9 @@ impl CompressCtx { // We don't take a reference to self to force the caller to think about whether to use // an auxilliary ctx. - pub fn compress(&self, arr: &dyn Array, like: Option<&ArrayRef>) -> VortexResult { + pub fn compress(&self, arr: &Array, like: Option<&Array>) -> VortexResult { if arr.is_empty() { - return Ok(arr.to_array()); + return Ok(arr.to_static()); } // Attempt to compress using the "like" array, otherwise fall back to sampled compression @@ -177,7 +178,8 @@ impl CompressCtx { if let Some(compressed) = l .encoding() .compression() - .map(|c| c.compress(arr, Some(l), self.for_encoding(c))) + .can_compress(arr, self.options().as_ref()) + .map(|c| c.compress(arr, Some(l), self.for_encoding(l.encoding().compression()))) { let compressed = compressed?; if compressed.dtype() != arr.dtype() { @@ -185,7 +187,7 @@ impl CompressCtx { "Compression changed dtype: {:?} -> {:?} for {}", arr.dtype(), compressed.dtype(), - display_tree(&compressed), + compressed.tree_display(), ); } return Ok(compressed); @@ -204,60 +206,53 @@ impl CompressCtx { "Compression changed dtype: {:?} -> {:?} for {}", arr.dtype(), compressed.dtype(), - display_tree(&compressed), + compressed.tree_display(), ); } Ok(compressed) } - // TODO(ngates): implement a compressor for validity #197 - pub fn compress_validity( - &self, - validity: Option, - ) -> VortexResult> { - if let Some(validity) = validity { - match validity { - ValidityView::Valid(_) | ValidityView::Invalid(_) => { - Ok(Some(validity.to_owned_view())) - } - ValidityView::Array(a) => Ok(Some(Validity::array(self.compress(a, None)?)?)), - } - } else { - Ok(None) + pub fn compress_validity<'a>(&self, validity: Validity<'a>) -> VortexResult> { + match validity { + Validity::Array(a) => Ok(Validity::Array(self.compress(&a, None)?)), + a => Ok(a), } } - fn compress_array(&self, arr: &dyn Array) -> VortexResult { - match ArrayKind::from(arr) { - ArrayKind::Chunked(chunked) => { + fn compress_array(&self, arr: &Array) -> VortexResult { + match arr.encoding().id() { + Chunked::ID => { // For chunked arrays, we compress each chunk individually - let compressed_chunks: VortexResult> = chunked + let chunked = ChunkedArray::try_from(arr)?; + let compressed_chunks: VortexResult> = chunked .chunks() - .iter() - .map(|chunk| self.compress_array(chunk)) + .map(|chunk| self.compress_array(&chunk)) .collect(); - Ok(ChunkedArray::new(compressed_chunks?, chunked.dtype().clone()).into_array()) + Ok( + ChunkedArray::try_new(compressed_chunks?, chunked.dtype().clone())? + .into_array(), + ) } - ArrayKind::Constant(constant) => { + Constant::ID => { // Not much better we can do than constant! - Ok(constant.clone().into_array()) + Ok(arr.to_static()) } - ArrayKind::Struct(strct) => { + Struct::ID => { // For struct arrays, we compress each field individually - let compressed_fields: VortexResult> = strct - .fields() - .iter() - .map(|field| self.compress_array(field)) - .collect(); + let strct = StructArray::try_from(arr)?; + let compressed_fields = strct + .children() + .map(|field| self.compress_array(&field)) + .collect::>>()?; Ok( - StructArray::new(strct.names().clone(), compressed_fields?, strct.len()) + StructArray::try_new(strct.names().clone(), compressed_fields, strct.len())? .into_array(), ) } _ => { // Otherwise, we run sampled compression over pluggable encodings let sampled = sampled_compression(arr, self)?; - Ok(sampled.unwrap_or_else(|| arr.to_array())) + Ok(sampled.unwrap_or_else(|| arr.to_static())) } } } @@ -269,24 +264,25 @@ impl Default for CompressCtx { } } -pub fn sampled_compression(array: &dyn Array, ctx: &CompressCtx) -> VortexResult> { +pub fn sampled_compression(array: &Array, ctx: &CompressCtx) -> VortexResult> { // First, we try constant compression and shortcut any sampling. if !array.is_empty() - && array - .stats() - .get_or_compute_as::(&Stat::IsConstant) - .unwrap_or(false) + && array.with_dyn(|a| { + a.statistics() + .compute_as::(Stat::IsConstant) + .unwrap_or(false) + }) { return Ok(Some( ConstantArray::new(scalar_at(array, 0)?, array.len()).into_array(), )); } - let mut candidates: Vec<&dyn EncodingCompression> = ENCODINGS + let mut candidates: Vec<&dyn EncodingCompression> = VORTEX_ENCODINGS .iter() .filter(|&encoding| ctx.options().is_enabled(*encoding)) .filter(|&encoding| !ctx.disabled_encodings.contains(encoding)) - .filter_map(|encoding| encoding.compression()) + .map(|encoding| encoding.compression()) .filter(|compression| { if compression .can_compress(array, ctx.options().as_ref()) @@ -357,9 +353,9 @@ pub fn sampled_compression(array: &dyn Array, ctx: &CompressCtx) -> VortexResult fn find_best_compression<'a>( candidates: Vec<&'a dyn EncodingCompression>, - sample: &dyn Array, + sample: &Array, ctx: &CompressCtx, -) -> VortexResult> { +) -> VortexResult> { let mut best = None; let mut best_ratio = 1.0; for compression in candidates { @@ -377,8 +373,8 @@ fn find_best_compression<'a>( } let compressed_sample = compression.compress(sample, None, ctx.for_encoding(compression))?; - let compressed_size = compression.compressed_nbytes(compressed_sample.as_ref()); - let ratio = compressed_size as f32 / sample.nbytes() as f32; + let compressed_size = compression.compressed_nbytes(&compressed_sample); + let ratio = compressed_size as f32 / sample.with_dyn(|a| a.nbytes()) as f32; debug!("{} ratio for {}: {}", ctx, compression.id(), ratio); if ratio < best_ratio { best_ratio = ratio; diff --git a/vortex-array/src/compute/add.rs b/vortex-array/src/compute/add.rs deleted file mode 100644 index 3ef8be42ae..0000000000 --- a/vortex-array/src/compute/add.rs +++ /dev/null @@ -1,51 +0,0 @@ -use vortex_error::{vortex_bail, VortexResult}; - -use crate::array::constant::ConstantArray; -use crate::array::{Array, ArrayKind, ArrayRef}; -use crate::scalar::Scalar; - -// TODO(ngates): convert this to arithmetic operations with macro over the kernel. -pub fn add(lhs: &dyn Array, rhs: &dyn Array) -> VortexResult { - // Check that the arrays are the same length. - let length = lhs.len(); - if rhs.len() != length { - vortex_bail!("Arrays have different lengths"); - } - - match (ArrayKind::from(lhs), ArrayKind::from(rhs)) { - (ArrayKind::Constant(lhs), ArrayKind::Constant(rhs)) => { - Ok(ConstantArray::new(add_scalars(lhs.scalar(), rhs.scalar())?, length).into_array()) - } - (ArrayKind::Constant(lhs), _) => add_scalar(rhs, lhs.scalar()), - (_, ArrayKind::Constant(rhs)) => add_scalar(lhs, rhs.scalar()), - _ => todo!("Implement default addition"), - } -} - -pub fn add_scalar(lhs: &dyn Array, rhs: &Scalar) -> VortexResult { - match ArrayKind::from(lhs) { - ArrayKind::Constant(lhs) => { - Ok(ConstantArray::new(add_scalars(lhs.scalar(), rhs)?, lhs.len()).into_array()) - } - _ => todo!("Implement default addition"), - } -} - -pub fn add_scalars(_lhs: &Scalar, _rhs: &Scalar) -> VortexResult { - // Might need to improve this implementation... - Ok(24.into()) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_add() { - let lhs = ConstantArray::new(47, 100); - let rhs = ConstantArray::new(47, 100); - let result = add(&lhs, &rhs).unwrap(); - assert_eq!(result.len(), 100); - // assert_eq!(scalar_at(result, 0), 94); - } -} diff --git a/vortex-array/src/compute/as_arrow.rs b/vortex-array/src/compute/as_arrow.rs index f3b0835372..59043d055c 100644 --- a/vortex-array/src/compute/as_arrow.rs +++ b/vortex-array/src/compute/as_arrow.rs @@ -1,38 +1,35 @@ use arrow_array::ArrayRef as ArrowArrayRef; -use itertools::Itertools; use vortex_error::{vortex_err, VortexResult}; -use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::{Array, WithArrayCompute}; -use crate::compute::flatten::flatten; +use crate::array::chunked::ChunkedArray; +use crate::{Array, IntoArray}; pub trait AsArrowArray { fn as_arrow(&self) -> VortexResult; } -pub fn as_arrow(array: &dyn Array) -> VortexResult { - array.with_compute(|c| { +pub fn as_arrow(array: &Array) -> VortexResult { + array.with_dyn(|a| { // If as_arrow is implemented, then invoke that. - if let Some(a) = c.as_arrow() { + if let Some(a) = a.as_arrow() { return a.as_arrow(); } // Otherwise, flatten and try again. - let array = flatten(array)?.into_array(); - c.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { + let array = array.clone().flatten()?.into_array(); + a.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { Err(vortex_err!(NotImplemented: "as_arrow", array.encoding().id().name())) }) }) } // TODO(ngates): return a RecordBatchReader instead? -pub fn as_arrow_chunks(array: &dyn Array) -> VortexResult> { - if let Some(chunked) = array.maybe_chunked() { +pub fn as_arrow_chunks(array: &Array) -> VortexResult> { + if let Ok(chunked) = ChunkedArray::try_from(array) { chunked .chunks() - .iter() - .map(|a| as_arrow(a.as_ref())) - .try_collect() + .map(|a| as_arrow(&a)) + .collect::>>() } else { as_arrow(array).map(|a| vec![a]) } diff --git a/vortex-array/src/compute/as_contiguous.rs b/vortex-array/src/compute/as_contiguous.rs index 773ecf93d0..2b6af58d6b 100644 --- a/vortex-array/src/compute/as_contiguous.rs +++ b/vortex-array/src/compute/as_contiguous.rs @@ -1,13 +1,13 @@ use itertools::Itertools; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef, WithArrayCompute}; +use crate::{Array, ArrayDType, OwnedArray}; pub trait AsContiguousFn { - fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult; + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult; } -pub fn as_contiguous(arrays: &[ArrayRef]) -> VortexResult { +pub fn as_contiguous(arrays: &[Array]) -> VortexResult { if arrays.is_empty() { vortex_bail!(ComputeError: "No arrays to concatenate"); } @@ -16,10 +16,15 @@ pub fn as_contiguous(arrays: &[ArrayRef]) -> VortexResult { "Chunks have differing encodings", ); } + if !arrays.iter().map(|chunk| chunk.dtype()).all_equal() { + vortex_bail!(ComputeError: + "Chunks have differing dtypes", + ); + } let first = arrays.first().unwrap(); - first.with_compute(|c| { - c.as_contiguous() + first.with_dyn(|a| { + a.as_contiguous() .map(|f| f.as_contiguous(arrays)) .unwrap_or_else(|| { Err(vortex_err!( diff --git a/vortex-array/src/compute/cast.rs b/vortex-array/src/compute/cast.rs index f254307f7f..6a7d538260 100644 --- a/vortex-array/src/compute/cast.rs +++ b/vortex-array/src/compute/cast.rs @@ -1,20 +1,20 @@ use vortex_error::{vortex_err, VortexResult}; use vortex_schema::DType; -use crate::array::{Array, ArrayRef, WithArrayCompute}; +use crate::{Array, ArrayDType, OwnedArray, ToStatic}; pub trait CastFn { - fn cast(&self, dtype: &DType) -> VortexResult; + fn cast(&self, dtype: &DType) -> VortexResult; } -pub fn cast(array: &dyn Array, dtype: &DType) -> VortexResult { +pub fn cast(array: &Array, dtype: &DType) -> VortexResult { if array.dtype() == dtype { - return Ok(array.to_array()); + return Ok(array.to_static()); } // TODO(ngates): check for null_count if dtype is non-nullable - array.with_compute(|c| { - c.cast().map(|f| f.cast(dtype)).unwrap_or_else(|| { + array.with_dyn(|a| { + a.cast().map(|f| f.cast(dtype)).unwrap_or_else(|| { Err(vortex_err!(NotImplemented: "cast", array.encoding().id().name())) }) }) diff --git a/vortex-array/src/compute/fill.rs b/vortex-array/src/compute/fill.rs index aa2a8a9083..4eae877b97 100644 --- a/vortex-array/src/compute/fill.rs +++ b/vortex-array/src/compute/fill.rs @@ -1,18 +1,18 @@ use vortex_error::{vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef, WithArrayCompute}; +use crate::{Array, ArrayDType, OwnedArray, ToStatic}; pub trait FillForwardFn { - fn fill_forward(&self) -> VortexResult; + fn fill_forward(&self) -> VortexResult; } -pub fn fill_forward(array: &dyn Array) -> VortexResult { +pub fn fill_forward(array: &Array) -> VortexResult { if !array.dtype().is_nullable() { - return Ok(array.to_array()); + return Ok(array.to_static()); } - array.with_compute(|c| { - c.fill_forward() + array.with_dyn(|a| { + a.fill_forward() .map(|t| t.fill_forward()) .unwrap_or_else(|| { Err(vortex_err!( diff --git a/vortex-array/src/compute/flatten.rs b/vortex-array/src/compute/flatten.rs deleted file mode 100644 index 446eedb684..0000000000 --- a/vortex-array/src/compute/flatten.rs +++ /dev/null @@ -1,81 +0,0 @@ -use vortex_error::{vortex_err, VortexResult}; - -use crate::array::bool::BoolArray; -use crate::array::chunked::ChunkedArray; -use crate::array::composite::CompositeArray; -use crate::array::primitive::PrimitiveArray; -use crate::array::struct_::StructArray; -use crate::array::varbin::VarBinArray; -use crate::array::varbinview::VarBinViewArray; -use crate::array::{Array, ArrayRef, WithArrayCompute}; - -pub trait FlattenFn { - fn flatten(&self) -> VortexResult; -} - -/// The set of encodings that can be converted to Arrow with zero-copy. -pub enum FlattenedArray { - Bool(BoolArray), - Chunked(ChunkedArray), - Composite(CompositeArray), - Primitive(PrimitiveArray), - Struct(StructArray), - VarBin(VarBinArray), - VarBinView(VarBinViewArray), -} - -impl FlattenedArray { - pub fn into_array(self) -> ArrayRef { - match self { - FlattenedArray::Bool(array) => array.into_array(), - FlattenedArray::Chunked(array) => array.into_array(), - FlattenedArray::Composite(array) => array.into_array(), - FlattenedArray::Primitive(array) => array.into_array(), - FlattenedArray::Struct(array) => array.into_array(), - FlattenedArray::VarBin(array) => array.into_array(), - FlattenedArray::VarBinView(array) => array.into_array(), - } - } -} - -/// Flatten an array into one of the flat encodings. -/// This does not guarantee that the array is recursively flattened. -pub fn flatten(array: &dyn Array) -> VortexResult { - array.with_compute(|c| { - c.flatten().map(|f| f.flatten()).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "flatten", array.encoding().id().name())) - }) - }) -} - -pub fn flatten_varbin(array: &dyn Array) -> VortexResult { - if let FlattenedArray::VarBin(vb) = flatten(array)? { - Ok(vb) - } else { - Err(vortex_err!("Cannot flatten array {} into varbin", array)) - } -} - -pub fn flatten_bool(array: &dyn Array) -> VortexResult { - if let FlattenedArray::Bool(b) = flatten(array)? { - Ok(b) - } else { - Err(vortex_err!("Cannot flatten array {} into bool", array)) - } -} - -pub fn flatten_primitive(array: &dyn Array) -> VortexResult { - if let FlattenedArray::Primitive(p) = flatten(array)? { - Ok(p) - } else { - Err(vortex_err!("Cannot flatten array {} into primitive", array)) - } -} - -pub fn flatten_struct(array: &dyn Array) -> VortexResult { - if let FlattenedArray::Struct(s) = flatten(array)? { - Ok(s) - } else { - Err(vortex_err!("Cannot flatten array {} into struct", array)) - } -} diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index 7b88bc239c..5ba54a0bda 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -2,21 +2,17 @@ use as_arrow::AsArrowArray; use as_contiguous::AsContiguousFn; use cast::CastFn; use fill::FillForwardFn; -use flatten::*; use patch::PatchFn; use scalar_at::ScalarAtFn; use search_sorted::SearchSortedFn; use slice::SliceFn; use take::TakeFn; -pub mod add; pub mod as_arrow; pub mod as_contiguous; pub mod cast; pub mod fill; -pub mod flatten; pub mod patch; -pub mod repeat; pub mod scalar_at; pub mod search_sorted; pub mod slice; @@ -35,10 +31,6 @@ pub trait ArrayCompute { None } - fn flatten(&self) -> Option<&dyn FlattenFn> { - None - } - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { None } diff --git a/vortex-array/src/compute/patch.rs b/vortex-array/src/compute/patch.rs index 8a98e5ea77..7a02649c53 100644 --- a/vortex-array/src/compute/patch.rs +++ b/vortex-array/src/compute/patch.rs @@ -1,13 +1,13 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef, WithArrayCompute}; +use crate::{Array, ArrayDType, OwnedArray}; pub trait PatchFn { - fn patch(&self, patch: &dyn Array) -> VortexResult; + fn patch(&self, patch: &Array) -> VortexResult; } /// Returns a new array where the non-null values from the patch array are replaced in the original. -pub fn patch(array: &dyn Array, patch: &dyn Array) -> VortexResult { +pub fn patch(array: &Array, patch: &Array) -> VortexResult { if array.len() != patch.len() { vortex_bail!( "patch array {} must have the same length as the original array {}", @@ -20,8 +20,8 @@ pub fn patch(array: &dyn Array, patch: &dyn Array) -> VortexResult { vortex_bail!(MismatchedTypes: array.dtype(), patch.dtype()); } - array.with_compute(|c| { - c.patch().map(|t| t.patch(patch)).unwrap_or_else(|| { + array.with_dyn(|a| { + a.patch().map(|t| t.patch(patch)).unwrap_or_else(|| { Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) }) }) diff --git a/vortex-array/src/compute/repeat.rs b/vortex-array/src/compute/repeat.rs deleted file mode 100644 index c82caf89f2..0000000000 --- a/vortex-array/src/compute/repeat.rs +++ /dev/null @@ -1,19 +0,0 @@ -use crate::array::constant::ConstantArray; -use crate::array::{Array, ArrayRef}; -use crate::scalar::Scalar; - -pub fn repeat(scalar: &Scalar, n: usize) -> ArrayRef { - ConstantArray::new(scalar.clone(), n).into_array() -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_repeat() { - let scalar: Scalar = 47.into(); - let array = repeat(&scalar, 100); - assert_eq!(array.len(), 100); - } -} diff --git a/vortex-array/src/compute/scalar_at.rs b/vortex-array/src/compute/scalar_at.rs index ee4855b50f..86787aff59 100644 --- a/vortex-array/src/compute/scalar_at.rs +++ b/vortex-array/src/compute/scalar_at.rs @@ -1,19 +1,19 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::{Array, WithArrayCompute}; use crate::scalar::Scalar; +use crate::Array; pub trait ScalarAtFn { fn scalar_at(&self, index: usize) -> VortexResult; } -pub fn scalar_at(array: &dyn Array, index: usize) -> VortexResult { +pub fn scalar_at(array: &Array, index: usize) -> VortexResult { if index >= array.len() { vortex_bail!(OutOfBounds: index, 0, array.len()); } - array.with_compute(|c| { - c.scalar_at() + array.with_dyn(|a| { + a.scalar_at() .map(|t| t.scalar_at(index)) .unwrap_or_else(|| { Err(vortex_err!(NotImplemented: "scalar_at", array.encoding().id().name())) diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 469452aa54..5f73e778a0 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -3,9 +3,9 @@ use std::cmp::Ordering::{Equal, Greater, Less}; use vortex_error::{vortex_err, VortexResult}; -use crate::array::{Array, WithArrayCompute}; use crate::compute::scalar_at::scalar_at; use crate::scalar::Scalar; +use crate::{Array, ArrayDType}; #[derive(Debug, Copy, Clone)] pub enum SearchSortedSide { @@ -40,18 +40,18 @@ pub trait SearchSortedFn { } pub fn search_sorted>( - array: &dyn Array, + array: &Array, target: T, side: SearchSortedSide, ) -> VortexResult { let scalar = target.into().cast(array.dtype())?; - array.with_compute(|c| { - if let Some(search_sorted) = c.search_sorted() { + array.with_dyn(|a| { + if let Some(search_sorted) = a.search_sorted() { return search_sorted.search_sorted(&scalar, side); } - if c.scalar_at().is_some() { - return Ok(SearchSorted::search_sorted(&array, &scalar, side)); + if a.scalar_at().is_some() { + return Ok(SearchSorted::search_sorted(array, &scalar, side)); } Err(vortex_err!( @@ -180,9 +180,9 @@ fn search_sorted_side_idx Ordering>( SearchResult::NotFound(left) } -impl IndexOrd for &dyn Array { +impl IndexOrd for Array<'_> { fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { - let scalar_a = scalar_at(*self, idx).ok()?; + let scalar_a = scalar_at(self, idx).ok()?; scalar_a.partial_cmp(elem) } } @@ -194,9 +194,9 @@ impl IndexOrd for [T] { } } -impl Len for &dyn Array { +impl Len for Array<'_> { fn len(&self) -> usize { - Array::len(*self) + Array::len(self) } } diff --git a/vortex-array/src/compute/slice.rs b/vortex-array/src/compute/slice.rs index 0837f3f924..e8358e8b23 100644 --- a/vortex-array/src/compute/slice.rs +++ b/vortex-array/src/compute/slice.rs @@ -1,16 +1,16 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef, WithArrayCompute}; +use crate::{Array, OwnedArray}; /// Limit array to start..stop range pub trait SliceFn { - fn slice(&self, start: usize, stop: usize) -> VortexResult; + fn slice(&self, start: usize, stop: usize) -> VortexResult; } -pub fn slice(array: &dyn Array, start: usize, stop: usize) -> VortexResult { +pub fn slice(array: &Array, start: usize, stop: usize) -> VortexResult { check_slice_bounds(array, start, stop)?; - array.with_compute(|c| { + array.with_dyn(|c| { c.slice().map(|t| t.slice(start, stop)).unwrap_or_else(|| { Err(vortex_err!( NotImplemented: "slice", @@ -20,7 +20,7 @@ pub fn slice(array: &dyn Array, start: usize, stop: usize) -> VortexResult VortexResult<()> { +fn check_slice_bounds(array: &Array, start: usize, stop: usize) -> VortexResult<()> { if start > array.len() { vortex_bail!(OutOfBounds: start, 0, array.len()); } diff --git a/vortex-array/src/compute/take.rs b/vortex-array/src/compute/take.rs index 0eae1ead8f..adc519870c 100644 --- a/vortex-array/src/compute/take.rs +++ b/vortex-array/src/compute/take.rs @@ -1,23 +1,22 @@ use log::info; use vortex_error::{vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef, WithArrayCompute}; -use crate::compute::flatten::flatten; +use crate::{Array, IntoArray, OwnedArray}; pub trait TakeFn { - fn take(&self, indices: &dyn Array) -> VortexResult; + fn take(&self, indices: &Array) -> VortexResult; } -pub fn take(array: &dyn Array, indices: &dyn Array) -> VortexResult { - array.with_compute(|c| { - if let Some(take) = c.take() { +pub fn take(array: &Array, indices: &Array) -> VortexResult { + array.with_dyn(|a| { + if let Some(take) = a.take() { return take.take(indices); } // Otherwise, flatten and try again. info!("TakeFn not implemented for {}, flattening", array); - flatten(array)?.into_array().with_compute(|c| { - c.take().map(|t| t.take(indices)).unwrap_or_else(|| { + array.clone().flatten()?.into_array().with_dyn(|a| { + a.take().map(|t| t.take(indices)).unwrap_or_else(|| { Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) }) }) diff --git a/vortex-array2/src/context.rs b/vortex-array/src/context.rs similarity index 96% rename from vortex-array2/src/context.rs rename to vortex-array/src/context.rs index 9a0641aea0..57482b01f9 100644 --- a/vortex-array2/src/context.rs +++ b/vortex-array/src/context.rs @@ -1,7 +1,6 @@ use std::sync::Arc; -use vortex::encoding::EncodingId; - +use crate::encoding::EncodingId; use crate::encoding::{EncodingRef, VORTEX_ENCODINGS}; /// TODO(ngates): I'm not too sure about this construct. Where it should live, or what scope it diff --git a/vortex-array2/src/data.rs b/vortex-array/src/data.rs similarity index 85% rename from vortex-array2/src/data.rs rename to vortex-array/src/data.rs index 8393d92ff1..5e6e791500 100644 --- a/vortex-array2/src/data.rs +++ b/vortex-array/src/data.rs @@ -1,22 +1,22 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use vortex::scalar::Scalar; use vortex_error::VortexResult; use vortex_schema::DType; use crate::buffer::{Buffer, OwnedBuffer}; use crate::encoding::EncodingRef; +use crate::scalar::Scalar; use crate::stats::Stat; use crate::stats::Statistics; -use crate::{Array, ArrayMetadata, IntoArray, ToArray}; +use crate::{Array, ArrayMetadata, IntoArray, OwnedArray, ToArray}; #[derive(Clone, Debug)] pub struct ArrayData { encoding: EncodingRef, - dtype: DType, + dtype: DType, // FIXME(ngates): Arc? metadata: Arc, - buffers: Arc<[OwnedBuffer]>, // Should this just be an Option, not an Arc? How many multi-buffer arrays are there? + buffer: Option, children: Arc<[ArrayData]>, stats_map: Arc>>, } @@ -26,7 +26,7 @@ impl ArrayData { encoding: EncodingRef, dtype: DType, metadata: Arc, - buffers: Arc<[OwnedBuffer]>, + buffer: Option, children: Arc<[ArrayData]>, statistics: HashMap, ) -> VortexResult { @@ -34,7 +34,7 @@ impl ArrayData { encoding, dtype, metadata, - buffers, + buffer, children, stats_map: Arc::new(RwLock::new(statistics)), }; @@ -59,8 +59,12 @@ impl ArrayData { &self.metadata } - pub fn buffers(&self) -> &[Buffer] { - &self.buffers + pub fn buffer(&self) -> Option<&Buffer> { + self.buffer.as_ref() + } + + pub fn into_buffer(self) -> Option { + self.buffer } pub fn child(&self, index: usize, dtype: &DType) -> Option<&ArrayData> { @@ -88,11 +92,11 @@ impl ArrayData { /// Return the buffer offsets and the total length of all buffers, assuming the given alignment. /// This includes all child buffers. pub fn all_buffer_offsets(&self, alignment: usize) -> Vec { - let mut offsets = Vec::with_capacity(self.buffers.len() + 1); + let mut offsets = vec![]; let mut offset = 0; for col_data in self.depth_first_traversal() { - for buffer in col_data.buffers() { + if let Some(buffer) = col_data.buffer() { offsets.push(offset as u64); let buffer_size = buffer.len(); @@ -106,7 +110,7 @@ impl ArrayData { } } -/// A depth-first iterator over a ArrayData. +/// A depth-first pre-order iterator over a ArrayData. pub struct ArrayDataIterator<'a> { stack: Vec<&'a ArrayData>, } @@ -116,7 +120,7 @@ impl<'a> Iterator for ArrayDataIterator<'a> { fn next(&mut self) -> Option { let next = self.stack.pop()?; - for child in next.children.as_ref().iter() { + for child in next.children.as_ref().iter().rev() { self.stack.push(child); } Some(next) @@ -125,12 +129,12 @@ impl<'a> Iterator for ArrayDataIterator<'a> { impl ToArray for ArrayData { fn to_array(&self) -> Array { - Array::DataRef(self) + Array::Data(self.clone()) } } impl IntoArray<'static> for ArrayData { - fn into_array(self) -> Array<'static> { + fn into_array(self) -> OwnedArray { Array::Data(self) } } diff --git a/vortex-array/src/datetime/localdatetime.rs b/vortex-array/src/datetime/localdatetime.rs deleted file mode 100644 index 0ec00aac8f..0000000000 --- a/vortex-array/src/datetime/localdatetime.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::fmt::{Display, Formatter}; -use std::sync::Arc; - -use arrow_array::{ - ArrayRef as ArrowArrayRef, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, -}; -use vortex_error::VortexResult; -use vortex_schema::CompositeID; - -use crate::array::composite::{composite_impl, TypedCompositeArray}; -use crate::arrow::wrappers::as_nulls; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::cast::cast; -use crate::compute::flatten::flatten_primitive; -use crate::datetime::TimeUnit; -use crate::ptype::PType; -use crate::serde::BytesSerde; -use crate::validity::ArrayValidity; - -#[derive(Debug, Clone)] -pub struct LocalDateTime { - time_unit: TimeUnit, -} - -composite_impl!("vortex.localdatetime", LocalDateTime); - -impl LocalDateTime { - pub fn new(time_unit: TimeUnit) -> Self { - Self { time_unit } - } - - #[inline] - pub fn time_unit(&self) -> TimeUnit { - self.time_unit - } -} - -impl Display for LocalDateTime { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.time_unit) - } -} - -impl BytesSerde for LocalDateTime { - fn serialize(&self) -> Vec { - self.time_unit.serialize() - } - - fn deserialize(metadata: &[u8]) -> VortexResult { - TimeUnit::deserialize(metadata).map(Self::new) - } -} - -pub type LocalDateTimeArray = TypedCompositeArray; - -impl ArrayCompute for LocalDateTimeArray { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - Some(self) - } -} - -impl AsArrowArray for LocalDateTimeArray { - fn as_arrow(&self) -> VortexResult { - // A LocalDateTime maps to an Arrow Timestamp array with no timezone. - let timestamps = flatten_primitive(cast(self.underlying(), PType::I64.into())?.as_ref())?; - let validity = as_nulls(timestamps.logical_validity())?; - let buffer = timestamps.scalar_buffer::(); - - Ok(match self.metadata().time_unit { - TimeUnit::Ns => Arc::new(TimestampNanosecondArray::new(buffer, validity)), - TimeUnit::Us => Arc::new(TimestampMicrosecondArray::new(buffer, validity)), - TimeUnit::Ms => Arc::new(TimestampMillisecondArray::new(buffer, validity)), - TimeUnit::S => Arc::new(TimestampSecondArray::new(buffer, validity)), - }) - } -} diff --git a/vortex-array/src/datetime/mod.rs b/vortex-array/src/datetime/mod.rs deleted file mode 100644 index 23f7243866..0000000000 --- a/vortex-array/src/datetime/mod.rs +++ /dev/null @@ -1,43 +0,0 @@ -use std::fmt::{Display, Formatter}; - -pub use localdatetime::*; -use vortex_error::{vortex_err, VortexResult}; - -use crate::serde::BytesSerde; - -mod localdatetime; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] -pub enum TimeUnit { - Ns, - Us, - Ms, - S, -} - -impl Display for TimeUnit { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - TimeUnit::Ns => write!(f, "ns"), - TimeUnit::Us => write!(f, "us"), - TimeUnit::Ms => write!(f, "ms"), - TimeUnit::S => write!(f, "s"), - } - } -} - -impl BytesSerde for TimeUnit { - fn serialize(&self) -> Vec { - vec![*self as u8] - } - - fn deserialize(data: &[u8]) -> VortexResult { - match data[0] { - 0x00 => Ok(TimeUnit::Ns), - 0x01 => Ok(TimeUnit::Us), - 0x02 => Ok(TimeUnit::Ms), - 0x03 => Ok(TimeUnit::S), - _ => Err(vortex_err!("Unknown timeunit variant")), - } - } -} diff --git a/vortex-array/src/encode.rs b/vortex-array/src/encode.rs deleted file mode 100644 index b099ca08ee..0000000000 --- a/vortex-array/src/encode.rs +++ /dev/null @@ -1,294 +0,0 @@ -use std::sync::Arc; - -use arrow_array::array::{ - Array as ArrowArray, ArrayRef as ArrowArrayRef, BooleanArray as ArrowBooleanArray, - GenericByteArray, NullArray as ArrowNullArray, PrimitiveArray as ArrowPrimitiveArray, - StructArray as ArrowStructArray, -}; -use arrow_array::array::{ArrowPrimitiveType, OffsetSizeTrait}; -use arrow_array::cast::{as_null_array, AsArray}; -use arrow_array::types::{ - ByteArrayType, ByteViewType, Date32Type, Date64Type, DurationMicrosecondType, - DurationMillisecondType, DurationNanosecondType, DurationSecondType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, -}; -use arrow_array::types::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; -use arrow_array::{BinaryViewArray, GenericByteViewArray, StringViewArray}; -use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; -use arrow_buffer::{ArrowNativeType, Buffer, ScalarBuffer}; -use arrow_schema::{DataType, TimeUnit}; -use vortex_schema::DType; - -use crate::array::bool::BoolArray; -use crate::array::constant::ConstantArray; -use crate::array::primitive::PrimitiveArray; -use crate::array::struct_::StructArray; -use crate::array::varbin::VarBinArray; -use crate::array::varbinview::VarBinViewArray; -use crate::array::IntoArray; -use crate::array::{Array, ArrayRef}; -use crate::datetime::{LocalDateTime, LocalDateTimeArray}; -use crate::ptype::{NativePType, PType}; -use crate::scalar::NullScalar; -use crate::stats::Stat; -use crate::validity::Validity; - -pub trait FromArrowArray { - fn from_arrow(array: A, nullable: bool) -> Self; -} - -impl IntoArray for Buffer { - fn into_array(self) -> ArrayRef { - PrimitiveArray::new(PType::U8, self.to_owned(), None).into_array() - } -} - -impl IntoArray for NullBuffer { - fn into_array(self) -> ArrayRef { - BoolArray::new(self.into_inner(), None).into_array() - } -} - -impl IntoArray for ScalarBuffer { - fn into_array(self) -> ArrayRef { - PrimitiveArray::new(T::PTYPE, self.into_inner(), None).into_array() - } -} - -impl IntoArray for OffsetBuffer { - fn into_array(self) -> ArrayRef { - let ptype = if O::IS_LARGE { PType::I64 } else { PType::I32 }; - let array = PrimitiveArray::new(ptype, self.into_inner().into_inner(), None).into_array(); - array.stats().set(Stat::IsSorted, true.into()); - array.stats().set(Stat::IsStrictSorted, true.into()); - array - } -} - -impl FromArrowArray<&ArrowPrimitiveArray> for ArrayRef { - fn from_arrow(value: &ArrowPrimitiveArray, nullable: bool) -> Self { - let ptype: PType = (&T::DATA_TYPE).try_into().unwrap(); - let arr = PrimitiveArray::new( - ptype, - value.values().inner().to_owned(), - nulls(value.nulls(), nullable, value.len()), - ) - .into_array(); - - if T::DATA_TYPE.is_numeric() { - return arr; - } - - match T::DATA_TYPE { - DataType::Timestamp(time_unit, tz) => match tz { - // A timestamp with no timezone is the equivalent of an "unknown" timezone. - // Therefore, we must treat it as a LocalDateTime and not an Instant. - None => LocalDateTimeArray::new(LocalDateTime::new((&time_unit).into()), arr) - .as_composite() - .into_array(), - Some(_tz) => todo!(), - }, - DataType::Date32 => todo!(), - DataType::Date64 => todo!(), - DataType::Time32(_) => todo!(), - DataType::Time64(_) => todo!(), - DataType::Duration(_) => todo!(), - DataType::Interval(_) => todo!(), - _ => panic!("Invalid data type for PrimitiveArray"), - } - } -} - -impl FromArrowArray<&GenericByteArray> for ArrayRef { - fn from_arrow(value: &GenericByteArray, nullable: bool) -> Self { - let dtype = match T::DATA_TYPE { - DataType::Binary | DataType::LargeBinary => DType::Binary(nullable.into()), - DataType::Utf8 | DataType::LargeUtf8 => DType::Utf8(nullable.into()), - _ => panic!("Invalid data type for ByteArray"), - }; - VarBinArray::new( - value.offsets().clone().into_array(), - value.values().clone().into_array(), - dtype, - nulls(value.nulls(), nullable, value.len()), - ) - .into_array() - } -} - -impl FromArrowArray<&GenericByteViewArray> for ArrayRef { - fn from_arrow(value: &GenericByteViewArray, nullable: bool) -> Self { - let dtype = match T::DATA_TYPE { - DataType::BinaryView => DType::Binary(nullable.into()), - DataType::Utf8View => DType::Utf8(nullable.into()), - _ => panic!("Invalid data type for ByteViewArray"), - }; - - VarBinViewArray::try_new( - value.views().inner().clone().into_array(), - value - .data_buffers() - .iter() - .map(|b| b.clone().into_array()) - .collect::>(), - dtype, - nulls(value.nulls(), nullable, value.len()), - ) - .unwrap() - .into_array() - } -} - -impl FromArrowArray<&ArrowBooleanArray> for ArrayRef { - fn from_arrow(value: &ArrowBooleanArray, nullable: bool) -> Self { - BoolArray::new( - value.values().to_owned(), - nulls(value.nulls(), nullable, value.len()), - ) - .into_array() - } -} - -impl FromArrowArray<&ArrowStructArray> for ArrayRef { - fn from_arrow(value: &ArrowStructArray, nullable: bool) -> Self { - // TODO(ngates): how should we deal with Arrow "logical nulls"? - assert!(!nullable); - StructArray::new( - value - .column_names() - .iter() - .map(|s| s.to_string()) - .map(Arc::new) - .collect(), - value - .columns() - .iter() - .zip(value.fields()) - .map(|(c, field)| ArrayRef::from_arrow(c.clone(), field.is_nullable())) - .collect(), - value.len(), - ) - .into_array() - } -} - -impl FromArrowArray<&ArrowNullArray> for ArrayRef { - fn from_arrow(value: &ArrowNullArray, nullable: bool) -> Self { - assert!(nullable); - ConstantArray::new(NullScalar::new(), value.len()).into_array() - } -} - -fn nulls(nulls: Option<&NullBuffer>, nullable: bool, len: usize) -> Option { - if nullable { - Some( - nulls - .map(|nulls| { - if nulls.null_count() == nulls.len() { - Validity::Invalid(len) - } else { - Validity::from(nulls.inner().clone()) - } - }) - .unwrap_or_else(|| Validity::Valid(len)), - ) - } else { - assert!(nulls.is_none()); - None - } -} - -impl FromArrowArray for ArrayRef { - fn from_arrow(array: ArrowArrayRef, nullable: bool) -> Self { - match array.data_type() { - DataType::Boolean => ArrayRef::from_arrow(array.as_boolean(), nullable), - DataType::UInt8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::UInt16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::UInt32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::UInt64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Int8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Int16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Int32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Int64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Float16 => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - DataType::Float32 => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - DataType::Float64 => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - DataType::Utf8 => ArrayRef::from_arrow(array.as_string::(), nullable), - DataType::LargeUtf8 => ArrayRef::from_arrow(array.as_string::(), nullable), - DataType::Binary => ArrayRef::from_arrow(array.as_binary::(), nullable), - DataType::LargeBinary => ArrayRef::from_arrow(array.as_binary::(), nullable), - DataType::BinaryView => ArrayRef::from_arrow( - array.as_any().downcast_ref::().unwrap(), - nullable, - ), - DataType::Utf8View => ArrayRef::from_arrow( - array.as_any().downcast_ref::().unwrap(), - nullable, - ), - DataType::Struct(_) => ArrayRef::from_arrow(array.as_struct(), nullable), - DataType::Null => ArrayRef::from_arrow(as_null_array(&array), nullable), - DataType::Timestamp(u, _) => match u { - TimeUnit::Second => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Millisecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Microsecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Nanosecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - }, - DataType::Date32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Date64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), - DataType::Time32(u) => match u { - TimeUnit::Second => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Millisecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - _ => unreachable!(), - }, - DataType::Time64(u) => match u { - TimeUnit::Microsecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Nanosecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - _ => unreachable!(), - }, - DataType::Duration(u) => match u { - TimeUnit::Second => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Millisecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Microsecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - TimeUnit::Nanosecond => { - ArrayRef::from_arrow(array.as_primitive::(), nullable) - } - }, - _ => panic!( - "TODO(robert): Missing array encoding for dtype {}", - array.data_type().clone() - ), - } - } -} diff --git a/vortex-array/src/encoding.rs b/vortex-array/src/encoding.rs index 9287f3b65d..ae6a01b6b0 100644 --- a/vortex-array/src/encoding.rs +++ b/vortex-array/src/encoding.rs @@ -1,10 +1,14 @@ +use std::any::Any; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; use linkme::distributed_slice; +use vortex_error::VortexResult; use crate::compress::EncodingCompression; -use crate::serde::EncodingSerde; +use crate::flatten::{ArrayFlatten, Flattened}; +use crate::ArrayDef; +use crate::{Array, ArrayTrait}; #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] pub struct EncodingId(&'static str); @@ -26,45 +30,73 @@ impl Display for EncodingId { } } -pub trait Encoding: Debug + Send + Sync + 'static { - fn id(&self) -> EncodingId; +#[distributed_slice] +pub static VORTEX_ENCODINGS: [EncodingRef] = [..]; - /// Whether this encoding provides a compressor. - fn compression(&self) -> Option<&dyn EncodingCompression> { - None - } +pub type EncodingRef = &'static dyn ArrayEncoding; - /// Array serialization - fn serde(&self) -> Option<&dyn EncodingSerde> { - None - } +pub fn find_encoding(id: &str) -> Option { + VORTEX_ENCODINGS + .iter() + .find(|&x| x.id().name() == id) + .cloned() } -impl Display for dyn Encoding { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.id()) - } -} +/// Object-safe encoding trait for an array. +pub trait ArrayEncoding: 'static + Sync + Send + Debug { + fn as_any(&self) -> &dyn Any; + + fn id(&self) -> EncodingId; + + /// Flatten the given array. + fn flatten<'a>(&self, array: Array<'a>) -> VortexResult>; -pub type EncodingRef = &'static dyn Encoding; + /// Unwrap the provided array into an implementation of ArrayTrait + fn with_dyn<'a>( + &self, + array: &'a Array<'a>, + f: &mut dyn for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> VortexResult<()>, + ) -> VortexResult<()>; -impl PartialEq for EncodingRef { + /// Return a compressor for this encoding. + fn compression(&self) -> &dyn EncodingCompression; +} + +impl PartialEq for dyn ArrayEncoding + '_ { fn eq(&self, other: &Self) -> bool { self.id() == other.id() } } - -impl Eq for EncodingRef {} - -impl Hash for EncodingRef { +impl Eq for dyn ArrayEncoding + '_ {} +impl Hash for dyn ArrayEncoding + '_ { fn hash(&self, state: &mut H) { self.id().hash(state) } } -#[distributed_slice] -pub static ENCODINGS: [EncodingRef] = [..]; +/// Non-object-safe extensions to the ArrayEncoding trait. +pub trait ArrayEncodingExt { + type D: ArrayDef; -pub fn find_encoding(id: &str) -> Option { - ENCODINGS.iter().find(|&x| x.id().name() == id).cloned() + fn flatten<'a>(array: Array<'a>) -> VortexResult> + where + ::D: 'a, + { + let typed = <::Array<'a> as TryFrom>::try_from(array)?; + ArrayFlatten::flatten(typed) + } + + fn with_dyn<'a, R, F>(array: &'a Array<'a>, mut f: F) -> R + where + F: for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> R, + ::D: 'a, + { + let typed = + <::Array<'a> as TryFrom>::try_from(array.clone()).unwrap(); + f(&typed) + } +} + +pub trait ArrayEncodingRef { + fn encoding(&self) -> EncodingRef; } diff --git a/vortex-array2/src/flatten.rs b/vortex-array/src/flatten.rs similarity index 85% rename from vortex-array2/src/flatten.rs rename to vortex-array/src/flatten.rs index 35a2b1e00c..c8b8e6a37b 100644 --- a/vortex-array2/src/flatten.rs +++ b/vortex-array/src/flatten.rs @@ -6,6 +6,7 @@ use crate::array::composite::CompositeArray; use crate::array::primitive::PrimitiveArray; use crate::array::r#struct::StructArray; use crate::array::varbin::VarBinArray; +use crate::array::varbinview::VarBinViewArray; use crate::encoding::ArrayEncoding; use crate::{Array, IntoArray}; @@ -17,6 +18,7 @@ pub enum Flattened<'a> { Primitive(PrimitiveArray<'a>), Struct(StructArray<'a>), VarBin(VarBinArray<'a>), + VarBinView(VarBinViewArray<'a>), } pub trait ArrayFlatten { @@ -37,6 +39,10 @@ impl<'a> Array<'a> { pub fn flatten_primitive(self) -> VortexResult> { PrimitiveArray::try_from(self.flatten()?.into_array()) } + + pub fn flatten_varbin(self) -> VortexResult> { + VarBinArray::try_from(self.flatten()?.into_array()) + } } impl<'a> IntoArray<'a> for Flattened<'a> { @@ -48,6 +54,7 @@ impl<'a> IntoArray<'a> for Flattened<'a> { Flattened::Chunked(a) => a.into_array(), Flattened::VarBin(a) => a.into_array(), Flattened::Composite(a) => a.into_array(), + Flattened::VarBinView(a) => a.into_array(), } } } diff --git a/vortex-array/src/formatter.rs b/vortex-array/src/formatter.rs deleted file mode 100644 index b196da6561..0000000000 --- a/vortex-array/src/formatter.rs +++ /dev/null @@ -1,133 +0,0 @@ -use std::fmt; -use std::fmt::{Display, Write}; - -use humansize::{format_size, DECIMAL}; - -use crate::array::{Array, ArrayRef}; -use crate::validity::ValidityView; - -pub trait ArrayDisplay { - fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> fmt::Result; -} - -pub struct ArrayFormatterWrapper<'a>(&'a dyn Array); - -impl<'a> ArrayFormatterWrapper<'a> { - pub fn new(array: &'a dyn Array) -> ArrayFormatterWrapper<'a> { - ArrayFormatterWrapper(array) - } -} - -impl<'a, 'b: 'a> Display for ArrayFormatterWrapper<'a> { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - let array = self.0; - let mut array_fmt = ArrayFormatter::new(fmt, "".to_string(), array.nbytes()); - array_fmt.child("root", array) - } -} - -pub fn display_tree(array: &dyn Array) -> String { - let mut string = String::new(); - write!(string, "{}", ArrayFormatterWrapper(array)).unwrap(); - string -} - -pub struct ArrayFormatter<'a, 'b: 'a> { - fmt: &'a mut fmt::Formatter<'b>, - indent: String, - total_size: usize, -} - -impl<'a, 'b: 'a> ArrayFormatter<'a, 'b> { - fn new( - fmt: &'a mut fmt::Formatter<'b>, - indent: String, - total_size: usize, - ) -> ArrayFormatter<'a, 'b> { - ArrayFormatter { - fmt, - indent, - total_size, - } - } - - pub fn property(&mut self, name: &str, value: T) -> fmt::Result { - writeln!(self.fmt, "{}{}: {}", self.indent, name, value) - } - - pub fn child(&mut self, name: &str, array: &dyn Array) -> fmt::Result { - writeln!( - self.fmt, - "{}{}: {} nbytes={} ({:.2}%)", - self.indent, - name, - array, - format_size(array.nbytes(), DECIMAL), - 100f64 * array.nbytes() as f64 / self.total_size as f64 - )?; - self.indent(|indent| ArrayDisplay::fmt(array, indent)) - } - - pub fn validity(&mut self, validity: Option) -> fmt::Result { - if let Some(validity) = validity { - match validity { - ValidityView::Valid(_) => Ok(()), - ValidityView::Invalid(_) => { - writeln!(self.fmt, "{}validity: all invalid", self.indent) - } - ValidityView::Array(a) => self.child("validity", a), - } - } else { - writeln!(self.fmt, "{}validity: None", self.indent) - } - } - - pub fn maybe_child(&mut self, name: &str, array: Option<&ArrayRef>) -> fmt::Result { - if let Some(array) = array { - self.child(&format!("{}?", name), array) - } else { - writeln!(self.fmt, "{}{}: None", self.indent, name) - } - } - - fn indent(&mut self, indented: F) -> fmt::Result - where - F: FnOnce(&mut ArrayFormatter) -> fmt::Result, - { - let original_ident = self.indent.clone(); - self.indent += " "; - let res = indented(self); - self.indent = original_ident; - res - } - - pub fn new_total_size(&mut self, total: usize, new_total: F) -> fmt::Result - where - F: FnOnce(&mut ArrayFormatter) -> fmt::Result, - { - let original_total = self.total_size; - self.total_size = total; - let res = new_total(self); - self.total_size = original_total; - res - } -} - -#[cfg(test)] -mod test { - use crate::array::ArrayRef; - use crate::array::IntoArray; - use crate::formatter::display_tree; - - #[test] - fn display_primitive() { - let arr: ArrayRef = (0..100).collect::>().into_array(); - assert_eq!(format!("{}", arr), "vortex.primitive(int(32), len=100)"); - } - - #[test] - fn tree_display_primitive() { - let arr: ArrayRef = (0..100).collect::>().into_array(); - assert_eq!(display_tree(&arr), "root: vortex.primitive(int(32), len=100) nbytes=400 B (100.00%)\n values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]...\n validity: None\n") - } -} diff --git a/vortex-array/src/implementation.rs b/vortex-array/src/implementation.rs new file mode 100644 index 0000000000..c07ff65649 --- /dev/null +++ b/vortex-array/src/implementation.rs @@ -0,0 +1,269 @@ +use vortex_error::{vortex_bail, VortexError, VortexResult}; +use vortex_schema::DType; + +use crate::buffer::{Buffer, OwnedBuffer}; +use crate::encoding::{ArrayEncoding, ArrayEncodingRef, EncodingRef}; +use crate::encoding::{ArrayEncodingExt, EncodingId}; +use crate::stats::{ArrayStatistics, Statistics}; +use crate::visitor::ArrayVisitor; +use crate::{ + Array, ArrayDType, ArrayData, ArrayMetadata, AsArray, GetArrayMetadata, IntoArray, + IntoArrayData, ToArrayData, ToStatic, +}; +use crate::{ArrayTrait, TryDeserializeArrayMetadata}; + +/// Trait the defines the set of types relating to an array. +/// Because it has associated types it can't be used as a trait object. +pub trait ArrayDef { + const ID: EncodingId; + const ENCODING: EncodingRef; + + type Array<'a>: ArrayTrait + TryFrom, Error = VortexError> + 'a; + type Metadata: ArrayMetadata + Clone + for<'m> TryDeserializeArrayMetadata<'m>; + type Encoding: ArrayEncoding + ArrayEncodingExt; +} + +#[macro_export] +macro_rules! impl_encoding { + ($id:literal, $Name:ident) => { + use $crate::vendored::paste::paste; + + paste! { + use $crate::{ + Array, + ArrayData, + ArrayDef, + ArrayMetadata, + ArrayTrait, + AsArray, + Flattened, + GetArrayMetadata, + IntoArray, + ToArray, + TypedArray, + }; + use $crate::compress::EncodingCompression; + use $crate::encoding::{ + ArrayEncoding, + ArrayEncodingExt, + EncodingId, + EncodingRef, + VORTEX_ENCODINGS, + }; + use $crate::stats::Stat; + use $crate::scalar::Scalar; + use std::any::Any; + use std::collections::HashMap; + use std::fmt::Debug; + use std::marker::{Send, Sync}; + use std::sync::Arc; + use vortex_error::VortexError; + use vortex_schema::DType; + + /// The array definition trait + #[derive(Debug, Clone)] + pub struct $Name; + impl ArrayDef for $Name { + const ID: EncodingId = EncodingId::new($id); + const ENCODING: EncodingRef = &[<$Name Encoding>]; + type Array<'a> = [<$Name Array>]<'a>; + type Metadata = [<$Name Metadata>]; + type Encoding = [<$Name Encoding>]; + } + + #[derive(Debug, Clone)] + pub struct [<$Name Array>]<'a> { + typed: TypedArray<'a, $Name> + } + pub type [] = [<$Name Array>]<'static>; + impl<'a> [<$Name Array>]<'a> { + pub fn array(&'a self) -> &'a Array<'a> { + self.typed.array() + } + pub fn metadata(&'a self) -> &'a [<$Name Metadata>] { + self.typed.metadata() + } + + #[allow(dead_code)] + fn try_from_parts( + dtype: DType, + metadata: [<$Name Metadata>], + children: Arc<[ArrayData]>, + stats: HashMap, + ) -> VortexResult { + Ok(Self { typed: TypedArray::try_from_parts(dtype, metadata, None, children, stats)? }) + } + } + impl<'a> GetArrayMetadata for [<$Name Array>]<'a> { + fn metadata(&self) -> Arc { + Arc::new(self.metadata().clone()) + } + } + impl<'a> AsArray for [<$Name Array>]<'a> { + fn as_array_ref(&self) -> &Array { + self.typed.array() + } + } + impl<'a> ToArray for [<$Name Array>]<'a> { + fn to_array(&self) -> Array { + self.typed.to_array() + } + } + impl<'a> IntoArray<'a> for [<$Name Array>]<'a> { + fn into_array(self) -> Array<'a> { + self.typed.into_array() + } + } + impl<'a> From> for [<$Name Array>]<'a> { + fn from(typed: TypedArray<'a, $Name>) -> Self { + Self { typed } + } + } + impl<'a> TryFrom> for [<$Name Array>]<'a> { + type Error = VortexError; + + fn try_from(array: Array<'a>) -> Result { + TypedArray::<$Name>::try_from(array).map(Self::from) + } + } + impl<'a> TryFrom<&'a Array<'a>> for [<$Name Array>]<'a> { + type Error = VortexError; + + fn try_from(array: &'a Array<'a>) -> Result { + TypedArray::<$Name>::try_from(array).map(Self::from) + } + } + + /// The array encoding + #[derive(Debug)] + pub struct [<$Name Encoding>]; + #[$crate::linkme::distributed_slice(VORTEX_ENCODINGS)] + #[allow(non_upper_case_globals)] + static []: EncodingRef = &[<$Name Encoding>]; + impl ArrayEncoding for [<$Name Encoding>] { + fn as_any(&self) -> &dyn Any { + self + } + + fn id(&self) -> EncodingId { + $Name::ID + } + + fn flatten<'a>(&self, array: Array<'a>) -> VortexResult> { + ::flatten(array) + } + + #[inline] + fn with_dyn<'a>( + &self, + array: &'a Array<'a>, + f: &mut dyn for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> VortexResult<()>, + ) -> VortexResult<()> { + ::with_dyn(array, f) + } + + fn compression(&self) -> &dyn EncodingCompression { + self + } + } + impl ArrayEncodingExt for [<$Name Encoding>] { + type D = $Name; + } + + /// Implement ArrayMetadata + impl ArrayMetadata for [<$Name Metadata>] { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_arc(self: Arc) -> Arc { + self + } + } + } + }; +} + +impl<'a> AsArray for Array<'a> { + fn as_array_ref(&self) -> &Array { + self + } +} + +impl ArrayEncodingRef for T { + fn encoding(&self) -> EncodingRef { + self.as_array_ref().encoding() + } +} + +impl ArrayDType for T { + fn dtype(&self) -> &DType { + match self.as_array_ref() { + Array::Data(d) => d.dtype(), + Array::View(v) => v.dtype(), + } + } +} + +impl ArrayStatistics for T { + fn statistics(&self) -> &(dyn Statistics + '_) { + match self.as_array_ref() { + Array::Data(d) => d.statistics(), + Array::View(v) => v.statistics(), + } + } +} + +impl<'a, T: IntoArray<'a> + ArrayEncodingRef + ArrayStatistics + GetArrayMetadata> IntoArrayData + for T +{ + fn into_array_data(self) -> ArrayData { + let encoding = self.encoding(); + let metadata = self.metadata(); + let stats = self.statistics().to_map(); + let array = self.into_array(); + match array { + Array::Data(d) => d, + Array::View(_) => { + struct Visitor { + buffer: Option, + children: Vec, + } + impl ArrayVisitor for Visitor { + fn visit_child(&mut self, _name: &str, array: &Array) -> VortexResult<()> { + self.children.push(array.to_array_data()); + Ok(()) + } + + fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { + if self.buffer.is_some() { + vortex_bail!("Multiple buffers found in view") + } + self.buffer = Some(buffer.to_static()); + Ok(()) + } + } + let mut visitor = Visitor { + buffer: None, + children: vec![], + }; + array.with_dyn(|a| a.accept(&mut visitor).unwrap()); + ArrayData::try_new( + encoding, + array.dtype().clone(), + metadata, + visitor.buffer, + visitor.children.into(), + stats, + ) + .unwrap() + } + } + } +} + +impl ToArrayData for T { + fn to_array_data(&self) -> ArrayData { + self.clone().into_array_data() + } +} diff --git a/vortex-array/src/iterator.rs b/vortex-array/src/iterator.rs deleted file mode 100644 index 40a85e1f03..0000000000 --- a/vortex-array/src/iterator.rs +++ /dev/null @@ -1,57 +0,0 @@ -use std::marker::PhantomData; - -use crate::accessor::ArrayAccessor; - -pub struct ArrayIter<'a, A: ArrayAccessor<'a, T>, T> { - array: &'a A, - current: usize, - end: usize, - phantom: PhantomData, -} - -impl<'a, A: ArrayAccessor<'a, T>, T> ArrayIter<'a, A, T> { - pub fn new(array: &'a A) -> Self { - let len = array.len(); - ArrayIter { - array, - current: 0, - end: len, - phantom: PhantomData, - } - } -} - -impl<'a, A: ArrayAccessor<'a, T>, T> Iterator for ArrayIter<'a, A, T> { - type Item = Option; - - #[inline] - fn next(&mut self) -> Option { - if self.current == self.end { - None - } else { - let old = self.current; - self.current += 1; - Some(self.array.value(old)) - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a, A: ArrayAccessor<'a, T>, T> DoubleEndedIterator for ArrayIter<'a, A, T> { - fn next_back(&mut self) -> Option { - if self.end == self.current { - None - } else { - self.end -= 1; - Some(self.array.value(self.end)) - } - } -} - -impl<'a, A: ArrayAccessor<'a, T>, T> ExactSizeIterator for ArrayIter<'a, A, T> {} diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 7881597a68..26d5ea0bf2 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -1,35 +1,246 @@ -extern crate core; - +pub mod accessor; pub mod array; pub mod arrow; -pub mod scalar; - -pub mod accessor; +pub mod buffer; pub mod compress; pub mod compute; -pub mod datetime; -pub mod encode; +mod context; +mod data; pub mod encoding; -pub mod formatter; -pub mod iterator; +mod flatten; +mod implementation; +mod metadata; pub mod ptype; mod sampling; -pub mod serde; +pub mod scalar; pub mod stats; +mod tree; +mod typed; pub mod validity; -pub mod view; -mod walk; +pub mod vendored; +mod view; +pub mod visitor; + +use std::fmt::{Debug, Display, Formatter}; + +pub use ::paste; +pub use context::*; +pub use data::*; +pub use flatten::*; +pub use implementation::*; +pub use linkme; +pub use metadata::*; +pub use typed::*; +pub use view::*; +use vortex_error::VortexResult; +use vortex_schema::DType; -pub use walk::*; +use crate::buffer::Buffer; +use crate::compute::ArrayCompute; +use crate::encoding::{ArrayEncodingRef, EncodingRef}; +use crate::stats::{ArrayStatistics, ArrayStatisticsCompute}; +use crate::validity::ArrayValidity; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; pub mod flatbuffers { - pub use generated::vortex::*; + pub use gen_array::vortex::*; + pub use gen_scalar::vortex::*; #[allow(unused_imports)] #[allow(dead_code)] #[allow(non_camel_case_types)] #[allow(clippy::all)] - mod generated { + mod gen_array { include!(concat!(env!("OUT_DIR"), "/flatbuffers/array.rs")); } + + #[allow(unused_imports)] + #[allow(dead_code)] + #[allow(non_camel_case_types)] + #[allow(clippy::all)] + mod gen_scalar { + include!(concat!(env!("OUT_DIR"), "/flatbuffers/scalar.rs")); + } + + mod deps { + pub mod dtype { + #[allow(unused_imports)] + pub use vortex_schema::flatbuffers as dtype; + } + } +} + +#[derive(Debug, Clone)] +pub enum Array<'v> { + Data(ArrayData), + View(ArrayView<'v>), +} + +pub type OwnedArray = Array<'static>; + +impl Array<'_> { + pub fn encoding(&self) -> EncodingRef { + match self { + Array::Data(d) => d.encoding(), + Array::View(v) => v.encoding(), + } + } + + pub fn len(&self) -> usize { + self.with_dyn(|a| a.len()) + } + + pub fn nbytes(&self) -> usize { + self.with_dyn(|a| a.nbytes()) + } + + pub fn is_empty(&self) -> bool { + self.with_dyn(|a| a.is_empty()) + } + + pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option> { + match self { + Array::Data(d) => d.child(idx, dtype).cloned().map(Array::Data), + Array::View(v) => v.child(idx, dtype).map(Array::View), + } + } + + pub fn buffer(&self) -> Option<&Buffer> { + match self { + Array::Data(d) => d.buffer(), + Array::View(v) => v.buffer(), + } + } +} + +impl<'a> Array<'a> { + pub fn into_buffer(self) -> Option> { + match self { + Array::Data(d) => d.into_buffer(), + Array::View(v) => v.buffer().map(|b| b.to_static()), + } + } +} + +impl ToStatic for Array<'_> { + type Static = OwnedArray; + + fn to_static(&self) -> Self::Static { + Array::Data(self.to_array_data()) + } +} + +pub trait ToArray { + fn to_array(&self) -> Array; +} + +pub trait IntoArray<'a> { + fn into_array(self) -> Array<'a>; +} + +pub trait ToArrayData { + fn to_array_data(&self) -> ArrayData; +} + +pub trait IntoArrayData { + fn into_array_data(self) -> ArrayData; +} + +pub trait ToStatic { + type Static; + + fn to_static(&self) -> Self::Static; +} + +pub trait AsArray { + fn as_array_ref(&self) -> &Array; +} + +/// Collects together the behaviour of an array. +pub trait ArrayTrait: + ArrayEncodingRef + + ArrayCompute + + ArrayDType + + ArrayFlatten + + ArrayValidity + + AcceptArrayVisitor + + ArrayStatistics + + ArrayStatisticsCompute + + ToArrayData +{ + fn len(&self) -> usize; + + fn is_empty(&self) -> bool { + // TODO(ngates): remove this default impl to encourage explicit implementation + self.len() == 0 + } + + fn nbytes(&self) -> usize { + let mut visitor = NBytesVisitor(0); + self.accept(&mut visitor).unwrap(); + visitor.0 + } +} + +pub trait ArrayDType { + // TODO(ngates): move into ArrayTrait? + fn dtype(&self) -> &DType; +} + +struct NBytesVisitor(usize); +impl ArrayVisitor for NBytesVisitor { + fn visit_child(&mut self, _name: &str, array: &Array) -> VortexResult<()> { + self.0 += array.with_dyn(|a| a.nbytes()); + Ok(()) + } + + fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { + self.0 += buffer.len(); + Ok(()) + } +} + +impl<'a> Array<'a> { + pub fn with_dyn(&'a self, mut f: F) -> R + where + F: FnMut(&dyn ArrayTrait) -> R, + { + let mut result = None; + + self.encoding() + .with_dyn(self, &mut |array| { + result = Some(f(array)); + Ok(()) + }) + .unwrap(); + + // Now we unwrap the optional, which we know to be populated by the closure. + result.unwrap() + } +} + +impl Display for Array<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let prefix = match self { + Array::Data(_) => "", + Array::View(_) => "$", + }; + write!( + f, + "{}{}({}, len={})", + prefix, + self.encoding().id(), + self.dtype(), + self.len() + ) + } +} + +impl IntoArrayData for Array<'_> { + fn into_array_data(self) -> ArrayData { + match self { + Array::Data(d) => d, + Array::View(_) => self.with_dyn(|a| a.to_array_data()), + } + } } diff --git a/vortex-array2/src/metadata.rs b/vortex-array/src/metadata.rs similarity index 87% rename from vortex-array2/src/metadata.rs rename to vortex-array/src/metadata.rs index de9a6177b0..0569f4f97b 100644 --- a/vortex-array2/src/metadata.rs +++ b/vortex-array/src/metadata.rs @@ -10,19 +10,21 @@ use vortex_error::{vortex_err, VortexResult}; /// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the /// metadata trait, and not the entire array trait. We require 'static so that we can downcast /// use the Any trait. +/// TODO(ngates): add Display pub trait ArrayMetadata: 'static + Send + Sync + Debug + TrySerializeArrayMetadata { fn as_any(&self) -> &dyn Any; fn as_any_arc(self: Arc) -> Arc; } +pub trait GetArrayMetadata { + fn metadata(&self) -> Arc; +} + pub trait TrySerializeArrayMetadata { fn try_serialize_metadata(&self) -> VortexResult>; } -// TODO(ngates): move 'm lifetime into the function body since the result isn't tied to it. -// Although maybe we should make the result tied to ti? pub trait TryDeserializeArrayMetadata<'m>: Sized { - // FIXME(ngates): we could push buffer/child validation into here. fn try_deserialize_metadata(metadata: Option<&'m [u8]>) -> VortexResult; } diff --git a/vortex-array/src/ptype.rs b/vortex-array/src/ptype.rs index 2989c87e3b..bd34934ba0 100644 --- a/vortex-array/src/ptype.rs +++ b/vortex-array/src/ptype.rs @@ -5,6 +5,7 @@ use arrow_array::types::*; use arrow_buffer::ArrowNativeType; use half::f16; use num_traits::{Num, NumCast}; +use serde::{Deserialize, Serialize}; use vortex_error::{vortex_err, VortexError, VortexResult}; use vortex_schema::DType::*; use vortex_schema::{DType, FloatWidth, IntWidth}; @@ -12,7 +13,7 @@ use vortex_schema::{DType, FloatWidth, IntWidth}; use crate::scalar::{PScalar, Scalar}; #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)] pub enum PType { U8, U16, diff --git a/vortex-array/src/scalar/mod.rs b/vortex-array/src/scalar/mod.rs index b3f70f078c..4c2c255961 100644 --- a/vortex-array/src/scalar/mod.rs +++ b/vortex-array/src/scalar/mod.rs @@ -7,7 +7,6 @@ use half::f16; pub use list::*; pub use null::*; pub use primitive::*; -pub use serde::*; pub use struct_::*; pub use utf8::*; use vortex_error::VortexResult; diff --git a/vortex-array/src/scalar/serde.rs b/vortex-array/src/scalar/serde.rs index 5728d85a6f..a749f20094 100644 --- a/vortex-array/src/scalar/serde.rs +++ b/vortex-array/src/scalar/serde.rs @@ -1,230 +1,276 @@ -use std::io; -use std::sync::Arc; - -use num_enum::{IntoPrimitive, TryFromPrimitive}; +use flatbuffers::{root, FlatBufferBuilder, WIPOffset}; +use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use vortex_error::VortexResult; -use vortex_schema::{DType, Nullability}; +use vortex_error::{vortex_bail, VortexError}; +use vortex_flatbuffers::{FlatBufferRoot, FlatBufferToBytes, ReadFlatBuffer, WriteFlatBuffer}; +use vortex_schema::{DTypeSerdeContext, Nullability}; +use crate::flatbuffers::scalar as fb; use crate::match_each_native_ptype; -use crate::scalar::composite::CompositeScalar; -use crate::scalar::{ - BinaryScalar, BoolScalar, ListScalar, NullScalar, PScalar, PrimitiveScalar, Scalar, - StructScalar, Utf8Scalar, -}; -use crate::serde::{ReadCtx, WriteCtx}; - -pub struct ScalarReader<'a, 'b> { - reader: &'b mut ReadCtx<'a>, -} +use crate::ptype::PType; +use crate::scalar::{PScalar, PrimitiveScalar, Scalar, Utf8Scalar}; -impl<'a, 'b> ScalarReader<'a, 'b> { - pub fn new(reader: &'b mut ReadCtx<'a>) -> Self { - Self { reader } - } +impl FlatBufferRoot for Scalar {} - pub fn read(&mut self) -> VortexResult { - let tag = ScalarTag::try_from(self.reader.read_nbytes::<1>()?[0]) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - let nullability = self.reader.nullability()?; +impl WriteFlatBuffer for Scalar { + type Target<'a> = fb::Scalar<'a>; - match tag { - ScalarTag::Binary => { - let slice = self.reader.read_optional_slice()?; - Ok(BinaryScalar::try_new(slice, nullability)?.into()) - } - ScalarTag::Bool => { - let is_present = self.reader.read_option_tag()?; - let bool = self.reader.read_nbytes::<1>()?[0] != 0; - Ok(BoolScalar::try_new(is_present.then_some(bool), nullability)?.into()) - } - ScalarTag::PrimitiveS => self.read_primitive_scalar(nullability).map(|p| p.into()), - ScalarTag::List => { - let is_present = self.reader.read_option_tag()?; - if is_present { - let elems = self.reader.read_usize()?; - let mut values = Vec::with_capacity(elems); - for _ in 0..elems { - values.push(self.read()?); - } - Ok(ListScalar::new(values[0].dtype().clone(), Some(values)).into()) - } else { - Ok(ListScalar::new(self.reader.dtype()?, None).into()) + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let union = match self { + Scalar::Binary(b) => { + let bytes = b.value().map(|bytes| fbb.create_vector(bytes)); + fb::ScalarArgs { + type_type: fb::Type::Binary, + type_: Some( + fb::Binary::create(fbb, &fb::BinaryArgs { value: bytes }).as_union_value(), + ), + nullability: self.nullability().into(), } } - ScalarTag::Null => Ok(NullScalar::new().into()), - ScalarTag::Struct => { - let field_num = self.reader.read_usize()?; - let mut names = Vec::with_capacity(field_num); - for _ in 0..field_num { - names.push(Arc::new( - self.reader - .read_slice() - .map(|v| unsafe { String::from_utf8_unchecked(v) })?, - )); + Scalar::Bool(b) => fb::ScalarArgs { + type_type: fb::Type::Bool, + // TODO(ngates): I think this optional is in the wrong place and should be inside BoolArgs. + // However I think Rust Flatbuffers has incorrectly generated non-optional BoolArgs. + type_: b + .value() + .map(|&value| fb::Bool::create(fbb, &fb::BoolArgs { value }).as_union_value()), + nullability: self.nullability().into(), + }, + Scalar::List(_) => panic!("List not supported in scalar serde"), + Scalar::Null(_) => fb::ScalarArgs { + type_type: fb::Type::Null, + type_: Some(fb::Null::create(fbb, &fb::NullArgs {}).as_union_value()), + nullability: self.nullability().into(), + }, + Scalar::Primitive(p) => { + let bytes = p.value().map(|pscalar| match pscalar { + PScalar::U8(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::U16(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::U32(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::U64(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::I8(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::I16(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::I32(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::I64(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::F16(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::F32(v) => fbb.create_vector(&v.to_le_bytes()), + PScalar::F64(v) => fbb.create_vector(&v.to_le_bytes()), + }); + let primitive = fb::Primitive::create( + fbb, + &fb::PrimitiveArgs { + ptype: p.ptype().into(), + bytes, + }, + ); + fb::ScalarArgs { + type_type: fb::Type::Primitive, + type_: Some(primitive.as_union_value()), + nullability: self.nullability().into(), } - let mut values = Vec::with_capacity(field_num); - for _ in 0..field_num { - values.push(self.read()?); - } - let dtypes = values.iter().map(|s| s.dtype().clone()).collect::>(); - Ok(StructScalar::new(DType::Struct(names, dtypes), values).into()) - } - ScalarTag::Utf8 => { - let value = self.reader.read_optional_slice()?; - Ok(Utf8Scalar::try_new( - value.map(|v| unsafe { String::from_utf8_unchecked(v) }), - nullability, - )? - .into()) } - ScalarTag::Composite => { - let dtype = self.reader.dtype()?; - let scalar = self.read()?; - Ok(CompositeScalar::new(dtype, Box::new(scalar)).into()) + Scalar::Struct(_) => panic!(), + Scalar::Utf8(utf) => { + let value = utf.value().map(|utf| fbb.create_string(utf)); + let value = fb::UTF8::create(fbb, &fb::UTF8Args { value }).as_union_value(); + fb::ScalarArgs { + type_type: fb::Type::UTF8, + type_: Some(value), + nullability: self.nullability().into(), + } } - } - } + Scalar::Composite(_) => panic!(), + }; - fn read_primitive_scalar(&mut self, nullability: Nullability) -> VortexResult { - let ptype = self.reader.ptype()?; - let is_present = self.reader.read_option_tag()?; - match_each_native_ptype!(ptype, |$P| { - let value = if is_present { - Some($P::from_le_bytes(self.reader.read_nbytes()?)) - } else { - None - }; - Ok(PrimitiveScalar::try_new::<$P>(value, nullability)?) - }) + fb::Scalar::create(fbb, &union) } } -pub struct ScalarWriter<'a, 'b> { - writer: &'b mut WriteCtx<'a>, -} +impl ReadFlatBuffer for Scalar { + type Source<'a> = fb::Scalar<'a>; + type Error = VortexError; -impl<'a, 'b> ScalarWriter<'a, 'b> { - pub fn new(writer: &'b mut WriteCtx<'a>) -> Self { - Self { writer } - } - - pub fn write(&mut self, scalar: &Scalar) -> VortexResult<()> { - self.writer - .write_fixed_slice([ScalarTag::from(scalar).into()])?; - self.writer.nullability(scalar.nullability())?; - - match scalar { - Scalar::Binary(b) => self - .writer - .write_optional_slice(b.value().map(|b| b.as_slice())), - Scalar::Bool(b) => { - self.writer.write_option_tag(b.value().is_some())?; - if let Some(&v) = b.value() { - self.writer.write_fixed_slice([v as u8])?; - } - Ok(()) + fn read_flatbuffer( + _ctx: &DTypeSerdeContext, + fb: &Self::Source<'_>, + ) -> Result { + let nullability = Nullability::from(fb.nullability()); + match fb.type_type() { + fb::Type::Binary => { + todo!() } - Scalar::List(ls) => { - self.writer.write_option_tag(ls.values().is_some())?; - if let Some(vs) = ls.values() { - self.writer.write_usize(vs.len())?; - for elem in vs { - self.write(elem)?; - } - } else { - self.writer.dtype(ls.dtype())?; - } - Ok(()) + fb::Type::Bool => { + todo!() } - Scalar::Null(_) => Ok(()), - Scalar::Primitive(p) => self.write_primitive_scalar(p), - Scalar::Struct(s) => { - let names = s.names(); - self.writer.write_usize(names.len())?; - for n in names { - self.writer.write_slice(n.as_bytes())?; - } - for field in s.values() { - self.write(field)?; - } - Ok(()) + fb::Type::List => { + todo!() + } + fb::Type::Null => { + todo!() + } + fb::Type::Primitive => { + let primitive = fb.type__as_primitive().expect("missing Primitive value"); + let ptype = primitive.ptype().try_into()?; + Ok(match_each_native_ptype!(ptype, |$T| { + Scalar::Primitive(PrimitiveScalar::try_new( + if let Some(bytes) = primitive.bytes() { + Some($T::from_le_bytes(bytes.bytes().try_into()?)) + } else { + None + }, + nullability, + )?) + })) } - Scalar::Utf8(u) => self - .writer - .write_optional_slice(u.value().map(|s| s.as_bytes())), - Scalar::Composite(c) => { - self.writer.dtype(c.dtype())?; - self.write(c.scalar()) + fb::Type::Struct_ => { + todo!() } + fb::Type::UTF8 => Ok(Scalar::Utf8(Utf8Scalar::try_new( + fb.type__as_utf8() + .expect("missing UTF8 value") + .value() + .map(|s| s.to_string()), + nullability, + )?)), + fb::Type::Composite => { + todo!() + } + _ => vortex_bail!(InvalidSerde: "Unrecognized scalar type"), } } +} - fn write_primitive_scalar(&mut self, scalar: &PrimitiveScalar) -> VortexResult<()> { - self.writer.ptype(scalar.ptype())?; - self.writer.write_option_tag(scalar.value().is_some())?; - if let Some(ps) = scalar.value() { - match ps { - PScalar::F16(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, - PScalar::F32(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, - PScalar::F64(f) => self.writer.write_fixed_slice(f.to_le_bytes())?, - PScalar::I16(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, - PScalar::I32(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, - PScalar::I64(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, - PScalar::I8(i) => self.writer.write_fixed_slice(i.to_le_bytes())?, - PScalar::U16(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, - PScalar::U32(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, - PScalar::U64(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, - PScalar::U8(u) => self.writer.write_fixed_slice(u.to_le_bytes())?, - } +impl From for fb::PType { + fn from(value: PType) -> Self { + match value { + PType::U8 => fb::PType::U8, + PType::U16 => fb::PType::U16, + PType::U32 => fb::PType::U32, + PType::U64 => fb::PType::U64, + PType::I8 => fb::PType::I8, + PType::I16 => fb::PType::I16, + PType::I32 => fb::PType::I32, + PType::I64 => fb::PType::I64, + PType::F16 => fb::PType::F16, + PType::F32 => fb::PType::F32, + PType::F64 => fb::PType::F64, } - Ok(()) } } -#[derive(Copy, Clone, IntoPrimitive, TryFromPrimitive)] -#[repr(u8)] -enum ScalarTag { - Binary, - Bool, - List, - Null, - // TODO(robert): rename to primitive once we stop using enum for serialization - PrimitiveS, - Struct, - Utf8, - Composite, -} +impl TryFrom for PType { + type Error = VortexError; -impl From<&Scalar> for ScalarTag { - fn from(value: &Scalar) -> Self { - match value { - Scalar::Binary(_) => ScalarTag::Binary, - Scalar::Bool(_) => ScalarTag::Bool, - Scalar::List(_) => ScalarTag::List, - Scalar::Null(_) => ScalarTag::Null, - Scalar::Primitive(_) => ScalarTag::PrimitiveS, - Scalar::Struct(_) => ScalarTag::Struct, - Scalar::Utf8(_) => ScalarTag::Utf8, - Scalar::Composite(_) => ScalarTag::Composite, - } + fn try_from(value: fb::PType) -> Result { + Ok(match value { + fb::PType::U8 => PType::U8, + fb::PType::U16 => PType::U16, + fb::PType::U32 => PType::U32, + fb::PType::U64 => PType::U64, + fb::PType::I8 => PType::I8, + fb::PType::I16 => PType::I16, + fb::PType::I32 => PType::I32, + fb::PType::I64 => PType::I64, + fb::PType::F16 => PType::F16, + fb::PType::F32 => PType::F32, + fb::PType::F64 => PType::F64, + _ => vortex_bail!(InvalidSerde: "Unrecognized PType"), + }) } } impl Serialize for Scalar { - fn serialize(&self, _serializer: S) -> Result + fn serialize(&self, serializer: S) -> Result where S: Serializer, { - todo!() + self.with_flatbuffer_bytes(|bytes| serializer.serialize_bytes(bytes)) } } +struct ScalarDeserializer(DTypeSerdeContext); + +impl<'de> Visitor<'de> for ScalarDeserializer { + type Value = Scalar; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a vortex dtype") + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: serde::de::Error, + { + let fb = root::(v).map_err(E::custom)?; + Scalar::read_flatbuffer(&self.0, &fb).map_err(E::custom) + } +} + +// TODO(ngates): Should we just inline composites in scalars? impl<'de> Deserialize<'de> for Scalar { - fn deserialize(_deserializer: D) -> Result + fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, { - todo!() + let ctx = DTypeSerdeContext::new(vec![]); + deserializer.deserialize_bytes(ScalarDeserializer(ctx)) } } + +// impl<'a, 'b> ScalarReader<'a, 'b> { +// pub fn read(&mut self) -> VortexResult { +// let bytes = self.reader.read_slice()?; +// let scalar = root::(&bytes) +// .map_err(|_e| VortexError::InvalidArgument("Invalid FlatBuffer".into())) +// .unwrap(); + +// } +// +// fn read_primitive_scalar(&mut self) -> VortexResult { +// let ptype = self.reader.ptype()?; +// let is_present = self.reader.read_option_tag()?; +// if is_present { +// let pscalar = match ptype { +// PType::U8 => PrimitiveScalar::some(PScalar::U8(u8::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::U16 => PrimitiveScalar::some(PScalar::U16(u16::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::U32 => PrimitiveScalar::some(PScalar::U32(u32::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::U64 => PrimitiveScalar::some(PScalar::U64(u64::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::I8 => PrimitiveScalar::some(PScalar::I8(i8::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::I16 => PrimitiveScalar::some(PScalar::I16(i16::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::I32 => PrimitiveScalar::some(PScalar::I32(i32::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::I64 => PrimitiveScalar::some(PScalar::I64(i64::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::F16 => PrimitiveScalar::some(PScalar::F16(f16::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::F32 => PrimitiveScalar::some(PScalar::F32(f32::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// PType::F64 => PrimitiveScalar::some(PScalar::F64(f64::from_le_bytes( +// self.reader.read_nbytes()?, +// ))), +// }; +// Ok(pscalar) +// } else { +// Ok(PrimitiveScalar::none(ptype)) +// } +// } +// } diff --git a/vortex-array/src/serde/context.rs b/vortex-array/src/serde/context.rs deleted file mode 100644 index 9017e379a0..0000000000 --- a/vortex-array/src/serde/context.rs +++ /dev/null @@ -1,39 +0,0 @@ -use std::sync::Arc; - -use itertools::Itertools; - -use crate::encoding::{EncodingId, EncodingRef, ENCODINGS}; - -#[derive(Debug)] -pub struct SerdeContext { - encodings: Arc<[EncodingRef]>, -} - -impl SerdeContext { - pub fn new(encodings: Arc<[EncodingRef]>) -> Self { - Self { encodings } - } - - pub fn encodings(&self) -> &[EncodingRef] { - self.encodings.as_ref() - } - - pub fn find_encoding(&self, encoding_id: u16) -> Option { - self.encodings.get(encoding_id as usize).cloned() - } - - pub fn encoding_idx(&self, encoding_id: EncodingId) -> Option { - self.encodings - .iter() - .position(|e| e.id() == encoding_id) - .map(|i| i as u16) - } -} - -impl Default for SerdeContext { - fn default() -> Self { - Self { - encodings: ENCODINGS.iter().cloned().collect_vec().into(), - } - } -} diff --git a/vortex-array/src/serde/data.rs b/vortex-array/src/serde/data.rs deleted file mode 100644 index d65f2b872a..0000000000 --- a/vortex-array/src/serde/data.rs +++ /dev/null @@ -1,130 +0,0 @@ -use arrow_buffer::Buffer; -use vortex_error::{vortex_err, VortexResult}; - -use crate::array::Array; -use crate::encoding::EncodingId; -use crate::walk::ArrayWalker; - -pub struct ArrayData { - columns: Vec, -} - -impl ArrayData { - pub fn new(columns: Vec) -> Self { - Self { columns } - } - - pub fn columns(&self) -> &[ColumnData] { - &self.columns - } -} - -#[derive(Debug)] -pub struct ColumnData { - encoding: EncodingId, - metadata: Option, - children: Vec, - buffers: Vec, -} - -impl ColumnData { - pub fn try_from_array(array: &dyn Array) -> VortexResult { - let mut data = ColumnData { - encoding: array.encoding().id(), - metadata: array - .serde() - .ok_or_else(|| { - vortex_err!(InvalidSerde: "Array {} does not support serde", array.encoding()) - })? - .metadata()? - .map(Buffer::from_vec), - children: Vec::new(), - buffers: Vec::new(), - }; - array.walk(&mut data)?; - Ok(data) - } - - pub fn new( - encoding: EncodingId, - metadata: Option, - children: Vec, - buffers: Vec, - ) -> Self { - Self { - encoding, - metadata, - children, - buffers, - } - } - - pub fn encoding(&self) -> EncodingId { - self.encoding - } - - pub fn metadata(&self) -> Option<&Buffer> { - self.metadata.as_ref() - } - - pub fn children(&self) -> &[ColumnData] { - &self.children - } - - pub fn buffers(&self) -> &[Buffer] { - &self.buffers - } - - pub fn depth_first_traversal(&self) -> ColumnDataIterator { - ColumnDataIterator { stack: vec![self] } - } - - /// Return the buffer offsets and the total length of all buffers, assuming the given alignment. - /// This includes all child buffers. - pub fn all_buffer_offsets(&self, alignment: usize) -> Vec { - let mut offsets = Vec::with_capacity(self.buffers.len() + 1); - let mut offset = 0; - - for col_data in self.depth_first_traversal() { - for buffer in col_data.buffers() { - offsets.push(offset as u64); - - let buffer_size = buffer.len(); - let aligned_size = (buffer_size + (alignment - 1)) & !(alignment - 1); - offset += aligned_size; - } - } - offsets.push(offset as u64); - - offsets - } -} - -impl ArrayWalker for ColumnData { - fn visit_child(&mut self, array: &dyn Array) -> VortexResult<()> { - self.children.push(ColumnData::try_from_array(array)?); - Ok(()) - } - - fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { - self.buffers.push(buffer.clone()); - Ok(()) - } -} - -/// A depth-first iterator over a ColumnData. -pub struct ColumnDataIterator<'a> { - stack: Vec<&'a ColumnData>, -} - -impl<'a> Iterator for ColumnDataIterator<'a> { - type Item = &'a ColumnData; - - fn next(&mut self) -> Option { - let next = self.stack.pop()?; - for child in &next.children { - self.stack.push(child); - } - Some(next) - } -} diff --git a/vortex-array/src/serde/mod.rs b/vortex-array/src/serde/mod.rs deleted file mode 100644 index 598e8a1f3d..0000000000 --- a/vortex-array/src/serde/mod.rs +++ /dev/null @@ -1,378 +0,0 @@ -use std::io; -use std::io::{Cursor, ErrorKind, Read, Write}; - -use arrow_buffer::buffer::Buffer; -use arrow_buffer::BooleanBuffer; -use flatbuffers::root; -use itertools::Itertools; -pub use view::*; -use vortex_error::{vortex_err, VortexResult}; -use vortex_flatbuffers::{FlatBufferToBytes, ReadFlatBuffer}; -use vortex_schema::DTypeSerdeContext; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - -use crate::array::bool::BoolArray; -use crate::array::composite::COMPOSITE_EXTENSIONS; -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::encoding::{find_encoding, EncodingId, ENCODINGS}; -use crate::ptype::PType; -use crate::scalar::{Scalar, ScalarReader, ScalarWriter}; -use crate::serde::ptype::PTypeTag; -use crate::validity::{Validity, ValidityView}; - -pub mod context; -pub mod data; -mod ptype; -pub mod view; - -pub trait ArraySerde { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()>; - - fn metadata(&self) -> VortexResult>>; -} - -pub trait EncodingSerde { - fn validate(&self, _view: &ArrayView) -> VortexResult<()> { - Ok(()) - // todo!("Validate not implemented for {}", _view.encoding().id()); - } - - fn to_array(&self, view: &ArrayView) -> ArrayRef { - BoolArray::new( - BooleanBuffer::new(view.buffers().first().unwrap().clone(), 0, view.len()), - view.child(0, &Validity::DTYPE) - .map(|c| Validity::Array(c.into_array())), - ) - .into_array() - } - - // TODO(ngates): remove this ideally? It can error... Maybe store lengths in array views? - fn len(&self, _view: &ArrayView) -> usize { - todo!( - "EncodingSerde.len not implemented for {}", - _view.encoding().id() - ); - } - - fn with_view_compute<'view>( - &self, - _view: &'view ArrayView, - _f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - Err(vortex_err!(ComputeError: "Compute not implemented")) - } - - fn read(&self, ctx: &mut ReadCtx) -> VortexResult; -} - -pub trait BytesSerde -where - Self: Sized, -{ - fn serialize(&self) -> Vec; - - fn deserialize(data: &[u8]) -> VortexResult; -} - -impl BytesSerde for usize { - fn serialize(&self) -> Vec { - let mut vec = Vec::new(); - // IOError only happens on EOF. - leb128::write::unsigned(&mut vec, *self as u64).unwrap(); - vec - } - - fn deserialize(data: &[u8]) -> VortexResult { - let mut cursor = Cursor::new(data); - leb128::read::unsigned(&mut cursor) - .map(|v| v as usize) - .map_err(|e| vortex_err!(InvalidSerde: "Failed to parse leb128 {}", e)) - } -} - -pub struct ReadCtx<'a> { - schema: &'a DType, - encodings: Vec, - r: &'a mut dyn Read, -} - -pub trait Serde: Sized { - fn read(ctx: &mut ReadCtx) -> VortexResult; - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()>; -} - -impl<'a> ReadCtx<'a> { - pub fn new(schema: &'a DType, r: &'a mut dyn Read) -> Self { - let encodings = ENCODINGS.iter().map(|e| e.id()).collect::>(); - Self { - schema, - encodings, - r, - } - } - - #[inline] - pub fn schema(&self) -> &DType { - self.schema - } - - pub fn subfield(&mut self, idx: usize) -> ReadCtx { - let DType::Struct(_, fs) = self.schema else { - panic!("Schema was not a struct") - }; - self.with_schema(&fs[idx]) - } - - #[inline] - pub fn with_schema<'b>(&'b mut self, schema: &'b DType) -> ReadCtx { - ReadCtx::new(schema, self.r) - } - - #[inline] - pub fn bytes(&mut self) -> ReadCtx { - self.with_schema(&DType::Int( - IntWidth::_8, - Signedness::Unsigned, - Nullability::NonNullable, - )) - } - - #[inline] - pub fn dtype(&mut self) -> VortexResult { - let dtype_bytes = self.read_slice()?; - let ctx = DTypeSerdeContext::new(COMPOSITE_EXTENSIONS.iter().map(|e| e.id()).collect_vec()); - DType::read_flatbuffer( - &ctx, - &(root::(&dtype_bytes)?), - ) - } - - pub fn ptype(&mut self) -> VortexResult { - let typetag = PTypeTag::try_from(self.read_nbytes::<1>()?[0]) - .map_err(|e| io::Error::new(ErrorKind::InvalidInput, e))?; - Ok(typetag.into()) - } - - pub fn nullability(&mut self) -> VortexResult { - match self.read_nbytes::<1>()? { - [0] => Ok(Nullability::NonNullable), - [1] => Ok(Nullability::Nullable), - _ => Err(vortex_err!("Invalid nullability tag")), - } - } - - #[inline] - pub fn scalar(&mut self) -> VortexResult { - ScalarReader::new(self).read() - } - - pub fn read_optional_slice(&mut self) -> VortexResult>> { - let is_present = self.read_option_tag()?; - is_present.then(|| self.read_slice()).transpose() - } - - pub fn read_slice(&mut self) -> VortexResult> { - let len = self.read_usize()?; - let mut data = Vec::::with_capacity(len); - self.r.take(len as u64).read_to_end(&mut data)?; - Ok(data) - } - - pub fn read_buffer usize>( - &mut self, - byte_len: F, - ) -> VortexResult<(usize, Buffer)> { - let logical_len = self.read_usize()?; - let buffer_len = byte_len(logical_len); - let mut buf = Vec::with_capacity(buffer_len); - self.r.take(buffer_len as u64).read_to_end(&mut buf)?; - Ok((logical_len, Buffer::from_vec(buf))) - } - - pub fn read_nbytes(&mut self) -> VortexResult<[u8; N]> { - let mut bytes: [u8; N] = [0; N]; - self.r.read_exact(&mut bytes)?; - Ok(bytes) - } - - pub fn read_usize(&mut self) -> VortexResult { - leb128::read::unsigned(self.r) - .map_err(|_| vortex_err!("Failed to parse leb128 usize")) - .map(|u| u as usize) - } - - pub fn read_option_tag(&mut self) -> VortexResult { - let mut tag = [0; 1]; - self.r.read_exact(&mut tag)?; - Ok(tag[0] == 0x01) - } - - pub fn read_optional_array(&mut self) -> VortexResult> { - if self.read_option_tag()? { - self.read().map(Some) - } else { - Ok(None) - } - } - - pub fn read_validity(&mut self) -> VortexResult> { - if self.read_option_tag()? { - match self.read_nbytes::<1>()? { - [0u8] => Ok(Some(Validity::Valid(self.read_usize()?))), - [1u8] => Ok(Some(Validity::Invalid(self.read_usize()?))), - [2u8] => Ok(Some(Validity::array( - self.with_schema(&Validity::DTYPE).read()?, - )?)), - _ => panic!("Invalid validity tag"), - } - } else { - Ok(None) - } - } - - pub fn read(&mut self) -> VortexResult { - let encoding_id = self.read_usize()?; - if let Some(serde) = - find_encoding(self.encodings[encoding_id].name()).and_then(|e| e.serde()) - { - serde.read(self) - } else { - Err(vortex_err!("Failed to recognize encoding ID")) - } - } -} - -pub struct WriteCtx<'a> { - w: &'a mut dyn Write, - available_encodings: Vec, -} - -impl<'a> WriteCtx<'a> { - pub fn new(w: &'a mut dyn Write) -> Self { - let available_encodings = ENCODINGS.iter().map(|e| e.id()).collect::>(); - Self { - w, - available_encodings, - } - } - - pub fn dtype(&mut self, dtype: &DType) -> VortexResult<()> { - let (bytes, offset) = dtype.flatbuffer_to_bytes(); - self.write_slice(&bytes[offset..]) - } - - pub fn ptype(&mut self, ptype: PType) -> VortexResult<()> { - self.write_fixed_slice([PTypeTag::from(ptype).into()]) - } - - pub fn nullability(&mut self, nullability: Nullability) -> VortexResult<()> { - match nullability { - Nullability::NonNullable => self.write_fixed_slice([0u8]), - Nullability::Nullable => self.write_fixed_slice([1u8]), - } - } - - pub fn scalar(&mut self, scalar: &Scalar) -> VortexResult<()> { - ScalarWriter::new(self).write(scalar) - } - - pub fn write_usize(&mut self, u: usize) -> VortexResult<()> { - leb128::write::unsigned(self.w, u as u64) - .map_err(|_| vortex_err!("Failed to write leb128 usize")) - .map(|_| ()) - } - - pub fn write_fixed_slice(&mut self, slice: [u8; N]) -> VortexResult<()> { - self.w.write_all(&slice).map_err(|e| e.into()) - } - - pub fn write_slice(&mut self, slice: &[u8]) -> VortexResult<()> { - self.write_usize(slice.len())?; - self.w.write_all(slice).map_err(|e| e.into()) - } - - pub fn write_optional_slice(&mut self, slice: Option<&[u8]>) -> VortexResult<()> { - self.write_option_tag(slice.is_some())?; - if let Some(s) = slice { - self.write_slice(s) - } else { - Ok(()) - } - } - - pub fn write_buffer(&mut self, logical_len: usize, buf: &Buffer) -> VortexResult<()> { - self.write_usize(logical_len)?; - self.w.write_all(buf.as_slice()).map_err(|e| e.into()) - } - - pub fn write_option_tag(&mut self, present: bool) -> VortexResult<()> { - self.w - .write_all(&[if present { 0x01 } else { 0x00 }]) - .map_err(|e| e.into()) - } - - pub fn write_optional_array(&mut self, array: Option<&ArrayRef>) -> VortexResult<()> { - self.write_option_tag(array.is_some())?; - if let Some(array) = array { - self.write(array) - } else { - Ok(()) - } - } - - pub fn write_validity(&mut self, validity: Option) -> VortexResult<()> { - match validity { - None => self.write_option_tag(false), - Some(v) => { - self.write_option_tag(true)?; - match v { - ValidityView::Valid(len) => { - self.write_fixed_slice([0u8])?; - self.write_usize(len) - } - ValidityView::Invalid(len) => { - self.write_fixed_slice([1u8])?; - self.write_usize(len) - } - ValidityView::Array(a) => { - self.write_fixed_slice([2u8])?; - self.write(a) - } - } - } - } - } - - pub fn write(&mut self, array: &dyn Array) -> VortexResult<()> { - let encoding_id = self - .available_encodings - .iter() - .position(|e| e.name() == array.encoding().id().name()) - .ok_or(io::Error::new(ErrorKind::InvalidInput, "unknown encoding"))?; - self.write_usize(encoding_id)?; - array.serde().map(|s| s.write(self)).unwrap_or_else(|| { - Err(vortex_err!( - "Serialization not supported for {}", - array.encoding().id() - )) - }) - } -} - -#[cfg(test)] -pub mod test { - use vortex_error::VortexResult; - - use crate::array::{Array, ArrayRef}; - use crate::serde::{ReadCtx, WriteCtx}; - - pub fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } -} diff --git a/vortex-array/src/serde/ptype.rs b/vortex-array/src/serde/ptype.rs deleted file mode 100644 index 4481a6b3c0..0000000000 --- a/vortex-array/src/serde/ptype.rs +++ /dev/null @@ -1,55 +0,0 @@ -use num_enum::{IntoPrimitive, TryFromPrimitive}; - -use crate::ptype::PType; - -#[derive(IntoPrimitive, TryFromPrimitive)] -#[repr(u8)] -pub enum PTypeTag { - U8, - U16, - U32, - U64, - I8, - I16, - I32, - I64, - F16, - F32, - F64, -} - -impl From for PTypeTag { - fn from(value: PType) -> Self { - match value { - PType::U8 => PTypeTag::U8, - PType::U16 => PTypeTag::U16, - PType::U32 => PTypeTag::U32, - PType::U64 => PTypeTag::U64, - PType::I8 => PTypeTag::I8, - PType::I16 => PTypeTag::I16, - PType::I32 => PTypeTag::I32, - PType::I64 => PTypeTag::I64, - PType::F16 => PTypeTag::F16, - PType::F32 => PTypeTag::F32, - PType::F64 => PTypeTag::F64, - } - } -} - -impl From for PType { - fn from(value: PTypeTag) -> Self { - match value { - PTypeTag::U8 => PType::U8, - PTypeTag::U16 => PType::U16, - PTypeTag::U32 => PType::U32, - PTypeTag::U64 => PType::U64, - PTypeTag::I8 => PType::I8, - PTypeTag::I16 => PType::I16, - PTypeTag::I32 => PType::I32, - PTypeTag::I64 => PType::I64, - PTypeTag::F16 => PType::F16, - PTypeTag::F32 => PType::F32, - PTypeTag::F64 => PType::F64, - } - } -} diff --git a/vortex-array/src/serde/view.rs b/vortex-array/src/serde/view.rs deleted file mode 100644 index 12d67b683a..0000000000 --- a/vortex-array/src/serde/view.rs +++ /dev/null @@ -1,224 +0,0 @@ -use std::any::Any; -use std::fmt::{Debug, Formatter}; -use std::sync::Arc; - -use arrow_buffer::Buffer; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::DType; - -use crate::array::{Array, ArrayRef}; -use crate::compute::ArrayCompute; -use crate::encoding::EncodingRef; -use crate::flatbuffers::array as fb; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::context::SerdeContext; -use crate::serde::EncodingSerde; -use crate::stats::Stats; -use crate::validity::ArrayValidity; -use crate::validity::Validity; -use crate::ArrayWalker; - -#[derive(Clone)] -pub struct ArrayView<'a> { - encoding: EncodingRef, - dtype: &'a DType, - array: fb::Array<'a>, - buffers: &'a [Buffer], - ctx: &'a SerdeContext, -} - -impl<'a> Debug for ArrayView<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("ArrayView") - .field("encoding", &self.encoding) - .field("dtype", &self.dtype) - // .field("array", &self.array) - .field("buffers", &self.buffers) - .field("ctx", &self.ctx) - .finish() - } -} - -impl<'a> ArrayView<'a> { - pub fn try_new( - ctx: &'a SerdeContext, - dtype: &'a DType, - array: fb::Array<'a>, - buffers: &'a [Buffer], - ) -> VortexResult { - let encoding = ctx - .find_encoding(array.encoding()) - .ok_or_else(|| vortex_err!(InvalidSerde: "Encoding ID out of bounds"))?; - let _vtable = encoding.serde().ok_or_else(|| { - // TODO(ngates): we could fall-back to heap-allocating? - vortex_err!(InvalidSerde: "Encoding {} does not support serde", encoding) - })?; - - if buffers.len() != Self::cumulative_nbuffers(array) { - vortex_bail!(InvalidSerde: - "Incorrect number of buffers {}, expected {}", - buffers.len(), - Self::cumulative_nbuffers(array) - ) - } - - Ok(Self { - encoding, - dtype, - array, - buffers, - ctx, - }) - } - - pub fn encoding(&self) -> EncodingRef { - self.encoding - } - - pub fn vtable(&self) -> &dyn EncodingSerde { - self.encoding.serde().unwrap() - } - - pub fn dtype(&self) -> &DType { - self.dtype - } - - pub fn metadata(&self) -> Option<&'a [u8]> { - self.array.metadata().map(|m| m.bytes()) - } - - pub fn nchildren(&self) -> usize { - self.array.children().map(|c| c.len()).unwrap_or_default() - } - - pub fn child(&self, idx: usize, dtype: &'a vortex_schema::DType) -> Option> { - let child = self.array_child(idx)?; - - // Figure out how many buffers to skip... - // We store them depth-first. - let buffer_offset = self - .array - .children()? - .iter() - .take(idx) - .map(|child| Self::cumulative_nbuffers(child)) - .sum(); - let buffer_count = Self::cumulative_nbuffers(child); - - Some( - Self::try_new( - self.ctx, - dtype, - child, - &self.buffers[buffer_offset..][0..buffer_count], - ) - .unwrap(), - ) - } - - fn array_child(&self, idx: usize) -> Option> { - let children = self.array.children()?; - if idx < children.len() { - Some(children.get(idx)) - } else { - None - } - } - - /// The number of buffers used by the current Array. - pub fn nbuffers(&self) -> usize { - self.array.nbuffers() as usize - } - - /// The number of buffers used by the current Array and all its children. - fn cumulative_nbuffers(array: fb::Array) -> usize { - let mut nbuffers = array.nbuffers() as usize; - for child in array.children().unwrap_or_default() { - nbuffers += Self::cumulative_nbuffers(child) - } - nbuffers - } - - pub fn buffers(&self) -> &'a [Buffer] { - // This is only true for the immediate current node? - &self.buffers[0..self.nbuffers()] - } -} - -impl<'a> Array for ArrayView<'a> { - fn as_any(&self) -> &dyn Any { - panic!("Not implemented for ArrayView") - } - - fn into_any(self: Arc) -> Arc { - panic!("Not implemented for ArrayView") - } - - fn to_array(&self) -> ArrayRef { - self.vtable().to_array(self) - } - - fn into_array(self) -> ArrayRef { - // Not much point adding VTable.into_array for ArrayView since everything is by-reference. - self.vtable().to_array(&self) - } - - fn len(&self) -> usize { - self.vtable().len(self) - } - - fn is_empty(&self) -> bool { - todo!() - // self.vtable.is_empty(self).unwrap() - } - - fn dtype(&self) -> &DType { - self.dtype - } - - fn stats(&self) -> Stats { - // TODO(ngates): implement a dynamic trait for stats? - todo!() - } - - fn encoding(&self) -> EncodingRef { - self.encoding - } - - fn nbytes(&self) -> usize { - self.buffers.iter().map(|b| b.len()).sum() - } - - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - self.encoding() - .serde() - .expect("TODO(ngates): heap allocate ArrayView and invoke compute") - .with_view_compute(self, f) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - todo!() - } -} - -impl ArrayValidity for ArrayView<'_> { - fn logical_validity(&self) -> Validity { - todo!() - } - - fn is_valid(&self, _index: usize) -> bool { - todo!() - } -} - -impl<'a> ArrayDisplay for ArrayView<'a> { - fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { - fmt.property("encoding", self.encoding)?; - fmt.property("dtype", self.dtype)?; - fmt.property("metadata", format!("{:?}", self.array.metadata()))?; - fmt.property("nchildren", self.nchildren()) - } -} diff --git a/vortex-array/src/stats.rs b/vortex-array/src/stats.rs index 6eb9f6b601..89b0177159 100644 --- a/vortex-array/src/stats.rs +++ b/vortex-array/src/stats.rs @@ -1,14 +1,10 @@ -use std::cmp::Ordering; -use std::collections::hash_map::Entry; use std::collections::HashMap; -use std::sync::RwLock; -use itertools::Itertools; -use vortex_error::{VortexError, VortexResult}; +use vortex_error::VortexResult; use vortex_schema::DType; use crate::ptype::NativePType; -use crate::scalar::{ListScalarVec, Scalar}; +use crate::scalar::Scalar; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Stat { @@ -24,263 +20,52 @@ pub enum Stat { NullCount, } -#[derive(Debug, Clone, Default)] -pub struct StatsSet(HashMap); - -impl StatsSet { - pub fn new() -> Self { - StatsSet(HashMap::new()) - } - - pub fn from(map: HashMap) -> Self { - StatsSet(map) - } - - pub fn of(stat: Stat, value: Scalar) -> Self { - StatsSet(HashMap::from([(stat, value)])) - } - - fn get_as TryFrom<&'a Scalar, Error = VortexError>>( - &self, - stat: &Stat, - ) -> VortexResult> { - self.0.get(stat).map(|v| T::try_from(v)).transpose() - } - - pub fn set(&mut self, stat: Stat, value: Scalar) { - self.0.insert(stat, value); - } - - pub fn merge(&mut self, other: &Self) -> &Self { - // FIXME(ngates): make adding a new stat a compile error - self.merge_min(other); - self.merge_max(other); - self.merge_is_constant(other); - self.merge_is_sorted(other); - self.merge_true_count(other); - self.merge_null_count(other); - self.merge_bit_width_freq(other); - self.merge_run_count(other); - - self - } - - fn merge_min(&mut self, other: &Self) { - match self.0.entry(Stat::Min) { - Entry::Occupied(mut e) => { - if let Some(omin) = other.0.get(&Stat::Min) { - match omin.partial_cmp(e.get()) { - None => { - e.remove(); - } - Some(Ordering::Less) => { - e.insert(omin.clone()); - } - Some(Ordering::Equal) | Some(Ordering::Greater) => {} - } - } - } - Entry::Vacant(e) => { - if let Some(min) = other.0.get(&Stat::Min) { - e.insert(min.clone()); - } - } - } - } - - fn merge_max(&mut self, other: &Self) { - match self.0.entry(Stat::Max) { - Entry::Occupied(mut e) => { - if let Some(omin) = other.0.get(&Stat::Max) { - match omin.partial_cmp(e.get()) { - None => { - e.remove(); - } - Some(Ordering::Greater) => { - e.insert(omin.clone()); - } - Some(Ordering::Equal) | Some(Ordering::Less) => {} - } - } - } - Entry::Vacant(e) => { - if let Some(min) = other.0.get(&Stat::Max) { - e.insert(min.clone()); - } - } - } - } - - fn merge_is_constant(&mut self, other: &Self) { - if let Some(is_constant) = self.get_as::(&Stat::IsConstant).unwrap() { - if let Some(other_is_constant) = other.get_as::(&Stat::IsConstant).unwrap() { - if is_constant - && other_is_constant - && self.0.get(&Stat::Min) == other.0.get(&Stat::Min) - { - return; - } - } - self.0.insert(Stat::IsConstant, false.into()); - } - } - - fn merge_is_sorted(&mut self, other: &Self) { - if let Some(is_sorted) = self.get_as::(&Stat::IsSorted).unwrap() { - if let Some(other_is_sorted) = other.get_as::(&Stat::IsSorted).unwrap() { - if is_sorted && other_is_sorted && self.0.get(&Stat::Max) <= other.0.get(&Stat::Min) - { - return; - } - } - self.0.insert(Stat::IsSorted, false.into()); - } - } - - fn merge_true_count(&mut self, other: &Self) { - self.merge_scalar_stat(other, &Stat::TrueCount) - } - - fn merge_null_count(&mut self, other: &Self) { - self.merge_scalar_stat(other, &Stat::NullCount) - } - - fn merge_scalar_stat(&mut self, other: &Self, stat: &Stat) { - match self.0.entry(*stat) { - Entry::Occupied(mut e) => { - if let Some(other_value) = other.get_as::(stat).unwrap() { - let self_value: usize = e.get().try_into().unwrap(); - e.insert((self_value + other_value).into()); - } - } - Entry::Vacant(e) => { - if let Some(min) = other.0.get(stat) { - e.insert(min.clone()); - } - } - } - } - - fn merge_bit_width_freq(&mut self, other: &Self) { - match self.0.entry(Stat::BitWidthFreq) { - Entry::Occupied(mut e) => { - if let Some(other_value) = other - .get_as::>(&Stat::BitWidthFreq) - .unwrap() - { - // TODO(robert): Avoid the copy here. We could e.get_mut() but need to figure out casting - let self_value: ListScalarVec = e.get().try_into().unwrap(); - e.insert( - ListScalarVec( - self_value - .0 - .iter() - .zip_eq(other_value.0.iter()) - .map(|(s, o)| *s + *o) - .collect::>(), - ) - .into(), - ); - } - } - Entry::Vacant(e) => { - if let Some(min) = other.0.get(&Stat::BitWidthFreq) { - e.insert(min.clone()); - } - } - } - } - - /// Merged run count is an upper bound where we assume run is interrupted at the boundary - fn merge_run_count(&mut self, other: &Self) { - match self.0.entry(Stat::RunCount) { - Entry::Occupied(mut e) => { - if let Some(other_value) = other.get_as::(&Stat::RunCount).unwrap() { - let self_value: usize = e.get().try_into().unwrap(); - e.insert((self_value + other_value + 1).into()); - } - } - Entry::Vacant(e) => { - if let Some(min) = other.0.get(&Stat::RunCount) { - e.insert(min.clone()); - } - } - } +pub trait ArrayStatistics { + fn statistics(&self) -> &(dyn Statistics + '_) { + &EmptyStatistics } } -pub trait StatsCompute { - fn compute(&self, _stat: &Stat) -> VortexResult { - Ok(StatsSet::new()) +pub trait ArrayStatisticsCompute { + /// Compute the requested statistic. Can return additional stats. + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { + Ok(HashMap::new()) } } -pub struct Stats<'a> { - cache: &'a RwLock, - compute: &'a dyn StatsCompute, +pub trait Statistics { + fn compute(&self, stat: Stat) -> Option; + fn get(&self, stat: Stat) -> Option; + fn set(&self, stat: Stat, value: Scalar); + fn to_map(&self) -> HashMap; } -impl<'a> Stats<'a> { - pub fn new(cache: &'a RwLock, compute: &'a dyn StatsCompute) -> Self { - Self { cache, compute } - } - - pub fn set_many(&self, other: &Stats, stats: Vec<&Stat>) { - stats.into_iter().for_each(|stat| { - if let Some(v) = other.get(stat) { - self.cache.write().unwrap().set(*stat, v) - } - }); +impl dyn Statistics + '_ { + pub fn compute_as>(&self, stat: Stat) -> Option { + self.compute(stat).and_then(|s| T::try_from(s).ok()) } - pub fn set(&self, stat: Stat, value: Scalar) { - self.cache.write().unwrap().set(stat, value); + pub fn compute_as_cast(&self, stat: Stat) -> Option { + self.compute(stat) + .and_then(|s| s.cast(&DType::from(T::PTYPE)).ok()) + .and_then(|s| T::try_from(s).ok()) } - pub fn get_all(&self) -> StatsSet { - self.cache.read().unwrap().clone() + pub fn get_as>(&self, stat: Stat) -> Option { + self.get(stat).and_then(|s| T::try_from(s).ok()) } +} - pub fn get(&self, stat: &Stat) -> Option { - self.cache.read().unwrap().0.get(stat).cloned() - } - - pub fn get_as>(&self, stat: &Stat) -> Option { - self.get(stat).map(|v| T::try_from(v).unwrap()) - } - - pub fn get_or_compute(&self, stat: &Stat) -> Option { - if let Some(value) = self.cache.read().unwrap().0.get(stat) { - return Some(value.clone()); - } - - self.cache - .write() - .unwrap() - .0 - .extend(self.compute.compute(stat).unwrap().0); - self.get(stat) +pub struct EmptyStatistics; +impl Statistics for EmptyStatistics { + fn compute(&self, _stat: Stat) -> Option { + None } - - pub fn get_or_compute_cast(&self, stat: &Stat) -> Option { - self.get_or_compute(stat) - // TODO(ngates): fix the API so we don't convert the result to optional - .and_then(|v: Scalar| v.cast(&DType::from(T::PTYPE)).ok()) - .and_then(|v| T::try_from(v).ok()) + fn get(&self, _stat: Stat) -> Option { + None } - - pub fn get_or_compute_as>( - &self, - stat: &Stat, - ) -> Option { - self.get_or_compute(stat).and_then(|v| T::try_from(v).ok()) - } - - pub fn get_or_compute_or>( - &self, - default: T, - stat: &Stat, - ) -> T { - self.get_or_compute_as(stat).unwrap_or(default) + fn set(&self, _stat: Stat, _value: Scalar) {} + fn to_map(&self) -> HashMap { + HashMap::default() } } diff --git a/vortex-array2/src/tree.rs b/vortex-array/src/tree.rs similarity index 65% rename from vortex-array2/src/tree.rs rename to vortex-array/src/tree.rs index f9f554e9ed..4aaf8871ca 100644 --- a/vortex-array2/src/tree.rs +++ b/vortex-array/src/tree.rs @@ -4,9 +4,11 @@ use humansize::{format_size, DECIMAL}; use serde::ser::Error; use vortex_error::{VortexError, VortexResult}; +use crate::array::chunked::ChunkedArray; +use crate::array::r#struct::StructArray; use crate::buffer::Buffer; use crate::visitor::ArrayVisitor; -use crate::Array; +use crate::{Array, ToArrayData}; impl Array<'_> { pub fn tree_display(&self) -> TreeDisplayWrapper { @@ -24,8 +26,7 @@ impl<'a> TreeDisplayWrapper<'a> { impl<'a, 'fmt: 'a> fmt::Display for TreeDisplayWrapper<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let array = self.0; - let nbytes = array.with_dyn(|a| a.nbytes()); - let mut array_fmt = TreeFormatter::new(f, "".to_string(), nbytes); + let mut array_fmt = TreeFormatter::new(f, "".to_string()); array_fmt .visit_child("root", array) .map_err(fmt::Error::custom) @@ -35,7 +36,7 @@ impl<'a, 'fmt: 'a> fmt::Display for TreeDisplayWrapper<'a> { pub struct TreeFormatter<'a, 'b: 'a> { fmt: &'a mut fmt::Formatter<'b>, indent: String, - total_size: usize, + total_size: Option, } /// TODO(ngates): I think we want to go back to the old explicit style. It gives arrays more @@ -44,6 +45,7 @@ impl<'a, 'b: 'a> ArrayVisitor for TreeFormatter<'a, 'b> { fn visit_child(&mut self, name: &str, array: &Array) -> VortexResult<()> { array.with_dyn(|a| { let nbytes = a.nbytes(); + let total_size = self.total_size.unwrap_or(nbytes); writeln!( self.fmt, "{}{}: {} nbytes={} ({:.2}%)", @@ -51,10 +53,31 @@ impl<'a, 'b: 'a> ArrayVisitor for TreeFormatter<'a, 'b> { name, array, format_size(nbytes, DECIMAL), - 100f64 * nbytes as f64 / self.total_size as f64 + 100f64 * nbytes as f64 / total_size as f64 )?; + self.indent(|i| { + writeln!( + i.fmt, + // TODO(ngates): use Display for metadata + "{}metadata: {:?}", + i.indent, + array.to_array_data().metadata() + ) + })?; + + let old_total_size = self.total_size; + if ChunkedArray::try_from(array).is_ok() || StructArray::try_from(array).is_ok() { + // Clear the total size so each chunk is treated as a new root. + self.total_size = None + } else { + self.total_size = Some(total_size); + } + self.indent(|i| a.accept(i).map_err(fmt::Error::custom)) - .map_err(VortexError::from) + .map_err(VortexError::from)?; + + self.total_size = old_total_size; + Ok(()) }) } @@ -69,15 +92,11 @@ impl<'a, 'b: 'a> ArrayVisitor for TreeFormatter<'a, 'b> { } impl<'a, 'b: 'a> TreeFormatter<'a, 'b> { - fn new( - fmt: &'a mut fmt::Formatter<'b>, - indent: String, - total_size: usize, - ) -> TreeFormatter<'a, 'b> { + fn new(fmt: &'a mut fmt::Formatter<'b>, indent: String) -> TreeFormatter<'a, 'b> { TreeFormatter { fmt, indent, - total_size, + total_size: None, } } @@ -91,16 +110,4 @@ impl<'a, 'b: 'a> TreeFormatter<'a, 'b> { self.indent = original_ident; res } - - #[allow(dead_code)] - pub fn new_total_size(&mut self, total: usize, new_total: F) -> fmt::Result - where - F: FnOnce(&mut TreeFormatter) -> fmt::Result, - { - let original_total = self.total_size; - self.total_size = total; - let res = new_total(self); - self.total_size = original_total; - res - } } diff --git a/vortex-array/src/typed.rs b/vortex-array/src/typed.rs new file mode 100644 index 0000000000..26ff5b2fa1 --- /dev/null +++ b/vortex-array/src/typed.rs @@ -0,0 +1,92 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use vortex_error::{vortex_err, VortexError, VortexResult}; +use vortex_schema::DType; + +use crate::buffer::OwnedBuffer; +use crate::scalar::Scalar; +use crate::stats::Stat; +use crate::{Array, ArrayData, ArrayDef, AsArray, IntoArray, ToArray, TryDeserializeArrayMetadata}; + +#[derive(Debug, Clone)] +pub struct TypedArray<'a, D: ArrayDef> { + array: Array<'a>, + metadata: D::Metadata, +} + +impl TypedArray<'_, D> { + pub fn try_from_parts( + dtype: DType, + metadata: D::Metadata, + buffer: Option, + children: Arc<[ArrayData]>, + stats: HashMap, + ) -> VortexResult { + let array = Array::Data(ArrayData::try_new( + D::ENCODING, + dtype, + Arc::new(metadata.clone()), + buffer, + children, + stats, + )?); + Ok(Self { array, metadata }) + } + + pub fn metadata(&self) -> &D::Metadata { + &self.metadata + } +} + +impl<'a, 'b, D: ArrayDef> TypedArray<'b, D> { + pub fn array(&'a self) -> &'a Array<'b> { + &self.array + } +} + +impl<'a, D: ArrayDef> TryFrom> for TypedArray<'a, D> { + type Error = VortexError; + + fn try_from(array: Array<'a>) -> Result { + if array.encoding().id() != D::ENCODING.id() { + return Err(vortex_err!("incorrect encoding")); + } + let metadata = match &array { + Array::Data(d) => d + .metadata() + .as_any() + .downcast_ref::() + .unwrap() + .clone(), + Array::View(v) => D::Metadata::try_deserialize_metadata(v.metadata())?, + }; + Ok(TypedArray { array, metadata }) + } +} + +impl<'a, D: ArrayDef> TryFrom<&'a Array<'a>> for TypedArray<'a, D> { + type Error = VortexError; + + fn try_from(value: &'a Array<'a>) -> Result { + value.clone().try_into() + } +} + +impl<'a, D: ArrayDef> AsArray for TypedArray<'a, D> { + fn as_array_ref(&self) -> &Array { + &self.array + } +} + +impl ToArray for TypedArray<'_, D> { + fn to_array(&self) -> Array { + self.array.clone() + } +} + +impl<'a, D: ArrayDef> IntoArray<'a> for TypedArray<'a, D> { + fn into_array(self) -> Array<'a> { + self.array + } +} diff --git a/vortex-array2/src/validity.rs b/vortex-array/src/validity.rs similarity index 71% rename from vortex-array2/src/validity.rs rename to vortex-array/src/validity.rs index 9588c541b6..6a38b63d9d 100644 --- a/vortex-array2/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -4,10 +4,11 @@ use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, Nullability}; use crate::array::bool::BoolArray; +use crate::compute::as_contiguous::as_contiguous; use crate::compute::scalar_at::scalar_at; use crate::compute::slice::slice; use crate::compute::take::take; -use crate::{Array, ArrayData, IntoArray, IntoArrayData, ToArray, ToArrayData}; +use crate::{Array, ArrayData, IntoArray, IntoArrayData, OwnedArray, ToArray, ToArrayData}; pub trait ArrayValidity { fn is_valid(&self, index: usize) -> bool; @@ -179,15 +180,30 @@ impl From for OwnedValidity { } } -impl<'a> FromIterator> for OwnedValidity { - fn from_iter>>(_iter: T) -> Self { - todo!() - } -} - impl FromIterator for OwnedValidity { - fn from_iter>(_iter: T) -> Self { - todo!() + fn from_iter>(iter: T) -> Self { + let validities: Vec = iter.into_iter().collect(); + + // If they're all valid, then return a single validity. + if validities.iter().all(|v| v.is_all_valid()) { + return Self::AllValid; + } + // If they're all invalid, then return a single invalidity. + if validities.iter().all(|v| v.is_all_invalid()) { + return Self::AllInvalid; + } + + // Otherwise, map each to a bool array and concatenate them. + let arrays = validities + .iter() + .map(|v| { + v.to_present_null_buffer() + .unwrap() + .into_array_data() + .into_array() + }) + .collect::>(); + Self::Array(as_contiguous(&arrays).unwrap()) } } @@ -216,7 +232,55 @@ impl LogicalValidity { } } + pub fn to_present_null_buffer(&self) -> VortexResult { + match self { + LogicalValidity::AllValid(l) => Ok(NullBuffer::new_valid(*l)), + LogicalValidity::AllInvalid(l) => Ok(NullBuffer::new_null(*l)), + LogicalValidity::Array(a) => Ok(NullBuffer::new( + a.to_array().flatten_bool()?.boolean_buffer(), + )), + } + } + pub fn is_all_valid(&self) -> bool { matches!(self, LogicalValidity::AllValid(_)) } + + pub fn is_all_invalid(&self) -> bool { + matches!(self, LogicalValidity::AllInvalid(_)) + } + + pub fn len(&self) -> usize { + match self { + LogicalValidity::AllValid(n) => *n, + LogicalValidity::AllInvalid(n) => *n, + LogicalValidity::Array(a) => a.to_array().len(), + } + } + + pub fn is_empty(&self) -> bool { + match self { + LogicalValidity::AllValid(n) => *n == 0, + LogicalValidity::AllInvalid(n) => *n == 0, + LogicalValidity::Array(a) => a.to_array().is_empty(), + } + } + + pub fn into_validity<'a>(self) -> Validity<'a> { + match self { + LogicalValidity::AllValid(_) => Validity::AllValid, + LogicalValidity::AllInvalid(_) => Validity::AllInvalid, + LogicalValidity::Array(a) => Validity::Array(a.into_array()), + } + } +} + +impl IntoArray<'static> for LogicalValidity { + fn into_array(self) -> OwnedArray { + match self { + LogicalValidity::AllValid(len) => BoolArray::from(vec![true; len]).into_array(), + LogicalValidity::AllInvalid(len) => BoolArray::from(vec![false; len]).into_array(), + LogicalValidity::Array(a) => a.into_array(), + } + } } diff --git a/vortex-array/src/validity/array.rs b/vortex-array/src/validity/array.rs deleted file mode 100644 index 9af078eb77..0000000000 --- a/vortex-array/src/validity/array.rs +++ /dev/null @@ -1,26 +0,0 @@ -use crate::array::Array; -use crate::validity::owned::Validity; -use crate::validity::ValidityView; - -pub trait ArrayValidity { - fn logical_validity(&self) -> Validity; - - fn is_valid(&self, index: usize) -> bool; -} - -pub trait OwnedValidity { - fn validity(&self) -> Option; -} - -impl ArrayValidity for T { - fn logical_validity(&self) -> Validity { - self.validity() - .and_then(|v| v.logical_validity()) - .unwrap_or_else(|| Validity::Valid(self.len())) - } - - fn is_valid(&self, index: usize) -> bool { - self.validity() - .map_or(true, |v| ValidityView::is_valid(&v, index)) - } -} diff --git a/vortex-array/src/validity/encoding.rs b/vortex-array/src/validity/encoding.rs deleted file mode 100644 index 9449f4ba0f..0000000000 --- a/vortex-array/src/validity/encoding.rs +++ /dev/null @@ -1,32 +0,0 @@ -use linkme::distributed_slice; -use vortex_error::VortexResult; - -use crate::array::ArrayRef; -use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; -use crate::serde::{EncodingSerde, ReadCtx}; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_VALIDITY: EncodingRef = &ValidityEncoding; - -#[derive(Debug)] -pub struct ValidityEncoding; - -impl ValidityEncoding { - const ID: EncodingId = EncodingId::new("vortex.validity"); -} - -impl Encoding for ValidityEncoding { - fn id(&self) -> EncodingId { - ValidityEncoding::ID - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} - -impl EncodingSerde for ValidityEncoding { - fn read(&self, _ctx: &mut ReadCtx) -> VortexResult { - todo!() - } -} diff --git a/vortex-array/src/validity/mod.rs b/vortex-array/src/validity/mod.rs deleted file mode 100644 index 30cf4fc5a4..0000000000 --- a/vortex-array/src/validity/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -pub use array::*; -pub use encoding::*; -pub use owned::*; -pub use view::*; - -mod array; -mod encoding; -mod owned; -mod view; diff --git a/vortex-array/src/validity/owned.rs b/vortex-array/src/validity/owned.rs deleted file mode 100644 index 369b3fd348..0000000000 --- a/vortex-array/src/validity/owned.rs +++ /dev/null @@ -1,257 +0,0 @@ -use std::any::Any; -use std::sync::Arc; - -use arrow_buffer::{BooleanBuffer, NullBuffer}; -use itertools::Itertools; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, Nullability}; - -use crate::array::bool::BoolArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::as_contiguous::as_contiguous; -use crate::compute::slice::SliceFn; -use crate::compute::ArrayCompute; -use crate::encoding::EncodingRef; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, WriteCtx}; -use crate::stats::Stats; -use crate::validity::{ArrayValidity, ValidityEncoding}; -use crate::view::AsView; -use crate::ArrayWalker; - -#[derive(Debug, Clone)] -pub enum Validity { - Valid(usize), - Invalid(usize), - Array(ArrayRef), -} - -impl Validity { - pub const DTYPE: DType = DType::Bool(Nullability::NonNullable); - - pub fn array(array: ArrayRef) -> VortexResult { - if !matches!(array.dtype(), &Validity::DTYPE) { - vortex_bail!("Validity array must be of type bool"); - } - Ok(Self::Array(array)) - } - - pub fn try_from_logical( - logical: Validity, - nullability: Nullability, - ) -> VortexResult> { - match nullability { - Nullability::NonNullable => { - if !logical.as_view().all_valid() { - vortex_bail!("Non-nullable validity must be all valid"); - } - Ok(None) - } - Nullability::Nullable => Ok(Some(logical)), - } - } - - pub fn to_bool_array(&self) -> BoolArray { - self.as_view().to_bool_array() - } - - pub fn logical_validity(&self) -> Validity { - if self.as_view().all_valid() { - return Validity::Valid(self.len()); - } - if self.as_view().all_invalid() { - return Validity::Invalid(self.len()); - } - self.clone() - } - - #[allow(clippy::len_without_is_empty)] - pub fn len(&self) -> usize { - self.as_view().len() - } - - pub fn slice(&self, start: usize, stop: usize) -> VortexResult { - self.as_view().slice(start, stop) - } -} - -impl From for Validity { - fn from(value: NullBuffer) -> Self { - if value.null_count() == 0 { - Self::Valid(value.len()) - } else if value.null_count() == value.len() { - Self::Invalid(value.len()) - } else { - Self::Array(BoolArray::new(value.into_inner(), None).into_array()) - } - } -} - -impl From for Validity { - fn from(value: BooleanBuffer) -> Self { - if value.iter().all(|v| v) { - Self::Valid(value.len()) - } else if value.iter().all(|v| !v) { - Self::Invalid(value.len()) - } else { - Self::Array(BoolArray::new(value, None).into_array()) - } - } -} - -impl From> for Validity { - fn from(value: Vec) -> Self { - if value.iter().all(|v| *v) { - Self::Valid(value.len()) - } else if value.iter().all(|v| !*v) { - Self::Invalid(value.len()) - } else { - Self::Array(BoolArray::from(value).into_array()) - } - } -} - -impl PartialEq for Validity { - fn eq(&self, other: &Self) -> bool { - if self.len() != other.len() { - return false; - } - - match (self, other) { - (Self::Valid(_), Self::Valid(_)) => true, - (Self::Invalid(_), Self::Invalid(_)) => true, - _ => { - // TODO(ngates): use compute to dispatch an all() function. - self.to_bool_array().buffer() == other.to_bool_array().buffer() - } - } - } -} - -impl Eq for Validity {} - -impl FromIterator for Validity { - fn from_iter>(iter: T) -> Self { - let validities: Vec = iter.into_iter().collect(); - let total_len = validities.iter().map(|v| v.len()).sum(); - - // If they're all valid, then return a single validity. - if validities.iter().all(|v| v.as_view().all_valid()) { - return Self::Valid(total_len); - } - // If they're all invalid, then return a single invalidity. - if validities.iter().all(|v| v.as_view().all_invalid()) { - return Self::Invalid(total_len); - } - - // Otherwise, map each to a bool array and concatenate them. - let arrays = validities - .iter() - .map(|v| v.to_bool_array().into_array()) - .collect_vec(); - Self::Array(as_contiguous(&arrays).unwrap()) - } -} - -impl Array for Validity { - fn as_any(&self) -> &dyn Any { - todo!() - } - - fn into_any(self: Arc) -> Arc { - todo!() - } - - fn to_array(&self) -> ArrayRef { - todo!() - } - - fn into_array(self) -> ArrayRef { - todo!() - } - - fn len(&self) -> usize { - self.len() - } - - fn is_empty(&self) -> bool { - match self { - Validity::Valid(len) | Validity::Invalid(len) => *len == 0, - Validity::Array(a) => a.is_empty(), - } - } - - fn dtype(&self) -> &DType { - &Validity::DTYPE - } - - fn stats(&self) -> Stats { - todo!() - } - - fn encoding(&self) -> EncodingRef { - &ValidityEncoding - } - - fn nbytes(&self) -> usize { - match self { - Validity::Valid(_) | Validity::Invalid(_) => 8, - Validity::Array(a) => a.nbytes(), - } - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - Ok(()) - } -} - -impl ArrayValidity for Validity { - fn logical_validity(&self) -> Validity { - // Validity is a non-nullable boolean array. - Validity::Valid(self.len()) - } - - fn is_valid(&self, _index: usize) -> bool { - true - } -} - -impl ArrayDisplay for Validity { - fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { - self.as_view().fmt(fmt) - } -} - -impl ArrayCompute for Validity { - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } -} - -impl ArraySerde for Validity { - fn write(&self, _ctx: &mut WriteCtx) -> VortexResult<()> { - todo!() - } - - fn metadata(&self) -> VortexResult>> { - self.as_view().serde().unwrap().metadata() - } -} - -impl SliceFn for Validity { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(Arc::new(self.as_view().slice(start, stop)?)) - } -} diff --git a/vortex-array/src/validity/view.rs b/vortex-array/src/validity/view.rs deleted file mode 100644 index 79f3a79ad7..0000000000 --- a/vortex-array/src/validity/view.rs +++ /dev/null @@ -1,263 +0,0 @@ -use std::any::Any; -use std::sync::Arc; - -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::array::bool::BoolArray; -use crate::array::constant::ConstantArray; -use crate::array::{Array, ArrayRef}; -use crate::compute::flatten::flatten_bool; -use crate::compute::scalar_at::scalar_at; -use crate::compute::slice::{slice, SliceFn}; -use crate::compute::take::take; -use crate::compute::ArrayCompute; -use crate::encoding::EncodingRef; -use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, ArrayView, WriteCtx}; -use crate::stats::{Stat, Stats}; -use crate::validity::owned::Validity; -use crate::validity::{ArrayValidity, ValidityEncoding}; -use crate::view::{AsView, ToOwnedView}; -use crate::ArrayWalker; - -#[derive(Debug, Clone)] -pub enum ValidityView<'a> { - Valid(usize), - Invalid(usize), - Array(&'a dyn Array), -} - -impl<'v> AsView<'v, ValidityView<'v>> for Validity { - fn as_view(&'v self) -> ValidityView<'v> { - match self { - Self::Valid(len) => ValidityView::Valid(*len), - Self::Invalid(len) => ValidityView::Invalid(*len), - Self::Array(a) => ValidityView::Array(a.as_ref()), - } - } -} - -impl<'v> ToOwnedView<'v> for ValidityView<'v> { - type Owned = Validity; - - fn to_owned_view(&self) -> Self::Owned { - match self { - Self::Valid(len) => Validity::Valid(*len), - Self::Invalid(len) => Validity::Invalid(*len), - Self::Array(a) => Validity::Array(a.to_array()), - } - } -} - -impl ValidityView<'_> { - pub fn len(&self) -> usize { - match self { - Self::Valid(len) | Self::Invalid(len) => *len, - Self::Array(a) => a.len(), - } - } - - pub fn is_empty(&self) -> bool { - match self { - Self::Valid(len) | Self::Invalid(len) => *len == 0, - Self::Array(a) => a.is_empty(), - } - } - - pub fn all_valid(&self) -> bool { - match self { - Self::Valid(_) => true, - Self::Invalid(_) => false, - Self::Array(a) => a - .stats() - .get_or_compute_as::(&Stat::TrueCount) - .map(|true_count| true_count == self.len()) - .unwrap_or(false), - } - } - - pub fn all_invalid(&self) -> bool { - match self { - Self::Valid(_) => false, - Self::Invalid(_) => true, - Self::Array(a) => a - .stats() - .get_or_compute_as::(&Stat::TrueCount) - .map(|true_count| true_count == 0) - .unwrap_or(false), - } - } - - pub fn to_array(&self) -> ArrayRef { - match self { - Self::Valid(len) => ConstantArray::new(true, *len).into_array(), - Self::Invalid(len) => ConstantArray::new(false, *len).into_array(), - Self::Array(a) => a.to_array(), - } - } - - pub fn to_bool_array(&self) -> BoolArray { - match self { - Self::Valid(len) => BoolArray::from(vec![true; *len]), - Self::Invalid(len) => BoolArray::from(vec![false; *len]), - Self::Array(a) => flatten_bool(*a).unwrap(), - } - } - - pub fn logical_validity(&self) -> Option { - match self.all_valid() { - true => None, - false => Some(self.to_owned_view()), - } - } - - pub fn is_valid(&self, idx: usize) -> bool { - match self { - Self::Valid(_) => true, - Self::Invalid(_) => false, - Self::Array(a) => scalar_at(*a, idx).and_then(|s| s.try_into()).unwrap(), - } - } - - pub fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(match self { - Self::Valid(_) => Validity::Valid(stop - start), - Self::Invalid(_) => Validity::Invalid(stop - start), - Self::Array(a) => Validity::Array(slice(*a, start, stop)?), - }) - } - - pub fn take(&self, indices: &dyn Array) -> VortexResult { - match self { - Self::Valid(_) => Ok(Validity::Valid(indices.len())), - Self::Invalid(_) => Ok(Validity::Invalid(indices.len())), - Self::Array(a) => Ok(Validity::Array(take(*a, indices)?)), - } - } - - pub fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } -} - -impl<'a> From> for ValidityView<'a> { - fn from(_value: ArrayView<'a>) -> Self { - // FIXME(ngates): parse the metadata, and return the appropriate ValidityView - ValidityView::Valid(100) - } -} - -impl Array for ValidityView<'_> { - fn as_any(&self) -> &dyn Any { - todo!() - } - - fn into_any(self: Arc) -> Arc { - todo!() - } - - fn to_array(&self) -> ArrayRef { - todo!() - } - - fn into_array(self) -> ArrayRef { - todo!() - } - - fn len(&self) -> usize { - match self { - ValidityView::Valid(len) | ValidityView::Invalid(len) => *len, - ValidityView::Array(a) => a.len(), - } - } - - fn is_empty(&self) -> bool { - match self { - ValidityView::Valid(len) | ValidityView::Invalid(len) => *len == 0, - ValidityView::Array(a) => a.is_empty(), - } - } - - fn dtype(&self) -> &DType { - &Validity::DTYPE - } - - fn stats(&self) -> Stats { - todo!() - } - - fn encoding(&self) -> EncodingRef { - &ValidityEncoding - } - - fn nbytes(&self) -> usize { - match self { - ValidityView::Valid(_) | ValidityView::Invalid(_) => 8, - ValidityView::Array(a) => a.nbytes(), - } - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - Ok(()) - } -} - -impl ArrayValidity for ValidityView<'_> { - fn logical_validity(&self) -> Validity { - // Validity is a non-nullable boolean array. - Validity::Valid(self.len()) - } - - fn is_valid(&self, _index: usize) -> bool { - true - } -} - -impl ArrayDisplay for ValidityView<'_> { - fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { - match self { - ValidityView::Valid(_) => fmt.property("all", "valid"), - ValidityView::Invalid(_) => fmt.property("all", "invalid"), - ValidityView::Array(a) => fmt.child("validity", *a), - } - } -} - -impl ArrayCompute for ValidityView<'_> { - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } -} - -impl ArraySerde for ValidityView<'_> { - fn write(&self, _ctx: &mut WriteCtx) -> VortexResult<()> { - todo!() - } - - fn metadata(&self) -> VortexResult>> { - // TODO: Implement this - Ok(None) - } -} - -impl SliceFn for ValidityView<'_> { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(Arc::new(self.slice(start, stop)?)) - } -} diff --git a/vortex-array/src/vendored.rs b/vortex-array/src/vendored.rs new file mode 100644 index 0000000000..31e6999145 --- /dev/null +++ b/vortex-array/src/vendored.rs @@ -0,0 +1 @@ +pub use paste; diff --git a/vortex-array/src/view.rs b/vortex-array/src/view.rs index 14fce23588..eeec5cd974 100644 --- a/vortex-array/src/view.rs +++ b/vortex-array/src/view.rs @@ -1,31 +1,153 @@ -// Similar trait to Borrow, except can return a struct with a lifetime. -pub trait AsView<'v, View: Sized + 'v> { - fn as_view(&'v self) -> View; -} +use std::fmt::{Debug, Formatter}; + +use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult}; +use vortex_schema::DType; -pub trait ToOwnedView<'v>: Sized -where - Self: 'v, -{ - type Owned: AsView<'v, Self>; +use crate::buffer::Buffer; +use crate::encoding::EncodingRef; +use crate::flatbuffers::array as fb; +use crate::stats::{EmptyStatistics, Statistics}; +use crate::SerdeContext; +use crate::{Array, IntoArray, ToArray}; - fn to_owned_view(&'v self) -> Self::Owned; +#[derive(Clone)] +pub struct ArrayView<'v> { + encoding: EncodingRef, + dtype: &'v DType, + array: fb::Array<'v>, + buffers: &'v [Buffer<'v>], + ctx: &'v SerdeContext, + // TODO(ngates): a store a Projection. A projected ArrayView contains the full fb::Array + // metadata, but only the buffers from the selected columns. Therefore we need to know + // which fb:Array children to skip when calculating how to slice into buffers. } -impl<'v, View: 'v, Owned: AsView<'v, View>> AsView<'v, Option> for Option { - fn as_view(&'v self) -> Option { - self.as_ref().map(|owned| owned.as_view()) +impl<'a> Debug for ArrayView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ArrayView") + .field("encoding", &self.encoding) + .field("dtype", self.dtype) + // .field("array", &self.array) + .field("buffers", &self.buffers) + .field("ctx", &self.ctx) + .finish() } } -impl<'v, Owned, View> ToOwnedView<'v> for Option -where - View: ToOwnedView<'v, Owned = Owned> + 'v, - Owned: AsView<'v, View>, -{ - type Owned = Option; +impl<'v> ArrayView<'v> { + pub fn try_new( + ctx: &'v SerdeContext, + dtype: &'v DType, + array: fb::Array<'v>, + buffers: &'v [Buffer], + ) -> VortexResult { + let encoding = ctx + .find_encoding(array.encoding()) + .ok_or_else(|| vortex_err!(InvalidSerde: "Encoding ID out of bounds"))?; + + if buffers.len() != Self::cumulative_nbuffers(array) { + vortex_bail!(InvalidSerde: + "Incorrect number of buffers {}, expected {}", + buffers.len(), + Self::cumulative_nbuffers(array) + ) + } + + let view = Self { + encoding, + dtype, + array, + buffers, + ctx, + }; + + // Validate here that the metadata correctly parses, so that an encoding can infallibly + // implement Encoding::with_view(). + // FIXME(ngates): validate the metadata + view.to_array().with_dyn(|_| Ok::<(), VortexError>(()))?; + + Ok(view) + } + + pub fn encoding(&self) -> EncodingRef { + self.encoding + } + + pub fn dtype(&self) -> &DType { + self.dtype + } + + pub fn metadata(&self) -> Option<&'v [u8]> { + self.array.metadata().map(|m| m.bytes()) + } + + // TODO(ngates): should we separate self and DType lifetimes? Should DType be cloned? + pub fn child(&'v self, idx: usize, dtype: &'v DType) -> Option> { + let child = self.array_child(idx)?; + + // Figure out how many buffers to skip... + // We store them depth-first. + let buffer_offset = self + .array + .children()? + .iter() + .take(idx) + .map(|child| Self::cumulative_nbuffers(child)) + .sum(); + let buffer_count = Self::cumulative_nbuffers(child); + + Some( + Self::try_new( + self.ctx, + dtype, + child, + &self.buffers[buffer_offset..][0..buffer_count], + ) + .unwrap(), + ) + } + + fn array_child(&self, idx: usize) -> Option> { + let children = self.array.children()?; + if idx < children.len() { + Some(children.get(idx)) + } else { + None + } + } + + /// Whether the current Array makes use of a buffer + pub fn has_buffer(&self) -> bool { + self.array.has_buffer() + } + + /// The number of buffers used by the current Array and all its children. + fn cumulative_nbuffers(array: fb::Array) -> usize { + let mut nbuffers = if array.has_buffer() { 1 } else { 0 }; + for child in array.children().unwrap_or_default() { + nbuffers += Self::cumulative_nbuffers(child) + } + nbuffers + } + + pub fn buffer(&self) -> Option<&'v Buffer<'v>> { + self.has_buffer().then(|| &self.buffers[0]) + } + + pub fn statistics(&self) -> &dyn Statistics { + // TODO(ngates): store statistics in FlatBuffers + &EmptyStatistics + } +} + +impl ToArray for ArrayView<'_> { + fn to_array(&self) -> Array { + Array::View(self.clone()) + } +} - fn to_owned_view(&'v self) -> Self::Owned { - self.as_ref().map(|view| view.to_owned_view()) +impl<'v> IntoArray<'v> for ArrayView<'v> { + fn into_array(self) -> Array<'v> { + Array::View(self) } } diff --git a/vortex-array2/src/visitor.rs b/vortex-array/src/visitor.rs similarity index 100% rename from vortex-array2/src/visitor.rs rename to vortex-array/src/visitor.rs diff --git a/vortex-array/src/walk.rs b/vortex-array/src/walk.rs deleted file mode 100644 index 251b858119..0000000000 --- a/vortex-array/src/walk.rs +++ /dev/null @@ -1,10 +0,0 @@ -use arrow_buffer::Buffer; -use vortex_error::VortexResult; - -use crate::array::Array; - -pub trait ArrayWalker { - fn visit_child(&mut self, array: &dyn Array) -> VortexResult<()>; - - fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()>; -} diff --git a/vortex-array2/Cargo.toml b/vortex-array2/Cargo.toml deleted file mode 100644 index 891caeafcf..0000000000 --- a/vortex-array2/Cargo.toml +++ /dev/null @@ -1,34 +0,0 @@ -[package] -name = "vortex-array2" -version = { workspace = true } -description = "Vortex in memory columnar data format" -homepage = { workspace = true } -repository = { workspace = true } -authors = { workspace = true } -license = { workspace = true } -keywords = { workspace = true } -include = { workspace = true } -edition = { workspace = true } -rust-version = { workspace = true } - -[dependencies] -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-schema = { workspace = true } -flatbuffers = { workspace = true } -flexbuffers = { workspace = true } -half = { workspace = true } -humansize = { workspace = true } -itertools = { workspace = true } -linkme = { workspace = true } -log = { workspace = true } -num-traits = { workspace = true } -paste = { workspace = true } -serde = { workspace = true, features = ["derive"] } -vortex-array = { path = "../vortex-array" } -vortex-error = { path = "../vortex-error", features = ["flexbuffers"] } -vortex-flatbuffers = { path = "../vortex-flatbuffers" } -vortex-schema = { path = "../vortex-schema", features = ["serde"] } - -[lints] -workspace = true diff --git a/vortex-array2/src/accessor.rs b/vortex-array2/src/accessor.rs deleted file mode 100644 index 6688dec15f..0000000000 --- a/vortex-array2/src/accessor.rs +++ /dev/null @@ -1,10 +0,0 @@ -use vortex_error::VortexResult; - -pub trait ArrayAccessor { - type Item<'a>; - - fn with_iterator FnOnce(&mut dyn Iterator>) -> R, R>( - &self, - f: F, - ) -> VortexResult; -} diff --git a/vortex-array2/src/array/bool/compute/mod.rs b/vortex-array2/src/array/bool/compute/mod.rs deleted file mode 100644 index f804de0538..0000000000 --- a/vortex-array2/src/array/bool/compute/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -use crate::array::bool::BoolArray; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::fill::FillForwardFn; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; - -mod as_arrow; -mod as_contiguous; -mod fill; -mod flatten; -mod scalar_at; -mod take; - -impl ArrayCompute for BoolArray<'_> { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - Some(self) - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} diff --git a/vortex-array2/src/array/bool/compute/take.rs b/vortex-array2/src/array/bool/compute/take.rs deleted file mode 100644 index 7b4f16024d..0000000000 --- a/vortex-array2/src/array/bool/compute/take.rs +++ /dev/null @@ -1,59 +0,0 @@ -use arrow_buffer::BooleanBuffer; -use num_traits::AsPrimitive; -use vortex::match_each_integer_ptype; -use vortex_error::VortexResult; - -use crate::array::bool::BoolArray; -use crate::compute::take::TakeFn; -use crate::IntoArray; -use crate::{Array, OwnedArray}; - -impl TakeFn for BoolArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - let validity = self.validity(); - let indices = indices.clone().flatten_primitive()?; - match_each_integer_ptype!(indices.ptype(), |$I| { - Ok(BoolArray::from_vec( - take_bool(&self.boolean_buffer(), indices.typed_data::<$I>()), - validity.take(indices.array())?, - ).into_array()) - }) - } -} - -fn take_bool>(bools: &BooleanBuffer, indices: &[I]) -> Vec { - indices.iter().map(|&idx| bools.value(idx.as_())).collect() -} - -#[cfg(test)] -mod test { - use crate::array::bool::BoolArray; - use crate::array::primitive::PrimitiveArray; - use crate::compute::take::take; - use crate::IntoArray; - - #[test] - fn take_nullable() { - let reference = BoolArray::from_iter(vec![ - Some(false), - Some(true), - Some(false), - None, - Some(false), - ]) - .into_array(); - - let b = BoolArray::try_from( - take( - &reference, - &PrimitiveArray::from(vec![0, 3, 4]).into_array(), - ) - .unwrap(), - ) - .unwrap(); - assert_eq!( - b.boolean_buffer(), - BoolArray::from_iter(vec![Some(false), None, Some(false)]).boolean_buffer() - ); - } -} diff --git a/vortex-array2/src/array/bool/mod.rs b/vortex-array2/src/array/bool/mod.rs deleted file mode 100644 index cc8ad0f829..0000000000 --- a/vortex-array2/src/array/bool/mod.rs +++ /dev/null @@ -1,135 +0,0 @@ -mod compute; -mod stats; - -use std::collections::HashMap; - -use arrow_buffer::BooleanBuffer; -use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::buffer::Buffer; -use crate::validity::{ArrayValidity, ValidityMetadata}; -use crate::validity::{LogicalValidity, Validity}; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayFlatten}; - -impl_encoding!("vortex.bool", Bool); - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct BoolMetadata { - validity: ValidityMetadata, - length: usize, -} - -impl BoolArray<'_> { - pub fn buffer(&self) -> &Buffer { - self.array().buffer(0).expect("missing buffer") - } - - pub fn boolean_buffer(&self) -> BooleanBuffer { - BooleanBuffer::new(BoolArray::buffer(self).clone().into(), 0, self.len()) - } - - pub fn validity(&self) -> Validity { - self.metadata() - .validity - .to_validity(self.array().child(0, &Validity::DTYPE)) - } -} - -impl BoolArray<'_> { - pub fn try_new(buffer: BooleanBuffer, validity: Validity) -> VortexResult { - Self::try_from_parts( - DType::Bool(validity.nullability()), - BoolMetadata { - validity: validity.to_metadata(buffer.len())?, - length: buffer.len(), - }, - vec![Buffer::Owned(buffer.into_inner())].into(), - validity.into_array_data().into_iter().collect_vec().into(), - HashMap::default(), - ) - } - - pub fn from_vec(bools: Vec, validity: Validity) -> Self { - let buffer = BooleanBuffer::from(bools); - Self::try_new(buffer, validity).unwrap() - } -} - -impl From for OwnedBoolArray { - fn from(value: BooleanBuffer) -> Self { - BoolArray::try_new(value, Validity::NonNullable).unwrap() - } -} - -impl From> for OwnedBoolArray { - fn from(value: Vec) -> Self { - BoolArray::from_vec(value, Validity::NonNullable) - } -} - -impl FromIterator> for OwnedBoolArray { - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (lower, _) = iter.size_hint(); - - let mut validity: Vec = Vec::with_capacity(lower); - let values: Vec = iter - .map(|i| { - validity.push(i.is_some()); - i.unwrap_or_default() - }) - .collect::>(); - - BoolArray::try_new(BooleanBuffer::from(values), Validity::from(validity)).unwrap() - } -} - -impl ArrayTrait for BoolArray<'_> { - fn len(&self) -> usize { - self.metadata().length - } -} - -impl ArrayFlatten for BoolArray<'_> { - fn flatten<'a>(self) -> VortexResult> - where - Self: 'a, - { - Ok(Flattened::Bool(self)) - } -} - -impl ArrayValidity for BoolArray<'_> { - fn is_valid(&self, index: usize) -> bool { - self.validity().is_valid(index) - } - - fn logical_validity(&self) -> LogicalValidity { - self.validity().to_logical(self.len()) - } -} - -impl AcceptArrayVisitor for BoolArray<'_> { - fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_buffer(self.buffer())?; - visitor.visit_validity(&self.validity()) - } -} - -#[cfg(test)] -mod tests { - use crate::array::bool::BoolArray; - use crate::compute::scalar_at::scalar_at; - use crate::IntoArray; - - #[test] - fn bool_array() { - let arr = BoolArray::from(vec![true, false, true]).into_array(); - let scalar: bool = scalar_at(&arr, 0).unwrap().try_into().unwrap(); - assert!(scalar); - } -} diff --git a/vortex-array2/src/array/bool/stats.rs b/vortex-array2/src/array/bool/stats.rs deleted file mode 100644 index 608fa7e504..0000000000 --- a/vortex-array2/src/array/bool/stats.rs +++ /dev/null @@ -1,43 +0,0 @@ -use std::collections::HashMap; - -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::array::bool::BoolArray; -use crate::stats::{ArrayStatisticsCompute, Stat}; - -impl ArrayStatisticsCompute for BoolArray<'_> { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - if self.is_empty() { - return Ok(HashMap::from([ - (Stat::TrueCount, 0.into()), - (Stat::RunCount, 0.into()), - ])); - } - - let mut prev_bit = self.boolean_buffer().value(0); - let mut true_count: usize = if prev_bit { 1 } else { 0 }; - let mut run_count: usize = 0; - for bit in self.boolean_buffer().iter().skip(1) { - if bit { - true_count += 1 - } - if bit != prev_bit { - run_count += 1; - prev_bit = bit; - } - } - run_count += 1; - - Ok(HashMap::from([ - (Stat::Min, (true_count == self.len()).into()), - (Stat::Max, (true_count > 0).into()), - ( - Stat::IsConstant, - (true_count == self.len() || true_count == 0).into(), - ), - (Stat::RunCount, run_count.into()), - (Stat::TrueCount, true_count.into()), - ])) - } -} diff --git a/vortex-array2/src/array/chunked/compute/mod.rs b/vortex-array2/src/array/chunked/compute/mod.rs deleted file mode 100644 index 4a9585361d..0000000000 --- a/vortex-array2/src/array/chunked/compute/mod.rs +++ /dev/null @@ -1,45 +0,0 @@ -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::array::chunked::ChunkedArray; -use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; -use crate::{Array, OwnedArray, ToStatic}; - -mod take; - -impl ArrayCompute for ChunkedArray<'_> { - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl AsContiguousFn for ChunkedArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { - // Combine all the chunks into one, then call as_contiguous again. - let mut chunks = Vec::with_capacity(self.nchunks()); - for array in arrays { - for chunk in ChunkedArray::try_from(array).unwrap().chunks() { - chunks.push(chunk.to_static()); - } - } - as_contiguous(&chunks) - } -} - -impl ScalarAtFn for ChunkedArray<'_> { - fn scalar_at(&self, index: usize) -> VortexResult { - let (chunk_index, chunk_offset) = self.find_chunk_idx(index); - scalar_at(&self.chunk(chunk_index).unwrap(), chunk_offset) - } -} diff --git a/vortex-array2/src/array/chunked/compute/take.rs b/vortex-array2/src/array/chunked/compute/take.rs deleted file mode 100644 index 800318ad45..0000000000 --- a/vortex-array2/src/array/chunked/compute/take.rs +++ /dev/null @@ -1,82 +0,0 @@ -use vortex::ptype::PType; -use vortex_error::VortexResult; - -use crate::array::chunked::ChunkedArray; -use crate::compute::cast::cast; -use crate::compute::take::{take, TakeFn}; -use crate::{Array, IntoArray, OwnedArray, ToArray, ToStatic}; - -impl TakeFn for ChunkedArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - if self.len() == indices.len() { - return Ok(self.to_array().to_static()); - } - - let indices = cast(indices, PType::U64.into())?.flatten_primitive()?; - - // While the chunk idx remains the same, accumulate a list of chunk indices. - let mut chunks = Vec::new(); - let mut indices_in_chunk = Vec::new(); - let mut prev_chunk_idx = self - .find_chunk_idx(indices.typed_data::()[0] as usize) - .0; - for idx in indices.typed_data::() { - let (chunk_idx, idx_in_chunk) = self.find_chunk_idx(*idx as usize); - - if chunk_idx != prev_chunk_idx { - // Start a new chunk - let indices_in_chunk_array = indices_in_chunk.clone().into_array(); - chunks.push(take( - &self.chunk(prev_chunk_idx).unwrap(), - &indices_in_chunk_array, - )?); - indices_in_chunk = Vec::new(); - } - - indices_in_chunk.push(idx_in_chunk as u64); - prev_chunk_idx = chunk_idx; - } - - if !indices_in_chunk.is_empty() { - let indices_in_chunk_array = indices_in_chunk.into_array(); - chunks.push(take( - &self.chunk(prev_chunk_idx).unwrap(), - &indices_in_chunk_array, - )?); - } - - Ok(ChunkedArray::new(chunks, self.dtype().clone()).into_array()) - } -} - -#[cfg(test)] -mod test { - use itertools::Itertools; - - use crate::array::chunked::ChunkedArray; - use crate::array::primitive::PrimitiveArray; - use crate::compute::as_contiguous::as_contiguous; - use crate::compute::take::take; - use crate::IntoArray; - - #[test] - fn test_take() { - let a = vec![1i32, 2, 3].into_array(); - let arr = ChunkedArray::new(vec![a.clone(), a.clone(), a.clone()], a.dtype().clone()); - assert_eq!(arr.nchunks(), 3); - assert_eq!(arr.len(), 9); - let indices = vec![0, 0, 6, 4].into_array(); - - let result = PrimitiveArray::try_from( - as_contiguous( - &ChunkedArray::try_from(take(arr.as_ref(), &indices).unwrap()) - .unwrap() - .chunks() - .collect_vec(), - ) - .unwrap(), - ) - .unwrap(); - assert_eq!(result.typed_data::(), &[1, 1, 1, 2]); - } -} diff --git a/vortex-array2/src/array/chunked/mod.rs b/vortex-array2/src/array/chunked/mod.rs deleted file mode 100644 index efbc4df021..0000000000 --- a/vortex-array2/src/array/chunked/mod.rs +++ /dev/null @@ -1,215 +0,0 @@ -use std::collections::HashMap; - -use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::scalar_at::scalar_at; -use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; -use crate::validity::Validity::NonNullable; -use crate::validity::{ArrayValidity, LogicalValidity}; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayFlatten, IntoArrayData, OwnedArray, ToArrayData}; - -mod compute; -mod stats; - -impl_encoding!("vortex.chunked", Chunked); - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct ChunkedMetadata; - -impl ChunkedArray<'_> { - const ENDS_DTYPE: DType = DType::Int( - IntWidth::_64, - Signedness::Unsigned, - Nullability::NonNullable, - ); - - pub fn new(chunks: Vec, dtype: DType) -> Self { - Self::try_new(chunks, dtype).unwrap() - } - - pub fn try_new(chunks: Vec, dtype: DType) -> VortexResult { - for chunk in &chunks { - if chunk.dtype() != &dtype { - vortex_bail!(MismatchedTypes: dtype, chunk.dtype()); - } - } - - let chunk_ends = PrimitiveArray::from_vec( - [0u64] - .into_iter() - .chain(chunks.iter().map(|c| c.len() as u64)) - .scan(0, |acc, c| { - *acc += c; - Some(*acc) - }) - .collect_vec(), - NonNullable, - ); - - let mut children = vec![chunk_ends.into_array_data()]; - children.extend(chunks.iter().map(|a| a.to_array_data())); - - Self::try_from_parts( - dtype, - ChunkedMetadata, - vec![].into(), - children.into(), - HashMap::default(), - ) - } - - #[inline] - pub fn chunk(&self, idx: usize) -> Option { - // Offset the index since chunk_ends is child 0. - self.array().child(idx + 1, self.array().dtype()) - } - - pub fn nchunks(&self) -> usize { - self.chunk_ends().len() - 1 - } - - #[inline] - pub fn chunk_ends(&self) -> Array { - self.array() - .child(0, &Self::ENDS_DTYPE) - .expect("missing chunk ends") - } - - pub fn find_chunk_idx(&self, index: usize) -> (usize, usize) { - assert!(index <= self.len(), "Index out of bounds of the array"); - - // TODO(ngates): migrate to the new search_sorted API to subtract 1 if not exact match. - let mut index_chunk = - search_sorted(&self.chunk_ends(), index, SearchSortedSide::Left).unwrap(); - let mut chunk_start = - usize::try_from(scalar_at(&self.chunk_ends(), index_chunk).unwrap()).unwrap(); - - if chunk_start != index { - index_chunk -= 1; - chunk_start = - usize::try_from(scalar_at(&self.chunk_ends(), index_chunk).unwrap()).unwrap(); - } - - let index_in_chunk = index - chunk_start; - (index_chunk, index_in_chunk) - } -} - -impl<'a> ChunkedArray<'a> { - pub fn chunks(&'a self) -> impl Iterator> { - (0..self.nchunks()).map(|c| self.chunk(c).unwrap()) - } -} - -impl FromIterator for OwnedChunkedArray { - fn from_iter>(iter: T) -> Self { - let chunks: Vec = iter.into_iter().collect(); - let dtype = chunks - .first() - .map(|c| c.dtype().clone()) - .expect("Cannot create a chunked array from an empty iterator"); - Self::new(chunks, dtype) - } -} - -impl ArrayFlatten for ChunkedArray<'_> { - fn flatten<'a>(self) -> VortexResult> - where - Self: 'a, - { - Ok(Flattened::Chunked(self)) - } -} - -impl AcceptArrayVisitor for ChunkedArray<'_> { - fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_child("chunk_ends", &self.chunk_ends())?; - for (idx, chunk) in self.chunks().enumerate() { - visitor.visit_child(format!("[{}]", idx).as_str(), &chunk)?; - } - Ok(()) - } -} - -impl ArrayTrait for ChunkedArray<'_> { - fn len(&self) -> usize { - usize::try_from(scalar_at(&self.chunk_ends(), self.nchunks()).unwrap()).unwrap() - } -} - -impl ArrayValidity for ChunkedArray<'_> { - fn is_valid(&self, _index: usize) -> bool { - todo!() - } - - fn logical_validity(&self) -> LogicalValidity { - todo!() - } -} - -#[cfg(test)] -mod test { - use vortex::ptype::NativePType; - use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - - use crate::array::chunked::{ChunkedArray, OwnedChunkedArray}; - use crate::{Array, IntoArray}; - - #[allow(dead_code)] - fn chunked_array() -> OwnedChunkedArray { - ChunkedArray::new( - vec![ - vec![1u64, 2, 3].into_array(), - vec![4u64, 5, 6].into_array(), - vec![7u64, 8, 9].into_array(), - ], - DType::Int( - IntWidth::_64, - Signedness::Unsigned, - Nullability::NonNullable, - ), - ) - } - - #[allow(dead_code)] - fn assert_equal_slices(arr: Array, slice: &[T]) { - let mut values = Vec::with_capacity(arr.len()); - ChunkedArray::try_from(arr) - .unwrap() - .chunks() - .map(|a| a.flatten_primitive().unwrap()) - .for_each(|a| values.extend_from_slice(a.typed_data::())); - assert_eq!(values, slice); - } - - // FIXME(ngates): bring back when slicing is a compute function. - // #[test] - // pub fn slice_middle() { - // assert_equal_slices(chunked_array().slice(2, 5).unwrap(), &[3u64, 4, 5]) - // } - // - // #[test] - // pub fn slice_begin() { - // assert_equal_slices(chunked_array().slice(1, 3).unwrap(), &[2u64, 3]); - // } - // - // #[test] - // pub fn slice_aligned() { - // assert_equal_slices(chunked_array().slice(3, 6).unwrap(), &[4u64, 5, 6]); - // } - // - // #[test] - // pub fn slice_many_aligned() { - // assert_equal_slices(chunked_array().slice(0, 6).unwrap(), &[1u64, 2, 3, 4, 5, 6]); - // } - // - // #[test] - // pub fn slice_end() { - // assert_equal_slices(chunked_array().slice(7, 8).unwrap(), &[8u64]); - // } -} diff --git a/vortex-array2/src/array/chunked/stats.rs b/vortex-array2/src/array/chunked/stats.rs deleted file mode 100644 index 0cb8e521b2..0000000000 --- a/vortex-array2/src/array/chunked/stats.rs +++ /dev/null @@ -1,13 +0,0 @@ -use std::collections::HashMap; - -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::array::chunked::ChunkedArray; -use crate::stats::{ArrayStatisticsCompute, Stat}; - -impl ArrayStatisticsCompute for ChunkedArray<'_> { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - todo!() - } -} diff --git a/vortex-array2/src/array/composite/array.rs b/vortex-array2/src/array/composite/array.rs deleted file mode 100644 index 11b5e8bb2d..0000000000 --- a/vortex-array2/src/array/composite/array.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::collections::HashMap; - -use vortex::scalar::AsBytes; -use vortex_error::VortexResult; -use vortex_schema::{CompositeID, DType}; - -use crate::array::composite::{find_extension, CompositeExtensionRef, TypedCompositeArray}; -use crate::compute::ArrayCompute; -use crate::stats::ArrayStatisticsCompute; -use crate::validity::{ArrayValidity, LogicalValidity}; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{ - impl_encoding, ArrayFlatten, IntoArrayData, TryDeserializeArrayMetadata, - TrySerializeArrayMetadata, -}; - -pub trait UnderlyingMetadata: - 'static + Send + Sync + Debug + TrySerializeArrayMetadata + for<'m> TryDeserializeArrayMetadata<'m> -{ - fn id(&self) -> CompositeID; -} - -impl_encoding!("vortex.composite", Composite); - -#[derive(Debug, Clone)] -pub struct CompositeMetadata { - ext: CompositeExtensionRef, - underlying_dtype: DType, - underlying_metadata: Arc<[u8]>, -} - -impl<'a> CompositeArray<'a> { - pub fn new(id: CompositeID, metadata: Arc<[u8]>, underlying: Array<'a>) -> Self { - let dtype = DType::Composite(id, underlying.dtype().is_nullable().into()); - let ext = find_extension(id.0).expect("Unrecognized composite extension"); - Self::try_from_parts( - dtype, - CompositeMetadata { - ext, - underlying_dtype: underlying.dtype().clone(), - underlying_metadata: metadata, - }, - vec![].into(), - vec![underlying.into_array_data()].into(), - HashMap::default(), - ) - .unwrap() - } -} - -impl CompositeArray<'_> { - #[inline] - pub fn id(&self) -> CompositeID { - self.metadata().ext.id() - } - - #[inline] - pub fn extension(&self) -> CompositeExtensionRef { - find_extension(self.id().0).expect("Unrecognized composite extension") - } - - pub fn underlying_metadata(&self) -> &Arc<[u8]> { - &self.metadata().underlying_metadata - } - - pub fn underlying_dtype(&self) -> &DType { - &self.metadata().underlying_dtype - } - - #[inline] - pub fn underlying(&self) -> Array { - self.array() - .child(0, self.underlying_dtype()) - .expect("CompositeArray must have an underlying array") - } - - pub fn with_compute(&self, mut f: F) -> R - where - F: FnMut(&dyn ArrayCompute) -> R, - { - let mut result = None; - - self.extension() - .with_compute(self, &mut |c| { - result = Some(f(c)); - Ok(()) - }) - .unwrap(); - - // Now we unwrap the optional, which we know to be populated by the closure. - result.unwrap() - } - - pub fn as_typed TryDeserializeArrayMetadata<'a>>( - &self, - ) -> VortexResult> { - Ok(TypedCompositeArray::new( - M::try_deserialize_metadata(Some(self.underlying_metadata().as_bytes()))?, - self.underlying().clone(), - )) - } -} - -impl ArrayTrait for CompositeArray<'_> { - fn len(&self) -> usize { - self.underlying().len() - } -} - -impl ArrayFlatten for CompositeArray<'_> { - fn flatten<'a>(self) -> VortexResult> - where - Self: 'a, - { - Ok(Flattened::Composite(self)) - } -} - -impl ArrayValidity for CompositeArray<'_> { - fn is_valid(&self, index: usize) -> bool { - self.underlying().with_dyn(|a| a.is_valid(index)) - } - - fn logical_validity(&self) -> LogicalValidity { - self.underlying().with_dyn(|a| a.logical_validity()) - } -} - -impl AcceptArrayVisitor for CompositeArray<'_> { - fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_child("underlying", &self.underlying()) - } -} - -impl ArrayStatisticsCompute for CompositeArray<'_> {} diff --git a/vortex-array2/src/array/composite/compute.rs b/vortex-array2/src/array/composite/compute.rs deleted file mode 100644 index ac6b3b3d21..0000000000 --- a/vortex-array2/src/array/composite/compute.rs +++ /dev/null @@ -1,95 +0,0 @@ -use arrow_array::ArrayRef as ArrowArrayRef; -use itertools::Itertools; -use vortex::scalar::Scalar; -use vortex_error::{vortex_err, VortexResult}; - -use crate::array::composite::array::CompositeArray; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::scalar_at::{scalar_at, ScalarAtFn}; -use crate::compute::slice::{slice, SliceFn}; -use crate::compute::take::{take, TakeFn}; -use crate::compute::ArrayCompute; -use crate::{Array, IntoArray, OwnedArray}; - -impl ArrayCompute for CompositeArray<'_> { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - Some(self) - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl AsArrowArray for CompositeArray<'_> { - fn as_arrow(&self) -> VortexResult { - self.with_compute(|c| { - c.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "as_arrow", - format!("composite extension {}", self.id()) - )) - }) - }) - } -} - -impl AsContiguousFn for CompositeArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { - let composites = arrays - .iter() - .map(|array| CompositeArray::try_from(array).unwrap()) - .collect_vec(); - let underlyings = composites.iter().map(|c| c.underlying()).collect_vec(); - Ok(CompositeArray::new( - self.id(), - self.underlying_metadata().clone(), - as_contiguous(&underlyings)?, - ) - .into_array()) - } -} - -impl ScalarAtFn for CompositeArray<'_> { - fn scalar_at(&self, index: usize) -> VortexResult { - // TODO(ngates): this seems wrong... I don't think we just cast scalars like this. - // e.g. how do we know what a datetime is in? - let underlying = scalar_at(&self.underlying(), index)?; - underlying.cast(self.dtype()) - } -} - -impl TakeFn for CompositeArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - Ok(CompositeArray::new( - self.id(), - self.underlying_metadata().clone(), - take(&self.underlying(), indices)?, - ) - .into_array()) - } -} - -impl SliceFn for CompositeArray<'_> { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(CompositeArray::new( - self.id(), - self.underlying_metadata().clone(), - slice(&self.underlying(), start, stop)?, - ) - .into_array()) - } -} diff --git a/vortex-array2/src/array/composite/mod.rs b/vortex-array2/src/array/composite/mod.rs deleted file mode 100644 index a72e9665b5..0000000000 --- a/vortex-array2/src/array/composite/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -pub use array::*; -use linkme::distributed_slice; -pub use typed::*; -use vortex_schema::CompositeID; - -mod array; -mod compute; -mod serde; -mod typed; - -#[distributed_slice] -pub static VORTEX_COMPOSITE_EXTENSIONS: [&'static dyn CompositeExtension] = [..]; - -pub fn find_extension(id: &str) -> Option<&'static dyn CompositeExtension> { - VORTEX_COMPOSITE_EXTENSIONS - .iter() - .find(|ext| ext.id().0 == id) - .copied() -} - -pub fn find_extension_id(id: &str) -> Option { - find_extension(id).map(|e| e.id()) -} diff --git a/vortex-array2/src/array/composite/serde.rs b/vortex-array2/src/array/composite/serde.rs deleted file mode 100644 index 99e021edf3..0000000000 --- a/vortex-array2/src/array/composite/serde.rs +++ /dev/null @@ -1,18 +0,0 @@ -use std::sync::Arc; - -use vortex_error::VortexResult; - -use crate::array::composite::CompositeMetadata; -use crate::{TryDeserializeArrayMetadata, TrySerializeArrayMetadata}; - -impl TrySerializeArrayMetadata for CompositeMetadata { - fn try_serialize_metadata(&self) -> VortexResult> { - todo!() - } -} - -impl TryDeserializeArrayMetadata<'_> for CompositeMetadata { - fn try_deserialize_metadata(_metadata: Option<&[u8]>) -> VortexResult { - todo!() - } -} diff --git a/vortex-array2/src/array/composite/typed.rs b/vortex-array2/src/array/composite/typed.rs deleted file mode 100644 index e9593d2606..0000000000 --- a/vortex-array2/src/array/composite/typed.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::fmt::Debug; - -use vortex_error::VortexResult; -use vortex_schema::CompositeID; -use vortex_schema::DType; - -use crate::array::composite::array::CompositeArray; -use crate::array::composite::UnderlyingMetadata; -use crate::compute::ArrayCompute; -use crate::Array; - -pub trait CompositeExtension: Debug + Send + Sync + 'static { - fn id(&self) -> CompositeID; - - fn with_compute<'a>( - &self, - array: &'a CompositeArray<'a>, - f: &mut dyn for<'b> FnMut(&'b (dyn ArrayCompute + 'a)) -> VortexResult<()>, - ) -> VortexResult<()>; -} - -pub type CompositeExtensionRef = &'static dyn CompositeExtension; - -#[derive(Debug, Clone)] -pub struct TypedCompositeArray<'a, M: UnderlyingMetadata> { - metadata: M, - underlying: Array<'a>, - dtype: DType, -} - -impl<'a, M: UnderlyingMetadata> TypedCompositeArray<'a, M> { - pub fn new(metadata: M, underlying: Array<'a>) -> Self { - let dtype = DType::Composite(metadata.id(), underlying.dtype().is_nullable().into()); - Self { - metadata, - underlying, - dtype, - } - } - - #[inline] - pub fn underlying_metadata(&self) -> &M { - &self.metadata - } - - #[inline] - pub fn underlying(&self) -> &Array<'a> { - &self.underlying - } - - #[inline] - pub fn dtype(&self) -> &DType { - &self.dtype - } - - pub fn as_composite(&self) -> VortexResult> { - Ok(CompositeArray::new( - self.underlying_metadata().id(), - self.underlying_metadata().try_serialize_metadata()?, - self.underlying().clone(), - )) - } -} - -#[macro_export] -macro_rules! impl_composite { - ($id:expr, $T:ty) => { - use linkme::distributed_slice; - use paste::paste; - use vortex_schema::{CompositeID, DType, Nullability}; - use $crate::array::composite::{ - CompositeArray, CompositeExtension, TypedCompositeArray, UnderlyingMetadata, - VORTEX_COMPOSITE_EXTENSIONS, - }; - use $crate::compute::ArrayCompute; - use $crate::TryDeserializeArrayMetadata; - - paste! { - #[derive(Debug)] - pub struct [<$T Extension>]; - - impl [<$T Extension>] { - pub const ID: CompositeID = CompositeID($id); - - pub fn dtype(nullability: Nullability) -> DType { - DType::Composite(Self::ID, nullability) - } - } - - impl CompositeExtension for [<$T Extension>] { - fn id(&self) -> CompositeID { - Self::ID - } - - fn with_compute<'a>( - &self, - array: &'a CompositeArray<'a>, - f: &mut dyn for<'b> FnMut(&'b (dyn ArrayCompute + 'a)) -> VortexResult<()>, - ) -> VortexResult<()> { - if array.id() != Self::ID { - panic!("Incorrect CompositeID"); - } - let typed = TypedCompositeArray::new( - $T::try_deserialize_metadata(Some(array.underlying_metadata().as_ref()))?, - array.underlying().clone(), - ); - f(&typed) - } - } - - impl UnderlyingMetadata for $T { - fn id(&self) -> CompositeID { - [<$T Extension>]::ID - } - } - - #[distributed_slice(VORTEX_COMPOSITE_EXTENSIONS)] - static ENCODINGS_COMPOSITE_EXT: &'static dyn CompositeExtension = &[<$T Extension>]; - } - }; -} - -pub use impl_composite; diff --git a/vortex-array2/src/array/constant/compute.rs b/vortex-array2/src/array/constant/compute.rs deleted file mode 100644 index a2935c41e8..0000000000 --- a/vortex-array2/src/array/constant/compute.rs +++ /dev/null @@ -1,58 +0,0 @@ -use itertools::Itertools; -use vortex::scalar::Scalar; -use vortex_error::{vortex_err, VortexResult}; - -use crate::array::constant::ConstantArray; -use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; -use crate::{Array, IntoArray, OwnedArray}; - -impl ArrayCompute for ConstantArray<'_> { - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl AsContiguousFn for ConstantArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { - let chunks = arrays - .iter() - .map(|a| ConstantArray::try_from(a).unwrap()) - .collect_vec(); - - if chunks.iter().map(|c| c.scalar()).all_equal() { - Ok(ConstantArray::new( - chunks.first().unwrap().scalar().clone(), - chunks.iter().map(|c| c.len()).sum(), - ) - .into_array()) - } else { - // TODO(ngates): we need to flatten the constant arrays and then concatenate them - Err(vortex_err!( - "Cannot concatenate constant arrays with differing scalars" - )) - } - } -} - -impl ScalarAtFn for ConstantArray<'_> { - fn scalar_at(&self, _index: usize) -> VortexResult { - Ok(self.scalar().clone()) - } -} - -impl TakeFn for ConstantArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - Ok(ConstantArray::new(self.scalar().clone(), indices.len()).into_array()) - } -} diff --git a/vortex-array2/src/array/constant/mod.rs b/vortex-array2/src/array/constant/mod.rs deleted file mode 100644 index 68a6de293c..0000000000 --- a/vortex-array2/src/array/constant/mod.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::collections::HashMap; - -use serde::{Deserialize, Serialize}; -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::impl_encoding; -use crate::stats::Stat; -use crate::validity::{ArrayValidity, LogicalValidity}; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; - -mod compute; -mod flatten; -mod stats; - -impl_encoding!("vortex.constant", Constant); - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConstantMetadata { - scalar: Scalar, - length: usize, -} - -impl ConstantArray<'_> { - pub fn new(scalar: S, length: usize) -> Self - where - Scalar: From, - { - let scalar: Scalar = scalar.into(); - let stats = HashMap::from([ - (Stat::Max, scalar.clone()), - (Stat::Min, scalar.clone()), - (Stat::IsConstant, true.into()), - (Stat::IsSorted, true.into()), - (Stat::RunCount, 1.into()), - ]); - Self::try_from_parts( - scalar.dtype().clone(), - ConstantMetadata { scalar, length }, - vec![].into(), - vec![].into(), - stats, - ) - .unwrap() - } - - pub fn scalar(&self) -> &Scalar { - &self.metadata().scalar - } -} - -impl ArrayValidity for ConstantArray<'_> { - fn is_valid(&self, _index: usize) -> bool { - match self.metadata().scalar.dtype().is_nullable() { - true => !self.scalar().is_null(), - false => true, - } - } - - fn logical_validity(&self) -> LogicalValidity { - match self.scalar().is_null() { - true => LogicalValidity::AllInvalid(self.len()), - false => LogicalValidity::AllValid(self.len()), - } - } -} - -impl AcceptArrayVisitor for ConstantArray<'_> { - fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - Ok(()) - } -} - -impl ArrayTrait for ConstantArray<'_> { - fn len(&self) -> usize { - self.metadata().length - } -} diff --git a/vortex-array2/src/array/constant/stats.rs b/vortex-array2/src/array/constant/stats.rs deleted file mode 100644 index 9b25c2371a..0000000000 --- a/vortex-array2/src/array/constant/stats.rs +++ /dev/null @@ -1,24 +0,0 @@ -use std::collections::HashMap; - -use vortex::scalar::Scalar; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::array::constant::ConstantArray; -use crate::stats::{ArrayStatisticsCompute, Stat}; - -impl ArrayStatisticsCompute for ConstantArray<'_> { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - if matches!(self.dtype(), &DType::Bool(_)) { - let Scalar::Bool(b) = self.scalar() else { - unreachable!("Got bool dtype without bool scalar") - }; - return Ok([( - Stat::TrueCount, - (self.len() as u64 * b.value().cloned().map(|v| v as u64).unwrap_or(0)).into(), - )] - .into()); - } - Ok(HashMap::default()) - } -} diff --git a/vortex-array2/src/array/datetime/README.md b/vortex-array2/src/array/datetime/README.md deleted file mode 100644 index 8e6bb6bb36..0000000000 --- a/vortex-array2/src/array/datetime/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Vortex Datetime Composite Extensions - -This module provides implementations of datetime types using composite arrays. - -## Arrow Conversion - -| Arrow Type | Vortex Type | | -|-----------------------|-----------------|----------------------------------| -| `time32/64` | `LocalTime` | Time since midnight | -| `date32/64` | `LocalDate` | Julian day | -| `timestamp(tz=None)` | `LocalDateTime` | Julian day + time since midnight | -| `timestamp(tz=UTC)` | `Instant` | Time since Unix epoch | -| `timestamp(tz=Other)` | `ZonedDateTime` | TZ aware time since Unix epoch | diff --git a/vortex-array2/src/array/mod.rs b/vortex-array2/src/array/mod.rs deleted file mode 100644 index 320027199b..0000000000 --- a/vortex-array2/src/array/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub mod bool; -pub mod chunked; -pub mod composite; -pub mod constant; -pub mod datetime; -pub mod primitive; -pub mod r#struct; -pub mod varbin; diff --git a/vortex-array2/src/array/primitive/compute/as_arrow.rs b/vortex-array2/src/array/primitive/compute/as_arrow.rs deleted file mode 100644 index cbc402411d..0000000000 --- a/vortex-array2/src/array/primitive/compute/as_arrow.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::sync::Arc; - -use arrow_array::{ - ArrayRef as ArrowArrayRef, ArrowPrimitiveType, PrimitiveArray as ArrowPrimitiveArray, -}; -use arrow_buffer::ScalarBuffer; -use vortex::ptype::PType; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::as_arrow::AsArrowArray; -use crate::validity::ArrayValidity; - -impl AsArrowArray for PrimitiveArray<'_> { - fn as_arrow(&self) -> VortexResult { - use arrow_array::types::*; - Ok(match self.ptype() { - PType::U8 => Arc::new(as_arrow_array_primitive::(self)?), - PType::U16 => Arc::new(as_arrow_array_primitive::(self)?), - PType::U32 => Arc::new(as_arrow_array_primitive::(self)?), - PType::U64 => Arc::new(as_arrow_array_primitive::(self)?), - PType::I8 => Arc::new(as_arrow_array_primitive::(self)?), - PType::I16 => Arc::new(as_arrow_array_primitive::(self)?), - PType::I32 => Arc::new(as_arrow_array_primitive::(self)?), - PType::I64 => Arc::new(as_arrow_array_primitive::(self)?), - PType::F16 => Arc::new(as_arrow_array_primitive::(self)?), - PType::F32 => Arc::new(as_arrow_array_primitive::(self)?), - PType::F64 => Arc::new(as_arrow_array_primitive::(self)?), - }) - } -} - -fn as_arrow_array_primitive( - array: &PrimitiveArray, -) -> VortexResult> { - Ok(ArrowPrimitiveArray::new( - ScalarBuffer::::new(array.buffer().clone().into(), 0, array.len()), - array.logical_validity().to_null_buffer()?, - )) -} diff --git a/vortex-array2/src/array/primitive/compute/as_contiguous.rs b/vortex-array2/src/array/primitive/compute/as_contiguous.rs deleted file mode 100644 index 717da79372..0000000000 --- a/vortex-array2/src/array/primitive/compute/as_contiguous.rs +++ /dev/null @@ -1,30 +0,0 @@ -use arrow_buffer::{MutableBuffer, ScalarBuffer}; -use vortex::match_each_native_ptype; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::as_contiguous::AsContiguousFn; -use crate::validity::Validity; -use crate::{Array, IntoArray, OwnedArray}; - -impl AsContiguousFn for PrimitiveArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { - let validity = if self.dtype().is_nullable() { - Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) - } else { - Validity::NonNullable - }; - - let mut buffer = MutableBuffer::with_capacity( - arrays.iter().map(|a| a.len()).sum::() * self.ptype().byte_width(), - ); - for array in arrays { - buffer.extend_from_slice(PrimitiveArray::try_from(array).unwrap().buffer().as_slice()) - } - match_each_native_ptype!(self.ptype(), |$T| { - Ok(PrimitiveArray::try_new(ScalarBuffer::<$T>::from(buffer), validity) - .unwrap() - .into_array()) - }) - } -} diff --git a/vortex-array2/src/array/primitive/compute/cast.rs b/vortex-array2/src/array/primitive/compute/cast.rs deleted file mode 100644 index 78016aedb7..0000000000 --- a/vortex-array2/src/array/primitive/compute/cast.rs +++ /dev/null @@ -1,75 +0,0 @@ -use vortex::match_each_native_ptype; -use vortex::ptype::{NativePType, PType}; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::DType; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::cast::CastFn; -use crate::{IntoArray, OwnedArray, ToArrayData}; - -impl CastFn for PrimitiveArray<'_> { - fn cast(&self, dtype: &DType) -> VortexResult { - // TODO(ngates): check validity - let ptype = PType::try_from(dtype)?; - if ptype == self.ptype() { - return Ok(self.to_array_data().into_array()); - } - - match_each_native_ptype!(ptype, |$T| { - Ok(PrimitiveArray::from_vec( - cast::<$T>(self)?, - self.validity().clone(), - ).into_array()) - }) - } -} - -fn cast(array: &PrimitiveArray) -> VortexResult> { - match_each_native_ptype!(array.ptype(), |$E| { - array - .typed_data::<$E>() - .iter() - // TODO(ngates): allow configurable checked/unchecked casting - .map(|&v| { - T::from(v).ok_or_else(|| { - vortex_err!(ComputeError: "Failed to cast {} to {:?}", v, T::PTYPE) - }) - }) - .collect() - }) -} - -#[cfg(test)] -mod test { - use vortex::ptype::PType; - use vortex_error::VortexError; - - use crate::array::primitive::PrimitiveArray; - use crate::{compute, IntoArray}; - - #[test] - fn cast_u32_u8() { - let arr = vec![0u32, 10, 200].into_array(); - let p = - PrimitiveArray::try_from(compute::cast::cast(&arr, PType::U8.into()).unwrap()).unwrap(); - assert_eq!(p.typed_data::(), vec![0u8, 10, 200]); - } - - #[test] - fn cast_u32_f32() { - let arr = vec![0u32, 10, 200].into_array(); - let u8arr = PrimitiveArray::try_from(compute::cast::cast(&arr, PType::F32.into()).unwrap()) - .unwrap(); - assert_eq!(u8arr.typed_data::(), vec![0.0f32, 10., 200.]); - } - - #[test] - fn cast_i32_u32() { - let arr = vec![-1i32].into_array(); - let error = compute::cast::cast(&arr, PType::U32.into()).err().unwrap(); - let VortexError::ComputeError(s, _) = error else { - unreachable!() - }; - assert_eq!(s.to_string(), "Failed to cast -1 to U32"); - } -} diff --git a/vortex-array2/src/array/primitive/compute/fill.rs b/vortex-array2/src/array/primitive/compute/fill.rs deleted file mode 100644 index f361e82b76..0000000000 --- a/vortex-array2/src/array/primitive/compute/fill.rs +++ /dev/null @@ -1,71 +0,0 @@ -use vortex::match_each_native_ptype; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::fill::FillForwardFn; -use crate::validity::ArrayValidity; -use crate::{IntoArray, OwnedArray, ToArrayData}; - -impl FillForwardFn for PrimitiveArray<'_> { - fn fill_forward(&self) -> VortexResult { - let validity = self.logical_validity(); - let Some(nulls) = validity.to_null_buffer()? else { - return Ok(self.to_array_data().into_array()); - }; - match_each_native_ptype!(self.ptype(), |$T| { - let typed_data = self.typed_data::<$T>(); - let mut last_value = $T::default(); - let filled = typed_data - .iter() - .zip(nulls.into_iter()) - .map(|(v, valid)| { - if valid { - last_value = *v; - } - last_value - }) - .collect::>(); - Ok(filled.into_array()) - }) - } -} - -#[cfg(test)] -mod test { - use crate::array::bool::BoolArray; - use crate::array::primitive::PrimitiveArray; - use crate::validity::{ArrayValidity, Validity}; - use crate::{compute, IntoArray}; - - #[test] - fn leading_none() { - let arr = PrimitiveArray::from_nullable_vec(vec![None, Some(8u8), None, Some(10), None]) - .into_array(); - let p = PrimitiveArray::try_from(compute::fill::fill_forward(&arr).unwrap()).unwrap(); - assert_eq!(p.typed_data::(), vec![0, 8, 8, 10, 10]); - assert!(p.logical_validity().is_all_valid()); - } - - #[test] - fn all_none() { - let arr = - PrimitiveArray::from_nullable_vec(vec![Option::::None, None, None, None, None]) - .into_array(); - - let p = PrimitiveArray::try_from(compute::fill::fill_forward(&arr).unwrap()).unwrap(); - assert_eq!(p.typed_data::(), vec![0, 0, 0, 0, 0]); - assert!(p.logical_validity().is_all_valid()); - } - - #[test] - fn nullable_non_null() { - let arr = PrimitiveArray::from_vec( - vec![8u8, 10u8, 12u8, 14u8, 16u8], - Validity::Array(BoolArray::from(vec![true, true, true, true, true]).into_array()), - ) - .into_array(); - let p = PrimitiveArray::try_from(compute::fill::fill_forward(&arr).unwrap()).unwrap(); - assert_eq!(p.typed_data::(), vec![8, 10, 12, 14, 16]); - assert!(p.logical_validity().is_all_valid()); - } -} diff --git a/vortex-array2/src/array/primitive/compute/mod.rs b/vortex-array2/src/array/primitive/compute/mod.rs deleted file mode 100644 index 180cc99601..0000000000 --- a/vortex-array2/src/array/primitive/compute/mod.rs +++ /dev/null @@ -1,53 +0,0 @@ -use crate::array::primitive::PrimitiveArray; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::cast::CastFn; -use crate::compute::fill::FillForwardFn; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::search_sorted::SearchSortedFn; -use crate::compute::slice::SliceFn; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; - -mod as_arrow; -mod as_contiguous; -mod cast; -mod fill; -mod scalar_at; -mod search_sorted; -mod slice; -mod take; - -impl ArrayCompute for PrimitiveArray<'_> { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - Some(self) - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn cast(&self) -> Option<&dyn CastFn> { - Some(self) - } - - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { - Some(self) - } - - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} diff --git a/vortex-array2/src/array/primitive/compute/scalar_at.rs b/vortex-array2/src/array/primitive/compute/scalar_at.rs deleted file mode 100644 index 810d13a068..0000000000 --- a/vortex-array2/src/array/primitive/compute/scalar_at.rs +++ /dev/null @@ -1,21 +0,0 @@ -use vortex::match_each_native_ptype; -use vortex::scalar::PrimitiveScalar; -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::scalar_at::ScalarAtFn; -use crate::validity::ArrayValidity; - -impl ScalarAtFn for PrimitiveArray<'_> { - fn scalar_at(&self, index: usize) -> VortexResult { - match_each_native_ptype!(self.ptype(), |$T| { - Ok(PrimitiveScalar::try_new( - self.is_valid(index) - .then(|| self.typed_data::<$T>()[index]), - self.dtype().nullability(), - )? - .into()) - }) - } -} diff --git a/vortex-array2/src/array/primitive/compute/search_sorted.rs b/vortex-array2/src/array/primitive/compute/search_sorted.rs deleted file mode 100644 index 1cd34324ce..0000000000 --- a/vortex-array2/src/array/primitive/compute/search_sorted.rs +++ /dev/null @@ -1,45 +0,0 @@ -use vortex::match_each_native_ptype; -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::search_sorted::SearchSorted; -use crate::compute::search_sorted::{SearchSortedFn, SearchSortedSide}; - -impl SearchSortedFn for PrimitiveArray<'_> { - fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult { - match_each_native_ptype!(self.ptype(), |$T| { - let pvalue: $T = value.try_into()?; - Ok(self.typed_data::<$T>().search_sorted(&pvalue, side)) - }) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::compute::search_sorted::search_sorted; - use crate::IntoArray; - - #[test] - fn test_searchsorted_primitive() { - let values = vec![1u16, 2, 3].into_array(); - - assert_eq!( - search_sorted(&values, 0, SearchSortedSide::Left).unwrap(), - 0 - ); - assert_eq!( - search_sorted(&values, 1, SearchSortedSide::Left).unwrap(), - 0 - ); - assert_eq!( - search_sorted(&values, 1, SearchSortedSide::Right).unwrap(), - 1 - ); - assert_eq!( - search_sorted(&values, 4, SearchSortedSide::Left).unwrap(), - 3 - ); - } -} diff --git a/vortex-array2/src/array/primitive/compute/slice.rs b/vortex-array2/src/array/primitive/compute/slice.rs deleted file mode 100644 index de392700de..0000000000 --- a/vortex-array2/src/array/primitive/compute/slice.rs +++ /dev/null @@ -1,19 +0,0 @@ -use vortex::match_each_native_ptype; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::slice::SliceFn; -use crate::IntoArray; -use crate::OwnedArray; - -impl SliceFn for PrimitiveArray<'_> { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - match_each_native_ptype!(self.ptype(), |$T| { - Ok(PrimitiveArray::try_new( - self.scalar_buffer::<$T>().slice(start, stop - start), - self.validity().slice(start, stop)?, - )? - .into_array()) - }) - } -} diff --git a/vortex-array2/src/array/primitive/compute/take.rs b/vortex-array2/src/array/primitive/compute/take.rs deleted file mode 100644 index 2df9e9c8c4..0000000000 --- a/vortex-array2/src/array/primitive/compute/take.rs +++ /dev/null @@ -1,43 +0,0 @@ -use num_traits::PrimInt; -use vortex::ptype::NativePType; -use vortex::{match_each_integer_ptype, match_each_native_ptype}; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::compute::take::TakeFn; -use crate::IntoArray; -use crate::{Array, OwnedArray}; - -impl TakeFn for PrimitiveArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - let validity = self.validity(); - let indices = indices.clone().flatten_primitive()?; - match_each_native_ptype!(self.ptype(), |$T| { - match_each_integer_ptype!(indices.ptype(), |$I| { - Ok(PrimitiveArray::from_vec( - take_primitive(self.typed_data::<$T>(), indices.typed_data::<$I>()), - validity.take(indices.array())?, - ).into_array()) - }) - }) - } -} - -fn take_primitive(array: &[T], indices: &[I]) -> Vec { - indices - .iter() - .map(|&idx| array[idx.to_usize().unwrap()]) - .collect() -} - -#[cfg(test)] -mod test { - use crate::array::primitive::compute::take::take_primitive; - - #[test] - fn test_take() { - let a = vec![1i32, 2, 3, 4, 5]; - let result = take_primitive(&a, &[0, 0, 4, 2]); - assert_eq!(result, vec![1i32, 1, 5, 3]); - } -} diff --git a/vortex-array2/src/array/primitive/mod.rs b/vortex-array2/src/array/primitive/mod.rs deleted file mode 100644 index cf055714ef..0000000000 --- a/vortex-array2/src/array/primitive/mod.rs +++ /dev/null @@ -1,120 +0,0 @@ -mod compute; -mod stats; - -use std::collections::HashMap; - -use arrow_buffer::{ArrowNativeType, ScalarBuffer}; -use itertools::Itertools; -use serde::{Deserialize, Serialize}; -use vortex::ptype::{NativePType, PType}; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::buffer::Buffer; -use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::ArrayFlatten; -use crate::{impl_encoding, IntoArray}; - -impl_encoding!("vortex.primitive", Primitive); - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct PrimitiveMetadata { - validity: ValidityMetadata, -} - -impl PrimitiveArray<'_> { - pub fn validity(&self) -> Validity { - self.metadata() - .validity - .to_validity(self.array().child(0, &Validity::DTYPE)) - } - - pub fn ptype(&self) -> PType { - // TODO(ngates): we can't really cache this anywhere? - self.dtype().try_into().unwrap() - } - - pub fn buffer(&self) -> &Buffer { - self.array().buffer(0).expect("missing buffer") - } - - pub fn scalar_buffer(&self) -> ScalarBuffer { - ScalarBuffer::new(self.buffer().clone().into(), 0, self.len()) - } - - pub fn typed_data(&self) -> &[T] { - self.buffer().typed_data::() - } -} - -impl PrimitiveArray<'_> { - pub fn try_new( - buffer: ScalarBuffer, - validity: Validity, - ) -> VortexResult { - Self::try_from_parts( - DType::from(T::PTYPE).with_nullability(validity.nullability()), - PrimitiveMetadata { - validity: validity.to_metadata(buffer.len())?, - }, - vec![Buffer::Owned(buffer.into_inner())].into(), - validity.into_array_data().into_iter().collect_vec().into(), - HashMap::default(), - ) - } - - pub fn from_vec(values: Vec, validity: Validity) -> Self { - Self::try_new(ScalarBuffer::from(values), validity).unwrap() - } - - pub fn from_nullable_vec(values: Vec>) -> Self { - let elems: Vec = values.iter().map(|v| v.unwrap_or_default()).collect(); - let validity = Validity::from(values.iter().map(|v| v.is_some()).collect::>()); - Self::from_vec(elems, validity) - } -} - -impl From> for PrimitiveArray<'_> { - fn from(values: Vec) -> Self { - PrimitiveArray::from_vec(values, Validity::NonNullable) - } -} - -impl IntoArray<'static> for Vec { - fn into_array(self) -> Array<'static> { - PrimitiveArray::from(self).into_array() - } -} - -impl ArrayFlatten for PrimitiveArray<'_> { - fn flatten<'a>(self) -> VortexResult> - where - Self: 'a, - { - Ok(Flattened::Primitive(self)) - } -} - -impl ArrayTrait for PrimitiveArray<'_> { - fn len(&self) -> usize { - self.buffer().len() / self.ptype().byte_width() - } -} - -impl ArrayValidity for PrimitiveArray<'_> { - fn is_valid(&self, index: usize) -> bool { - self.validity().is_valid(index) - } - - fn logical_validity(&self) -> LogicalValidity { - self.validity().to_logical(self.len()) - } -} - -impl AcceptArrayVisitor for PrimitiveArray<'_> { - fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_buffer(self.buffer())?; - visitor.visit_validity(&self.validity()) - } -} diff --git a/vortex-array2/src/array/primitive/stats.rs b/vortex-array2/src/array/primitive/stats.rs deleted file mode 100644 index e7a74eaf62..0000000000 --- a/vortex-array2/src/array/primitive/stats.rs +++ /dev/null @@ -1,293 +0,0 @@ -use std::collections::HashMap; -use std::mem::size_of; - -use arrow_buffer::buffer::BooleanBuffer; -use vortex::match_each_native_ptype; -use vortex::ptype::NativePType; -use vortex::scalar::Scalar; -use vortex::scalar::{ListScalarVec, PScalar}; -use vortex_error::VortexResult; - -use crate::array::primitive::PrimitiveArray; -use crate::stats::{ArrayStatisticsCompute, Stat}; -use crate::validity::ArrayValidity; -use crate::validity::LogicalValidity; -use crate::IntoArray; - -impl ArrayStatisticsCompute for PrimitiveArray<'_> { - fn compute_statistics(&self, stat: Stat) -> VortexResult> { - match_each_native_ptype!(self.ptype(), |$P| { - match self.logical_validity() { - LogicalValidity::AllValid(_) => self.typed_data::<$P>().compute_statistics(stat), - LogicalValidity::AllInvalid(_) => all_null_stats::<$P>(), - LogicalValidity::Array(a) => NullableValues( - self.typed_data::<$P>(), - &a.into_array().flatten_bool()?.boolean_buffer(), - ) - .compute_statistics(stat), - } - }) - } -} - -impl ArrayStatisticsCompute for &[T] { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - if self.is_empty() { - return Ok(HashMap::default()); - } - let mut stats = StatsAccumulator::new(self[0]); - self.iter().skip(1).for_each(|next| stats.next(*next)); - Ok(stats.into_map()) - } -} - -fn all_null_stats() -> VortexResult> { - Ok(HashMap::from([ - (Stat::Min, Option::::None.into()), - (Stat::Max, Option::::None.into()), - (Stat::IsConstant, true.into()), - (Stat::IsSorted, true.into()), - (Stat::IsStrictSorted, true.into()), - (Stat::RunCount, 1.into()), - (Stat::NullCount, 1.into()), - ( - Stat::BitWidthFreq, - ListScalarVec(vec![0; size_of::() * 8 + 1]).into(), - ), - ( - Stat::TrailingZeroFreq, - ListScalarVec(vec![size_of::() * 8; size_of::() * 8 + 1]).into(), - ), - ])) -} - -struct NullableValues<'a, T: NativePType>(&'a [T], &'a BooleanBuffer); - -impl<'a, T: NativePType> ArrayStatisticsCompute for NullableValues<'a, T> { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - let values = self.0; - if values.is_empty() { - return Ok(HashMap::default()); - } - - let first_non_null = self - .1 - .iter() - .enumerate() - .skip_while(|(_, valid)| !*valid) - .map(|(idx, _)| values[idx]) - .next() - .expect("Must be at least one non-null value"); - - let mut stats = StatsAccumulator::new(first_non_null); - values - .iter() - .zip(self.1.iter()) - .skip(1) - .map(|(next, valid)| valid.then_some(*next)) - .for_each(|next| stats.nullable_next(next)); - Ok(stats.into_map()) - } -} - -trait BitWidth { - fn bit_width(self) -> usize; - fn trailing_zeros(self) -> usize; -} - -impl> BitWidth for T { - fn bit_width(self) -> usize { - let bit_width = size_of::() * 8; - let scalar: PScalar = self.into(); - match scalar { - PScalar::U8(i) => bit_width - i.leading_zeros() as usize, - PScalar::U16(i) => bit_width - i.leading_zeros() as usize, - PScalar::U32(i) => bit_width - i.leading_zeros() as usize, - PScalar::U64(i) => bit_width - i.leading_zeros() as usize, - PScalar::I8(i) => bit_width - i.leading_zeros() as usize, - PScalar::I16(i) => bit_width - i.leading_zeros() as usize, - PScalar::I32(i) => bit_width - i.leading_zeros() as usize, - PScalar::I64(i) => bit_width - i.leading_zeros() as usize, - PScalar::F16(_) => bit_width, - PScalar::F32(_) => bit_width, - PScalar::F64(_) => bit_width, - } - } - - fn trailing_zeros(self) -> usize { - let scalar: PScalar = self.into(); - match scalar { - PScalar::U8(i) => i.trailing_zeros() as usize, - PScalar::U16(i) => i.trailing_zeros() as usize, - PScalar::U32(i) => i.trailing_zeros() as usize, - PScalar::U64(i) => i.trailing_zeros() as usize, - PScalar::I8(i) => i.trailing_zeros() as usize, - PScalar::I16(i) => i.trailing_zeros() as usize, - PScalar::I32(i) => i.trailing_zeros() as usize, - PScalar::I64(i) => i.trailing_zeros() as usize, - PScalar::F16(_) => 0, - PScalar::F32(_) => 0, - PScalar::F64(_) => 0, - } - } -} - -struct StatsAccumulator { - prev: T, - min: T, - max: T, - is_sorted: bool, - is_strict_sorted: bool, - run_count: usize, - null_count: usize, - bit_widths: Vec, - trailing_zeros: Vec, -} - -impl StatsAccumulator { - fn new(first_value: T) -> Self { - let mut stats = Self { - prev: first_value, - min: first_value, - max: first_value, - is_sorted: true, - is_strict_sorted: true, - run_count: 1, - null_count: 0, - bit_widths: vec![0; size_of::() * 8 + 1], - trailing_zeros: vec![0; size_of::() * 8 + 1], - }; - stats.bit_widths[first_value.bit_width()] += 1; - stats.trailing_zeros[first_value.trailing_zeros()] += 1; - stats - } - - pub fn nullable_next(&mut self, next: Option) { - match next { - Some(n) => self.next(n), - None => { - self.bit_widths[0] += 1; - self.trailing_zeros[T::PTYPE.bit_width()] += 1; - self.null_count += 1; - } - } - } - - pub fn next(&mut self, next: T) { - self.bit_widths[next.bit_width()] += 1; - self.trailing_zeros[next.trailing_zeros()] += 1; - - if self.prev == next { - self.is_strict_sorted = false; - } else { - if next < self.prev { - self.is_sorted = false; - } - self.run_count += 1; - } - if next < self.min { - self.min = next; - } else if next > self.max { - self.max = next; - } - self.prev = next; - } - - pub fn into_map(self) -> HashMap { - HashMap::from([ - (Stat::Min, self.min.into()), - (Stat::Max, self.max.into()), - (Stat::NullCount, self.null_count.into()), - (Stat::IsConstant, (self.min == self.max).into()), - (Stat::BitWidthFreq, ListScalarVec(self.bit_widths).into()), - ( - Stat::TrailingZeroFreq, - ListScalarVec(self.trailing_zeros).into(), - ), - (Stat::IsSorted, self.is_sorted.into()), - ( - Stat::IsStrictSorted, - (self.is_sorted && self.is_strict_sorted).into(), - ), - (Stat::RunCount, self.run_count.into()), - ]) - } -} - -#[cfg(test)] -mod test { - use vortex::scalar::ListScalarVec; - - use crate::array::primitive::PrimitiveArray; - use crate::stats::{ArrayStatistics, Stat}; - - #[test] - fn stats() { - let arr = PrimitiveArray::from(vec![1, 2, 3, 4, 5]); - let min: i32 = arr.statistics().compute_as(Stat::Min).unwrap(); - let max: i32 = arr.statistics().compute_as(Stat::Max).unwrap(); - let is_sorted: bool = arr.statistics().compute_as(Stat::IsSorted).unwrap(); - let is_strict_sorted: bool = arr.statistics().compute_as(Stat::IsStrictSorted).unwrap(); - let is_constant: bool = arr.statistics().compute_as(Stat::IsConstant).unwrap(); - let bit_width_freq: Vec = arr - .statistics() - .compute_as::>(Stat::BitWidthFreq) - .unwrap() - .0; - let trailing_zeros_freq: Vec = arr - .statistics() - .compute_as::>(Stat::TrailingZeroFreq) - .unwrap() - .0; - let run_count: u64 = arr.statistics().compute_as(Stat::RunCount).unwrap(); - assert_eq!(min, 1); - assert_eq!(max, 5); - assert!(is_sorted); - assert!(is_strict_sorted); - assert!(!is_constant); - assert_eq!( - bit_width_freq, - vec![ - 0u64, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ] - ); - assert_eq!( - trailing_zeros_freq, - vec![ - // 1, 3, 5 have 0 trailing zeros - // 2 has 1 trailing zero, 4 has 2 trailing zeros - 3u64, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ] - ); - assert_eq!(run_count, 5); - } - - #[test] - fn stats_u8() { - let arr = PrimitiveArray::from(vec![1u8, 2, 3, 4, 5]); - let min: u8 = arr.statistics().compute_as(Stat::Min).unwrap(); - let max: u8 = arr.statistics().compute_as(Stat::Max).unwrap(); - assert_eq!(min, 1); - assert_eq!(max, 5); - } - - #[test] - fn nullable_stats_u8() { - let arr = PrimitiveArray::from_nullable_vec(vec![None, Some(1i32), None, Some(2)]); - let min: Option = arr.statistics().compute_as(Stat::Min); - let max: Option = arr.statistics().compute_as(Stat::Max); - assert_eq!(min, Some(1)); - assert_eq!(max, Some(2)); - } - - #[test] - fn all_null() { - let arr = PrimitiveArray::from_nullable_vec(vec![Option::::None, None, None]); - let min: Option = arr.statistics().compute_as(Stat::Min); - let max: Option = arr.statistics().compute_as(Stat::Max); - assert_eq!(min, None); - assert_eq!(max, None); - } -} diff --git a/vortex-array2/src/array/varbin/accessor.rs b/vortex-array2/src/array/varbin/accessor.rs deleted file mode 100644 index 17c62cd64a..0000000000 --- a/vortex-array2/src/array/varbin/accessor.rs +++ /dev/null @@ -1,49 +0,0 @@ -use vortex::match_each_integer_ptype; -use vortex_error::VortexResult; - -use crate::accessor::ArrayAccessor; -use crate::array::varbin::VarBinArray; -use crate::validity::ArrayValidity; - -impl ArrayAccessor for VarBinArray<'_> { - type Item<'a> = Option<&'a [u8]>; - - fn with_iterator FnOnce(&mut dyn Iterator>) -> R, R>( - &self, - f: F, - ) -> VortexResult { - // TODO(ngates): what happens if bytes is much larger than sliced_bytes? - let primitive = self.bytes().flatten_primitive()?; - let offsets = self.offsets().flatten_primitive()?; - let validity = self.logical_validity().to_null_buffer()?; - - match_each_integer_ptype!(offsets.ptype(), |$T| { - let offsets = offsets.typed_data::<$T>(); - let bytes = primitive.typed_data::(); - - match validity { - None => { - let mut iter = offsets - .iter() - .zip(offsets.iter().skip(1)) - .map(|(start, end)| Some(&bytes[*start as usize..*end as usize])); - Ok(f(&mut iter)) - } - Some(validity) => { - let mut iter = offsets - .iter() - .zip(offsets.iter().skip(1)) - .zip(validity.iter()) - .map(|((start, end), valid)| { - if valid { - Some(&bytes[*start as usize..*end as usize]) - } else { - None - } - }); - Ok(f(&mut iter)) - } - } - }) - } -} diff --git a/vortex-array2/src/array/varbin/builder.rs b/vortex-array2/src/array/varbin/builder.rs deleted file mode 100644 index 205919c580..0000000000 --- a/vortex-array2/src/array/varbin/builder.rs +++ /dev/null @@ -1,94 +0,0 @@ -use std::mem; - -use arrow_buffer::NullBufferBuilder; -use vortex::ptype::NativePType; -use vortex_schema::DType; - -use crate::array::primitive::PrimitiveArray; -use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; -use crate::validity::Validity; -use crate::IntoArray; - -pub struct VarBinBuilder { - offsets: Vec, - data: Vec, - validity: NullBufferBuilder, -} - -impl VarBinBuilder { - pub fn with_capacity(len: usize) -> Self { - let mut offsets = Vec::with_capacity(len + 1); - offsets.push(O::zero()); - Self { - offsets, - data: Vec::new(), - validity: NullBufferBuilder::new(len), - } - } - - #[inline] - pub fn push(&mut self, value: Option<&[u8]>) { - match value { - Some(v) => self.push_value(v), - None => self.push_null(), - } - } - - #[inline] - pub fn push_value(&mut self, value: &[u8]) { - self.offsets - .push(O::from(self.data.len() + value.len()).unwrap()); - self.data.extend_from_slice(value); - self.validity.append_non_null(); - } - - #[inline] - pub fn push_null(&mut self) { - self.offsets.push(self.offsets[self.offsets.len() - 1]); - self.validity.append_null(); - } - - pub fn finish(&mut self, dtype: DType) -> OwnedVarBinArray { - let offsets = PrimitiveArray::from(mem::take(&mut self.offsets)); - let data = PrimitiveArray::from(mem::take(&mut self.data)); - - let nulls = self.validity.finish(); - - let validity = if dtype.is_nullable() { - nulls.map(Validity::from).unwrap_or(Validity::AllValid) - } else { - assert!(nulls.is_none(), "dtype and validity mismatch"); - Validity::NonNullable - }; - - VarBinArray::new(offsets.into_array(), data.into_array(), dtype, validity) - } -} - -#[cfg(test)] -mod test { - use vortex::scalar::Utf8Scalar; - use vortex_schema::DType; - use vortex_schema::Nullability::Nullable; - - use crate::array::varbin::builder::VarBinBuilder; - use crate::compute::scalar_at::scalar_at; - use crate::IntoArray; - - #[test] - fn test_builder() { - let mut builder = VarBinBuilder::::with_capacity(0); - builder.push(Some(b"hello")); - builder.push(None); - builder.push(Some(b"world")); - let array = builder.finish(DType::Utf8(Nullable)).into_array(); - - assert_eq!(array.len(), 3); - assert_eq!(array.dtype().nullability(), Nullable); - assert_eq!( - scalar_at(&array, 0).unwrap(), - Utf8Scalar::nullable("hello".to_owned()).into() - ); - assert!(scalar_at(&array, 1).unwrap().is_null()); - } -} diff --git a/vortex-array2/src/array/varbin/compute/mod.rs b/vortex-array2/src/array/varbin/compute/mod.rs deleted file mode 100644 index c28a1ac5ce..0000000000 --- a/vortex-array2/src/array/varbin/compute/mod.rs +++ /dev/null @@ -1,144 +0,0 @@ -use std::sync::Arc; - -use arrow_array::{ - ArrayRef as ArrowArrayRef, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, -}; -use itertools::Itertools; -use vortex::ptype::PType; -use vortex::scalar::Scalar; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; - -use crate::array::primitive::PrimitiveArray; -use crate::array::varbin::{varbin_scalar, VarBinArray}; -use crate::arrow::wrappers::as_offset_buffer; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn}; -use crate::compute::cast::cast; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::slice::SliceFn; -use crate::compute::take::TakeFn; -use crate::compute::ArrayCompute; -use crate::validity::{ArrayValidity, Validity}; -use crate::{Array, IntoArray, OwnedArray, ToArray}; - -mod slice; -mod take; - -impl ArrayCompute for VarBinArray<'_> { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - Some(self) - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - Some(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - Some(self) - } - - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl AsContiguousFn for VarBinArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { - let bytes_chunks: Vec = arrays - .iter() - .map(|a| VarBinArray::try_from(a).unwrap().sliced_bytes()) - .try_collect()?; - let bytes = as_contiguous(&bytes_chunks)?; - - let validity = if self.dtype().is_nullable() { - Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) - } else { - Validity::NonNullable - }; - - let mut offsets = Vec::new(); - offsets.push(0); - for a in arrays.iter().map(|a| VarBinArray::try_from(a).unwrap()) { - let first_offset: u64 = a.first_offset()?; - let offsets_array = cast(&a.offsets(), PType::U64.into())?.flatten_primitive()?; - let shift = offsets.last().copied().unwrap_or(0); - offsets.extend( - offsets_array - .typed_data::() - .iter() - .skip(1) // Ignore the zero offset for each array - .map(|o| o + shift - first_offset), - ); - } - - let offsets_array = PrimitiveArray::from(offsets).into_array(); - - Ok(VarBinArray::new(offsets_array, bytes, self.dtype().clone(), validity).into_array()) - } -} - -impl AsArrowArray for VarBinArray<'_> { - fn as_arrow(&self) -> VortexResult { - // Ensure the offsets are either i32 or i64 - let offsets = self.offsets().flatten_primitive()?; - let offsets = match offsets.ptype() { - PType::I32 | PType::I64 => offsets, - // Unless it's u64, everything else can be converted into an i32. - // FIXME(ngates): do not copy offsets again - PType::U64 => cast(&offsets.to_array(), PType::I64.into())?.flatten_primitive()?, - _ => cast(&offsets.to_array(), PType::I32.into())?.flatten_primitive()?, - }; - let nulls = self.logical_validity().to_null_buffer()?; - - let data = self.bytes().flatten_primitive()?; - assert_eq!(data.ptype(), PType::U8); - let data = data.buffer(); - - // Switch on Arrow DType. - Ok(match self.dtype() { - DType::Binary(_) => match offsets.ptype() { - PType::I32 => Arc::new(BinaryArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - PType::I64 => Arc::new(LargeBinaryArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - _ => panic!("Invalid offsets type"), - }, - DType::Utf8(_) => match offsets.ptype() { - PType::I32 => Arc::new(StringArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - PType::I64 => Arc::new(LargeStringArray::new( - as_offset_buffer::(offsets), - data.into(), - nulls, - )), - _ => panic!("Invalid offsets type"), - }, - _ => vortex_bail!(MismatchedTypes: "utf8 or binary", self.dtype()), - }) - } -} - -impl ScalarAtFn for VarBinArray<'_> { - fn scalar_at(&self, index: usize) -> VortexResult { - if self.is_valid(index) { - self.bytes_at(index) - .map(|bytes| varbin_scalar(bytes, self.dtype())) - } else { - Ok(Scalar::null(self.dtype())) - } - } -} diff --git a/vortex-array2/src/array/varbin/compute/slice.rs b/vortex-array2/src/array/varbin/compute/slice.rs deleted file mode 100644 index 8aecb1ce6d..0000000000 --- a/vortex-array2/src/array/varbin/compute/slice.rs +++ /dev/null @@ -1,17 +0,0 @@ -use vortex_error::VortexResult; - -use crate::array::varbin::VarBinArray; -use crate::compute::slice::{slice, SliceFn}; -use crate::{IntoArray, OwnedArray}; - -impl SliceFn for VarBinArray<'_> { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(VarBinArray::new( - slice(&self.offsets(), start, stop + 1)?, - self.bytes().clone(), - self.dtype().clone(), - self.validity().slice(start, stop)?, - ) - .into_array()) - } -} diff --git a/vortex-array2/src/array/varbin/compute/take.rs b/vortex-array2/src/array/varbin/compute/take.rs deleted file mode 100644 index 18d2b17901..0000000000 --- a/vortex-array2/src/array/varbin/compute/take.rs +++ /dev/null @@ -1,80 +0,0 @@ -use arrow_buffer::NullBuffer; -use vortex::match_each_integer_ptype; -use vortex::ptype::NativePType; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::array::varbin::builder::VarBinBuilder; -use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; -use crate::compute::take::TakeFn; -use crate::validity::Validity; -use crate::IntoArray; -use crate::{Array, OwnedArray}; - -impl TakeFn for VarBinArray<'_> { - fn take(&self, indices: &Array) -> VortexResult { - // TODO(ngates): support i64 indices. - assert!( - indices.len() < i32::MAX as usize, - "indices.len() must be less than i32::MAX" - ); - - let offsets = self.offsets().flatten_primitive()?; - let data = self.bytes().flatten_primitive()?; - let indices = indices.clone().flatten_primitive()?; - match_each_integer_ptype!(offsets.ptype(), |$O| { - match_each_integer_ptype!(indices.ptype(), |$I| { - Ok(take( - self.dtype().clone(), - offsets.typed_data::<$O>(), - data.typed_data::(), - indices.typed_data::<$I>(), - self.validity(), - )?.into_array()) - }) - }) - } -} - -fn take( - dtype: DType, - offsets: &[O], - data: &[u8], - indices: &[I], - validity: Validity, -) -> VortexResult { - let logical_validity = validity.to_logical(offsets.len() - 1); - if let Some(v) = logical_validity.to_null_buffer()? { - return Ok(take_nullable(dtype, offsets, data, indices, v)); - } - - let mut builder = VarBinBuilder::::with_capacity(indices.len()); - for &idx in indices { - let idx = idx.to_usize().unwrap(); - let start = offsets[idx].to_usize().unwrap(); - let stop = offsets[idx + 1].to_usize().unwrap(); - builder.push(Some(&data[start..stop])); - } - Ok(builder.finish(dtype)) -} - -fn take_nullable( - dtype: DType, - offsets: &[O], - data: &[u8], - indices: &[I], - null_buffer: NullBuffer, -) -> OwnedVarBinArray { - let mut builder = VarBinBuilder::::with_capacity(indices.len()); - for &idx in indices { - let idx = idx.to_usize().unwrap(); - if null_buffer.is_valid(idx) { - let start = offsets[idx].to_usize().unwrap(); - let stop = offsets[idx + 1].to_usize().unwrap(); - builder.push(Some(&data[start..stop])); - } else { - builder.push(None); - } - } - builder.finish(dtype) -} diff --git a/vortex-array2/src/array/varbin/mod.rs b/vortex-array2/src/array/varbin/mod.rs deleted file mode 100644 index 18240197b4..0000000000 --- a/vortex-array2/src/array/varbin/mod.rs +++ /dev/null @@ -1,277 +0,0 @@ -use std::collections::HashMap; - -use num_traits::AsPrimitive; -use serde::{Deserialize, Serialize}; -use vortex::match_each_native_ptype; -use vortex::ptype::NativePType; -use vortex::scalar::{BinaryScalar, Scalar, Utf8Scalar}; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; - -use crate::array::primitive::PrimitiveArray; -use crate::array::varbin::builder::VarBinBuilder; -use crate::compute::scalar_at::scalar_at; -use crate::compute::slice::slice; -use crate::validity::{Validity, ValidityMetadata}; -use crate::{impl_encoding, OwnedArray, ToArrayData}; - -mod accessor; -mod array; -pub mod builder; -mod compute; -mod flatten; -mod stats; - -impl_encoding!("vortex.varbin", VarBin); - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct VarBinMetadata { - validity: ValidityMetadata, - offsets_dtype: DType, -} - -impl VarBinArray<'_> { - pub fn new(offsets: Array, bytes: Array, dtype: DType, validity: Validity) -> Self { - Self::try_new(offsets, bytes, dtype, validity).unwrap() - } - - pub fn try_new( - offsets: Array, - bytes: Array, - dtype: DType, - validity: Validity, - ) -> VortexResult { - if !matches!(offsets.dtype(), DType::Int(_, _, Nullability::NonNullable)) { - vortex_bail!(MismatchedTypes: "non nullable int", offsets.dtype()); - } - if !matches!( - bytes.dtype(), - DType::Int(IntWidth::_8, Signedness::Unsigned, Nullability::NonNullable) - ) { - vortex_bail!(MismatchedTypes: "u8", bytes.dtype()); - } - if !matches!(dtype, DType::Binary(_) | DType::Utf8(_)) { - vortex_bail!(MismatchedTypes: "utf8 or binary", dtype); - } - if dtype.is_nullable() == (validity == Validity::NonNullable) { - vortex_bail!("incorrect validity {:?}", validity); - } - - let metadata = VarBinMetadata { - validity: validity.to_metadata(offsets.len() - 1)?, - offsets_dtype: offsets.dtype().clone(), - }; - - let mut children = Vec::with_capacity(3); - children.push(offsets.to_array_data()); - children.push(bytes.to_array_data()); - if let Some(a) = validity.into_array_data() { - children.push(a) - } - - Self::try_from_parts( - dtype, - metadata, - vec![].into(), - children.into(), - HashMap::default(), - ) - } - - #[inline] - pub fn offsets(&self) -> Array { - self.array() - .child(0, &self.metadata().offsets_dtype) - .expect("missing offsets") - } - - pub fn first_offset(&self) -> VortexResult { - scalar_at(&self.offsets(), 0)? - .cast(&DType::from(T::PTYPE))? - .try_into() - } - - #[inline] - pub fn bytes(&self) -> Array { - self.array().child(1, &DType::BYTES).expect("missing bytes") - } - - pub fn validity(&self) -> Validity { - self.metadata() - .validity - .to_validity(self.array().child(2, &Validity::DTYPE)) - } - - pub fn sliced_bytes(&self) -> VortexResult { - let first_offset: usize = scalar_at(&self.offsets(), 0)?.try_into()?; - let last_offset: usize = - scalar_at(&self.offsets(), self.offsets().len() - 1)?.try_into()?; - slice(&self.bytes(), first_offset, last_offset) - } - - pub fn from_vec>(vec: Vec, dtype: DType) -> Self { - let size: usize = vec.iter().map(|v| v.as_ref().len()).sum(); - if size < u32::MAX as usize { - Self::from_vec_sized::(vec, dtype) - } else { - Self::from_vec_sized::(vec, dtype) - } - } - - fn from_vec_sized(vec: Vec, dtype: DType) -> Self - where - K: NativePType, - T: AsRef<[u8]>, - { - let mut builder = VarBinBuilder::::with_capacity(vec.len()); - for v in vec { - builder.push_value(v.as_ref()); - } - builder.finish(dtype) - } - - pub fn from_iter, I: IntoIterator>>( - iter: I, - dtype: DType, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinBuilder::::with_capacity(iter.size_hint().0); - for v in iter { - builder.push(v.as_ref().map(|o| o.as_ref())); - } - builder.finish(dtype) - } - - pub(self) fn offset_at(&self, index: usize) -> usize { - PrimitiveArray::try_from(self.offsets()) - .ok() - .map(|p| { - match_each_native_ptype!(p.ptype(), |$P| { - p.typed_data::<$P>()[index].as_() - }) - }) - .unwrap_or_else(|| { - scalar_at(&self.offsets(), index) - .unwrap() - .try_into() - .unwrap() - }) - } - - pub fn bytes_at(&self, index: usize) -> VortexResult> { - let start = self.offset_at(index); - let end = self.offset_at(index + 1); - let sliced = slice(&self.bytes(), start, end)?; - Ok(sliced.flatten_primitive()?.buffer().as_slice().to_vec()) - } -} - -impl From> for VarBinArray<'_> { - fn from(value: Vec<&[u8]>) -> Self { - VarBinArray::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From>> for VarBinArray<'_> { - fn from(value: Vec>) -> Self { - VarBinArray::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From> for VarBinArray<'_> { - fn from(value: Vec) -> Self { - VarBinArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - -impl From> for VarBinArray<'_> { - fn from(value: Vec<&str>) -> Self { - VarBinArray::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - -impl<'a> FromIterator> for VarBinArray<'_> { - fn from_iter>>(iter: T) -> Self { - VarBinArray::from_iter(iter, DType::Binary(Nullability::Nullable)) - } -} - -impl FromIterator>> for VarBinArray<'_> { - fn from_iter>>>(iter: T) -> Self { - VarBinArray::from_iter(iter, DType::Binary(Nullability::Nullable)) - } -} - -impl FromIterator> for VarBinArray<'_> { - fn from_iter>>(iter: T) -> Self { - VarBinArray::from_iter(iter, DType::Utf8(Nullability::Nullable)) - } -} - -impl<'a> FromIterator> for VarBinArray<'_> { - fn from_iter>>(iter: T) -> Self { - VarBinArray::from_iter(iter, DType::Utf8(Nullability::Nullable)) - } -} - -pub fn varbin_scalar(value: Vec, dtype: &DType) -> Scalar { - if matches!(dtype, DType::Utf8(_)) { - let str = unsafe { String::from_utf8_unchecked(value) }; - Utf8Scalar::try_new(Some(str), dtype.nullability()) - .unwrap() - .into() - } else { - BinaryScalar::try_new(Some(value), dtype.nullability()) - .unwrap() - .into() - } -} - -#[cfg(test)] -mod test { - use vortex_schema::{DType, Nullability}; - - use crate::array::primitive::PrimitiveArray; - use crate::array::varbin::VarBinArray; - use crate::compute::scalar_at::scalar_at; - use crate::compute::slice::slice; - use crate::validity::Validity; - use crate::{IntoArray, OwnedArray}; - - fn binary_array() -> OwnedArray { - let values = PrimitiveArray::from( - "hello worldhello world this is a long string" - .as_bytes() - .to_vec(), - ); - let offsets = PrimitiveArray::from(vec![0, 11, 44]); - - VarBinArray::new( - offsets.into_array(), - values.into_array(), - DType::Utf8(Nullability::NonNullable), - Validity::NonNullable, - ) - .into_array() - } - - #[test] - pub fn test_scalar_at() { - let binary_arr = binary_array(); - assert_eq!(binary_arr.len(), 2); - assert_eq!(scalar_at(&binary_arr, 0).unwrap(), "hello world".into()); - assert_eq!( - scalar_at(&binary_arr, 1).unwrap(), - "hello world this is a long string".into() - ) - } - - #[test] - pub fn slice_array() { - let binary_arr = slice(&binary_array(), 1, 2).unwrap(); - assert_eq!( - scalar_at(&binary_arr, 0).unwrap(), - "hello world this is a long string".into() - ); - } -} diff --git a/vortex-array2/src/array/varbin/stats.rs b/vortex-array2/src/array/varbin/stats.rs deleted file mode 100644 index 1cd0f6697b..0000000000 --- a/vortex-array2/src/array/varbin/stats.rs +++ /dev/null @@ -1,152 +0,0 @@ -use std::cmp::Ordering; -use std::collections::HashMap; - -use vortex::scalar::Scalar; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::accessor::ArrayAccessor; -use crate::array::varbin::{varbin_scalar, VarBinArray}; -use crate::stats::{ArrayStatisticsCompute, Stat}; - -impl ArrayStatisticsCompute for VarBinArray<'_> { - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - self.with_iterator(|iter| { - let mut acc = VarBinAccumulator::default(); - for next_val in iter { - acc.nullable_next(next_val) - } - acc.finish(self.dtype()) - }) - } -} - -pub struct VarBinAccumulator<'a> { - min: &'a [u8], - max: &'a [u8], - is_constant: bool, - is_sorted: bool, - is_strict_sorted: bool, - last_value: &'a [u8], - null_count: usize, - runs: usize, -} - -impl Default for VarBinAccumulator<'_> { - fn default() -> Self { - Self { - min: &[0xFF], - max: &[0x00], - is_constant: true, - is_sorted: true, - is_strict_sorted: true, - last_value: &[0x00], - runs: 0, - null_count: 0, - } - } -} - -impl<'a> VarBinAccumulator<'a> { - pub fn nullable_next(&mut self, val: Option<&'a [u8]>) { - match val { - None => self.null_count += 1, - Some(v) => self.next(v), - } - } - - pub fn next(&mut self, val: &'a [u8]) { - if val < self.min { - self.min.clone_from(&val); - } else if val > self.max { - self.max.clone_from(&val); - } - - match val.cmp(self.last_value) { - Ordering::Less => self.is_sorted = false, - Ordering::Equal => { - self.is_strict_sorted = false; - return; - } - Ordering::Greater => {} - } - self.is_constant = false; - self.last_value = val; - self.runs += 1; - } - - pub fn finish(&self, dtype: &DType) -> HashMap { - HashMap::from([ - (Stat::Min, varbin_scalar(self.min.to_vec(), dtype)), - (Stat::Max, varbin_scalar(self.max.to_vec(), dtype)), - (Stat::RunCount, self.runs.into()), - (Stat::IsSorted, self.is_sorted.into()), - (Stat::IsStrictSorted, self.is_strict_sorted.into()), - (Stat::IsConstant, self.is_constant.into()), - (Stat::NullCount, self.null_count.into()), - ]) - } -} - -#[cfg(test)] -mod test { - use vortex_schema::{DType, Nullability}; - - use crate::array::varbin::{OwnedVarBinArray, VarBinArray}; - use crate::stats::{ArrayStatistics, Stat}; - - fn array(dtype: DType) -> OwnedVarBinArray { - VarBinArray::from_vec( - vec!["hello world", "hello world this is a long string"], - dtype, - ) - } - - #[test] - fn utf8_stats() { - let arr = array(DType::Utf8(Nullability::NonNullable)); - assert_eq!( - arr.statistics().compute_as::(Stat::Min).unwrap(), - String::from("hello world") - ); - assert_eq!( - arr.statistics().compute_as::(Stat::Max).unwrap(), - String::from("hello world this is a long string") - ); - assert_eq!( - arr.statistics() - .compute_as::(Stat::RunCount) - .unwrap(), - 2 - ); - assert!(!arr - .statistics() - .compute_as::(Stat::IsConstant) - .unwrap()); - assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); - } - - #[test] - fn binary_stats() { - let arr = array(DType::Binary(Nullability::NonNullable)); - assert_eq!( - arr.statistics().compute_as::>(Stat::Min).unwrap(), - "hello world".as_bytes().to_vec() - ); - assert_eq!( - arr.statistics().compute_as::>(Stat::Max).unwrap(), - "hello world this is a long string".as_bytes().to_vec() - ); - assert_eq!( - arr.statistics() - .compute_as::(Stat::RunCount) - .unwrap(), - 2 - ); - assert!(!arr - .statistics() - .compute_as::(Stat::IsConstant) - .unwrap()); - assert!(arr.statistics().compute_as::(Stat::IsSorted).unwrap()); - } -} diff --git a/vortex-array2/src/arrow/mod.rs b/vortex-array2/src/arrow/mod.rs deleted file mode 100644 index 119a42e810..0000000000 --- a/vortex-array2/src/arrow/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod array; -mod recordbatch; -pub mod wrappers; diff --git a/vortex-array2/src/arrow/recordbatch.rs b/vortex-array2/src/arrow/recordbatch.rs deleted file mode 100644 index a00b27a110..0000000000 --- a/vortex-array2/src/arrow/recordbatch.rs +++ /dev/null @@ -1,29 +0,0 @@ -use std::sync::Arc; - -use arrow_array::RecordBatch; - -use crate::array::r#struct::StructArray; -use crate::arrow::array::FromArrowArray; -use crate::{ArrayData, IntoArrayData, ToArrayData}; - -impl ToArrayData for RecordBatch { - fn to_array_data(&self) -> ArrayData { - StructArray::try_new( - self.schema() - .fields() - .iter() - .map(|f| f.name()) - .map(|s| s.to_owned()) - .map(Arc::new) - .collect(), - self.columns() - .iter() - .zip(self.schema().fields()) - .map(|(array, field)| ArrayData::from_arrow(array.clone(), field.is_nullable())) - .collect(), - self.num_rows(), - ) - .unwrap() - .into_array_data() - } -} diff --git a/vortex-array2/src/arrow/wrappers.rs b/vortex-array2/src/arrow/wrappers.rs deleted file mode 100644 index 3e7e8ae421..0000000000 --- a/vortex-array2/src/arrow/wrappers.rs +++ /dev/null @@ -1,13 +0,0 @@ -use arrow_buffer::{Buffer as ArrowBuffer, OffsetBuffer, ScalarBuffer}; -use vortex::ptype::NativePType; - -use crate::array::primitive::PrimitiveArray; - -pub fn as_scalar_buffer(array: PrimitiveArray<'_>) -> ScalarBuffer { - assert_eq!(array.ptype(), T::PTYPE); - ScalarBuffer::from(ArrowBuffer::from(array.buffer())) -} - -pub fn as_offset_buffer(array: PrimitiveArray<'_>) -> OffsetBuffer { - OffsetBuffer::new(as_scalar_buffer(array)) -} diff --git a/vortex-array2/src/compute/as_arrow.rs b/vortex-array2/src/compute/as_arrow.rs deleted file mode 100644 index bab448221c..0000000000 --- a/vortex-array2/src/compute/as_arrow.rs +++ /dev/null @@ -1,37 +0,0 @@ -use arrow_array::ArrayRef as ArrowArrayRef; -use vortex_error::{vortex_err, VortexResult}; - -use crate::{Array, IntoArray}; - -pub trait AsArrowArray { - fn as_arrow(&self) -> VortexResult; -} - -pub fn as_arrow(array: &Array) -> VortexResult { - array.with_dyn(|a| { - // If as_arrow is implemented, then invoke that. - if let Some(a) = a.as_arrow() { - return a.as_arrow(); - } - - // Otherwise, flatten and try again. - let array = array.clone().flatten()?.into_array(); - a.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "as_arrow", array.encoding().id().name())) - }) - }) -} - -// TODO(ngates): return a RecordBatchReader instead? -pub fn as_arrow_chunks(_array: &Array) -> VortexResult> { - todo!("PORT") - // if let Some(chunked) = array.as_data::() { - // chunked - // .chunks() - // .iter() - // .map(|a| as_arrow(a.as_ref())) - // .try_collect() - // } else { - // as_arrow(array).map(|a| vec![a]) - // } -} diff --git a/vortex-array2/src/compute/as_contiguous.rs b/vortex-array2/src/compute/as_contiguous.rs deleted file mode 100644 index ae6a9f006b..0000000000 --- a/vortex-array2/src/compute/as_contiguous.rs +++ /dev/null @@ -1,36 +0,0 @@ -use itertools::Itertools; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::Array; - -pub trait AsContiguousFn { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult>; -} - -pub fn as_contiguous(arrays: &[Array]) -> VortexResult> { - if arrays.is_empty() { - vortex_bail!(ComputeError: "No arrays to concatenate"); - } - if !arrays.iter().map(|chunk| chunk.encoding().id()).all_equal() { - vortex_bail!(ComputeError: - "Chunks have differing encodings", - ); - } - if !arrays.iter().map(|chunk| chunk.dtype()).all_equal() { - vortex_bail!(ComputeError: - "Chunks have differing dtypes", - ); - } - - let first = arrays.first().unwrap(); - first.with_dyn(|a| { - a.as_contiguous() - .map(|f| f.as_contiguous(arrays)) - .unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "as_contiguous", - first.encoding().id().name() - )) - }) - }) -} diff --git a/vortex-array2/src/compute/cast.rs b/vortex-array2/src/compute/cast.rs deleted file mode 100644 index 15fa2694e6..0000000000 --- a/vortex-array2/src/compute/cast.rs +++ /dev/null @@ -1,21 +0,0 @@ -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::DType; - -use crate::{Array, OwnedArray, ToStatic}; - -pub trait CastFn { - fn cast(&self, dtype: &DType) -> VortexResult; -} - -pub fn cast(array: &Array, dtype: &DType) -> VortexResult { - if array.dtype() == dtype { - return Ok(array.to_static()); - } - - // TODO(ngates): check for null_count if dtype is non-nullable - array.with_dyn(|a| { - a.cast().map(|f| f.cast(dtype)).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "cast", array.encoding().id().name())) - }) - }) -} diff --git a/vortex-array2/src/compute/fill.rs b/vortex-array2/src/compute/fill.rs deleted file mode 100644 index e984fc2231..0000000000 --- a/vortex-array2/src/compute/fill.rs +++ /dev/null @@ -1,24 +0,0 @@ -use vortex_error::{vortex_err, VortexResult}; - -use crate::{Array, OwnedArray, ToStatic}; - -pub trait FillForwardFn { - fn fill_forward(&self) -> VortexResult; -} - -pub fn fill_forward(array: &Array) -> VortexResult { - if !array.dtype().is_nullable() { - return Ok(array.to_static()); - } - - array.with_dyn(|a| { - a.fill_forward() - .map(|t| t.fill_forward()) - .unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "fill_forward", - array.encoding().id().name() - )) - }) - }) -} diff --git a/vortex-array2/src/compute/mod.rs b/vortex-array2/src/compute/mod.rs deleted file mode 100644 index 5ba54a0bda..0000000000 --- a/vortex-array2/src/compute/mod.rs +++ /dev/null @@ -1,57 +0,0 @@ -use as_arrow::AsArrowArray; -use as_contiguous::AsContiguousFn; -use cast::CastFn; -use fill::FillForwardFn; -use patch::PatchFn; -use scalar_at::ScalarAtFn; -use search_sorted::SearchSortedFn; -use slice::SliceFn; -use take::TakeFn; - -pub mod as_arrow; -pub mod as_contiguous; -pub mod cast; -pub mod fill; -pub mod patch; -pub mod scalar_at; -pub mod search_sorted; -pub mod slice; -pub mod take; - -pub trait ArrayCompute { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - None - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - None - } - - fn cast(&self) -> Option<&dyn CastFn> { - None - } - - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { - None - } - - fn patch(&self) -> Option<&dyn PatchFn> { - None - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - None - } - - fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { - None - } - - fn slice(&self) -> Option<&dyn SliceFn> { - None - } - - fn take(&self) -> Option<&dyn TakeFn> { - None - } -} diff --git a/vortex-array2/src/compute/patch.rs b/vortex-array2/src/compute/patch.rs deleted file mode 100644 index cd8a14d318..0000000000 --- a/vortex-array2/src/compute/patch.rs +++ /dev/null @@ -1,28 +0,0 @@ -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::{Array, OwnedArray}; - -pub trait PatchFn { - fn patch(&self, patch: &Array) -> VortexResult; -} - -/// Returns a new array where the non-null values from the patch array are replaced in the original. -pub fn patch(array: &Array, patch: &Array) -> VortexResult { - if array.len() != patch.len() { - vortex_bail!( - "patch array {} must have the same length as the original array {}", - patch, - array - ); - } - - if array.dtype().as_nullable() != patch.dtype().as_nullable() { - vortex_bail!(MismatchedTypes: array.dtype(), patch.dtype()); - } - - array.with_dyn(|a| { - a.patch().map(|t| t.patch(patch)).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) - }) - }) -} diff --git a/vortex-array2/src/compute/scalar_at.rs b/vortex-array2/src/compute/scalar_at.rs deleted file mode 100644 index 34d4f0df65..0000000000 --- a/vortex-array2/src/compute/scalar_at.rs +++ /dev/null @@ -1,22 +0,0 @@ -use vortex::scalar::Scalar; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::Array; - -pub trait ScalarAtFn { - fn scalar_at(&self, index: usize) -> VortexResult; -} - -pub fn scalar_at(array: &Array, index: usize) -> VortexResult { - if index >= array.len() { - vortex_bail!(OutOfBounds: index, 0, array.len()); - } - - array.with_dyn(|a| { - a.scalar_at() - .map(|t| t.scalar_at(index)) - .unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "scalar_at", array.encoding().id().name())) - }) - }) -} diff --git a/vortex-array2/src/compute/search_sorted.rs b/vortex-array2/src/compute/search_sorted.rs deleted file mode 100644 index 4bd44e3adc..0000000000 --- a/vortex-array2/src/compute/search_sorted.rs +++ /dev/null @@ -1,144 +0,0 @@ -use std::cmp::Ordering; -use std::cmp::Ordering::{Equal, Greater, Less}; - -use vortex::scalar::Scalar; -use vortex_error::{vortex_err, VortexResult}; - -use crate::compute::scalar_at::scalar_at; -use crate::Array; - -#[derive(Debug, Copy, Clone)] -pub enum SearchSortedSide { - Left, - Right, -} - -pub trait SearchSortedFn { - fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult; -} - -pub fn search_sorted>( - array: &Array, - target: T, - side: SearchSortedSide, -) -> VortexResult { - let scalar = target.into().cast(array.dtype())?; - array.with_dyn(|a| { - if let Some(search_sorted) = a.search_sorted() { - return search_sorted.search_sorted(&scalar, side); - } - - if a.scalar_at().is_some() { - return Ok(SearchSorted::search_sorted(&array, &scalar, side)); - } - - Err(vortex_err!( - NotImplemented: "search_sorted", - array.encoding().id().name() - )) - }) -} - -pub trait IndexOrd { - fn index_cmp(&self, idx: usize, elem: &V) -> Option; - - fn index_lt(&self, idx: usize, elem: &V) -> bool { - matches!(self.index_cmp(idx, elem), Some(Less)) - } - - fn index_le(&self, idx: usize, elem: &V) -> bool { - matches!(self.index_cmp(idx, elem), Some(Less | Equal)) - } - - fn index_gt(&self, idx: usize, elem: &V) -> bool { - matches!(self.index_cmp(idx, elem), Some(Greater)) - } - - fn index_ge(&self, idx: usize, elem: &V) -> bool { - matches!(self.index_cmp(idx, elem), Some(Greater | Equal)) - } -} - -#[allow(clippy::len_without_is_empty)] -pub trait Len { - fn len(&self) -> usize; -} - -pub trait SearchSorted { - fn search_sorted(&self, value: &T, side: SearchSortedSide) -> usize - where - Self: IndexOrd, - { - match side { - SearchSortedSide::Left => self.search_sorted_by(|idx| { - if self.index_lt(idx, value) { - Less - } else { - Greater - } - }), - SearchSortedSide::Right => self.search_sorted_by(|idx| { - if self.index_le(idx, value) { - Less - } else { - Greater - } - }), - } - } - - fn search_sorted_by Ordering>(&self, f: F) -> usize; -} - -impl + Len + ?Sized, T> SearchSorted for S { - // Code adapted from Rust standard library slice::binary_search_by - fn search_sorted_by Ordering>(&self, mut f: F) -> usize { - // INVARIANTS: - // - 0 <= left <= left + size = right <= self.len() - // - f returns Less for everything in self[..left] - // - f returns Greater for everything in self[right..] - let mut size = self.len(); - let mut left = 0; - let mut right = size; - while left < right { - let mid = left + size / 2; - let cmp = f(mid); - - left = if cmp == Less { mid + 1 } else { left }; - right = if cmp == Greater { mid } else { right }; - if cmp == Equal { - return mid; - } - - size = right - left; - } - - left - } -} - -impl IndexOrd for &Array<'_> { - fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { - let scalar_a = scalar_at(self, idx).ok()?; - scalar_a.partial_cmp(elem) - } -} - -impl IndexOrd for [T] { - fn index_cmp(&self, idx: usize, elem: &T) -> Option { - // SAFETY: Used in search_sorted_by same as the standard library. The search_sorted ensures idx is in bounds - unsafe { self.get_unchecked(idx) }.partial_cmp(elem) - } -} - -impl Len for &Array<'_> { - fn len(&self) -> usize { - Array::len(self) - } -} - -impl Len for [T] { - fn len(&self) -> usize { - self.len() - } -} diff --git a/vortex-array2/src/compute/slice.rs b/vortex-array2/src/compute/slice.rs deleted file mode 100644 index e8358e8b23..0000000000 --- a/vortex-array2/src/compute/slice.rs +++ /dev/null @@ -1,31 +0,0 @@ -use vortex_error::{vortex_bail, vortex_err, VortexResult}; - -use crate::{Array, OwnedArray}; - -/// Limit array to start..stop range -pub trait SliceFn { - fn slice(&self, start: usize, stop: usize) -> VortexResult; -} - -pub fn slice(array: &Array, start: usize, stop: usize) -> VortexResult { - check_slice_bounds(array, start, stop)?; - - array.with_dyn(|c| { - c.slice().map(|t| t.slice(start, stop)).unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "slice", - array.encoding().id().name() - )) - }) - }) -} - -fn check_slice_bounds(array: &Array, start: usize, stop: usize) -> VortexResult<()> { - if start > array.len() { - vortex_bail!(OutOfBounds: start, 0, array.len()); - } - if stop > array.len() { - vortex_bail!(OutOfBounds: stop, 0, array.len()); - } - Ok(()) -} diff --git a/vortex-array2/src/compute/take.rs b/vortex-array2/src/compute/take.rs deleted file mode 100644 index adc519870c..0000000000 --- a/vortex-array2/src/compute/take.rs +++ /dev/null @@ -1,24 +0,0 @@ -use log::info; -use vortex_error::{vortex_err, VortexResult}; - -use crate::{Array, IntoArray, OwnedArray}; - -pub trait TakeFn { - fn take(&self, indices: &Array) -> VortexResult; -} - -pub fn take(array: &Array, indices: &Array) -> VortexResult { - array.with_dyn(|a| { - if let Some(take) = a.take() { - return take.take(indices); - } - - // Otherwise, flatten and try again. - info!("TakeFn not implemented for {}, flattening", array); - array.clone().flatten()?.into_array().with_dyn(|a| { - a.take().map(|t| t.take(indices)).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) - }) - }) - }) -} diff --git a/vortex-array2/src/encoding.rs b/vortex-array2/src/encoding.rs deleted file mode 100644 index b217f08321..0000000000 --- a/vortex-array2/src/encoding.rs +++ /dev/null @@ -1,72 +0,0 @@ -use std::any::Any; -use std::fmt::{Debug, Formatter}; - -use linkme::distributed_slice; -pub use vortex::encoding::EncodingId; -use vortex_error::VortexResult; - -use crate::flatten::{ArrayFlatten, Flattened}; -use crate::ArrayDef; -use crate::{Array, ArrayTrait}; - -#[distributed_slice] -pub static VORTEX_ENCODINGS: [EncodingRef] = [..]; - -pub type EncodingRef = &'static dyn ArrayEncoding; - -pub fn find_encoding(id: &str) -> Option { - VORTEX_ENCODINGS - .iter() - .find(|&x| x.id().name() == id) - .cloned() -} - -/// Object-safe encoding trait for an array. -pub trait ArrayEncoding: 'static + Sync + Send { - fn as_any(&self) -> &dyn Any; - - fn id(&self) -> EncodingId; - - /// Flatten the given array. - fn flatten<'a>(&self, array: Array<'a>) -> VortexResult>; - - /// Unwrap the provided array into an implementation of ArrayTrait - fn with_dyn<'a>( - &self, - array: &'a Array<'a>, - f: &mut dyn for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> VortexResult<()>, - ) -> VortexResult<()>; -} - -/// Non-object-safe extensions to the ArrayEncoding trait. -pub trait ArrayEncodingExt { - type D: ArrayDef; - - fn flatten<'a>(array: Array<'a>) -> VortexResult> - where - ::D: 'a, - { - let typed = <::Array<'a> as TryFrom>::try_from(array)?; - ArrayFlatten::flatten(typed) - } - - fn with_dyn<'a, R, F>(array: &'a Array<'a>, mut f: F) -> R - where - F: for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> R, - ::D: 'a, - { - let typed = - <::Array<'a> as TryFrom>::try_from(array.clone()).unwrap(); - f(&typed) - } -} - -impl Debug for dyn ArrayEncoding + '_ { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - Debug::fmt(&self.id(), f) - } -} - -pub trait ArrayEncodingRef { - fn encoding(&self) -> EncodingRef; -} diff --git a/vortex-array2/src/implementation.rs b/vortex-array2/src/implementation.rs deleted file mode 100644 index 68f1362643..0000000000 --- a/vortex-array2/src/implementation.rs +++ /dev/null @@ -1,102 +0,0 @@ -use vortex_error::VortexError; - -use crate::encoding::{ArrayEncoding, EncodingRef}; -use crate::encoding::{ArrayEncodingExt, EncodingId}; -use crate::{Array, ArrayMetadata}; -use crate::{ArrayTrait, TryDeserializeArrayMetadata}; - -/// Trait the defines the set of types relating to an array. -/// Because it has associated types it can't be used as a trait object. -pub trait ArrayDef { - const ID: EncodingId; - const ENCODING: EncodingRef; - - type Array<'a>: ArrayTrait + TryFrom, Error = VortexError> + 'a; - type Metadata: ArrayMetadata + Clone + for<'m> TryDeserializeArrayMetadata<'m>; - type Encoding: ArrayEncoding + ArrayEncodingExt; -} - -#[macro_export] -macro_rules! impl_encoding { - ($id:literal, $Name:ident) => { - use paste::paste; - - paste! { - use $crate::{ - Array, - ArrayDef, - ArrayMetadata, - ArrayTrait, - Flattened, - TypedArray, - }; - use $crate::encoding::{ - ArrayEncoding, - ArrayEncodingExt, - EncodingId, - EncodingRef, - VORTEX_ENCODINGS, - }; - use std::any::Any; - use std::fmt::Debug; - use std::sync::Arc; - use std::marker::{Send, Sync}; - - /// The array definition trait - #[derive(Debug)] - pub struct $Name; - impl ArrayDef for $Name { - const ID: EncodingId = EncodingId::new($id); - const ENCODING: EncodingRef = &[<$Name Encoding>]; - type Array<'a> = [<$Name Array>]<'a>; - type Metadata = [<$Name Metadata>]; - type Encoding = [<$Name Encoding>]; - } - - pub type [<$Name Array>]<'a> = TypedArray<'a, $Name>; - pub type [] = TypedArray<'static, $Name>; - - /// The array encoding - pub struct [<$Name Encoding>]; - #[$crate::linkme::distributed_slice(VORTEX_ENCODINGS)] - #[allow(non_upper_case_globals)] - static []: EncodingRef = &[<$Name Encoding>]; - impl ArrayEncoding for [<$Name Encoding>] { - fn as_any(&self) -> &dyn Any { - self - } - - fn id(&self) -> EncodingId { - $Name::ID - } - - fn flatten<'a>(&self, array: Array<'a>) -> VortexResult> { - ::flatten(array) - } - - #[inline] - fn with_dyn<'a>( - &self, - array: &'a Array<'a>, - f: &mut dyn for<'b> FnMut(&'b (dyn ArrayTrait + 'a)) -> VortexResult<()>, - ) -> VortexResult<()> { - ::with_dyn(array, f) - } - } - impl ArrayEncodingExt for [<$Name Encoding>] { - type D = $Name; - } - - /// Implement ArrayMetadata - impl ArrayMetadata for [<$Name Metadata>] { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_arc(self: Arc) -> Arc { - self - } - } - } - }; -} diff --git a/vortex-array2/src/lib.rs b/vortex-array2/src/lib.rs deleted file mode 100644 index c18907a8a3..0000000000 --- a/vortex-array2/src/lib.rs +++ /dev/null @@ -1,214 +0,0 @@ -mod accessor; -pub mod array; -mod arrow; -pub mod buffer; -pub mod compute; -mod context; -mod data; -pub mod encoding; -mod flatten; -mod implementation; -mod metadata; -mod stats; -mod tree; -mod typed; -pub mod validity; -mod view; -mod visitor; - -use std::fmt::{Debug, Display, Formatter}; - -pub use context::*; -pub use data::*; -pub use flatten::*; -pub use implementation::*; -pub use linkme; -pub use metadata::*; -pub use typed::*; -pub use view::*; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::buffer::Buffer; -use crate::compute::ArrayCompute; -use crate::encoding::{ArrayEncodingRef, EncodingRef}; -use crate::stats::{ArrayStatistics, ArrayStatisticsCompute}; -use crate::validity::ArrayValidity; -use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; - -#[derive(Debug, Clone)] -pub enum Array<'v> { - Data(ArrayData), - DataRef(&'v ArrayData), - View(ArrayView<'v>), -} - -pub type OwnedArray = Array<'static>; - -impl Array<'_> { - pub fn encoding(&self) -> EncodingRef { - match self { - Array::Data(d) => d.encoding(), - Array::DataRef(d) => d.encoding(), - Array::View(v) => v.encoding(), - } - } - - pub fn dtype(&self) -> &DType { - match self { - Array::Data(d) => d.dtype(), - Array::DataRef(d) => d.dtype(), - Array::View(v) => v.dtype(), - } - } - - pub fn len(&self) -> usize { - self.with_dyn(|a| a.len()) - } - - pub fn is_empty(&self) -> bool { - self.with_dyn(|a| a.is_empty()) - } - - pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option> { - match self { - Array::Data(d) => d.child(idx, dtype).map(Array::DataRef), - Array::DataRef(d) => d.child(idx, dtype).map(Array::DataRef), - Array::View(v) => v.child(idx, dtype).map(Array::View), - } - } - - pub fn buffer(&self, idx: usize) -> Option<&Buffer> { - match self { - Array::Data(d) => d.buffers().get(idx), - Array::DataRef(d) => d.buffers().get(idx), - Array::View(v) => v.buffers().get(idx), - } - } -} - -impl ToStatic for Array<'_> { - type Static = OwnedArray; - - fn to_static(&self) -> Self::Static { - Array::Data(self.to_array_data()) - } -} - -pub trait ToArray { - fn to_array(&self) -> Array; -} - -pub trait IntoArray<'a> { - fn into_array(self) -> Array<'a>; -} - -pub trait ToArrayData { - fn to_array_data(&self) -> ArrayData; -} - -pub trait IntoArrayData { - fn into_array_data(self) -> ArrayData; -} - -pub trait ToStatic { - type Static; - - fn to_static(&self) -> Self::Static; -} - -/// Collects together the behaviour of an array. -pub trait ArrayTrait: - ArrayEncodingRef - + ArrayCompute - + ArrayDType - + ArrayFlatten - + ArrayValidity - + AcceptArrayVisitor - + ArrayStatistics - + ArrayStatisticsCompute - + ToArrayData -{ - fn len(&self) -> usize; - - fn is_empty(&self) -> bool { - // TODO(ngates): remove this default impl to encourage explicit implementation - self.len() == 0 - } - - fn nbytes(&self) -> usize { - let mut visitor = NBytesVisitor(0); - self.accept(&mut visitor).unwrap(); - visitor.0 - } -} - -pub trait ArrayDType { - fn dtype(&self) -> &DType; -} - -struct NBytesVisitor(usize); -impl ArrayVisitor for NBytesVisitor { - fn visit_child(&mut self, _name: &str, array: &Array) -> VortexResult<()> { - self.0 += array.with_dyn(|a| a.nbytes()); - Ok(()) - } - - fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { - self.0 += buffer.len(); - Ok(()) - } -} - -impl<'a> Array<'a> { - pub fn with_dyn(&'a self, mut f: F) -> R - where - F: FnMut(&dyn ArrayTrait) -> R, - { - let mut result = None; - - self.encoding() - .with_dyn(self, &mut |array| { - result = Some(f(array)); - Ok(()) - }) - .unwrap(); - - // Now we unwrap the optional, which we know to be populated by the closure. - result.unwrap() - } -} - -impl Display for Array<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let prefix = match self { - Array::Data(_) => "", - Array::DataRef(_) => "&", - Array::View(_) => "$", - }; - write!( - f, - "{}{}({}, len={})", - prefix, - self.encoding().id(), - self.dtype(), - self.len() - ) - } -} - -impl IntoArrayData for Array<'_> { - fn into_array_data(self) -> ArrayData { - match self { - Array::Data(d) => d, - Array::DataRef(d) => d.clone(), - Array::View(_) => self.with_dyn(|a| a.to_array_data()), - } - } -} - -impl ToArrayData for Array<'_> { - fn to_array_data(&self) -> ArrayData { - self.clone().into_array_data() - } -} diff --git a/vortex-array2/src/stats.rs b/vortex-array2/src/stats.rs deleted file mode 100644 index c0fb6dcaf2..0000000000 --- a/vortex-array2/src/stats.rs +++ /dev/null @@ -1,62 +0,0 @@ -use std::collections::HashMap; - -use vortex::scalar::Scalar; -use vortex_error::VortexResult; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum Stat { - BitWidthFreq, - TrailingZeroFreq, - IsConstant, - IsSorted, - IsStrictSorted, - Max, - Min, - RunCount, - TrueCount, - NullCount, -} - -pub trait ArrayStatistics { - fn statistics(&self) -> &(dyn Statistics + '_) { - &EmptyStatistics - } -} - -pub trait ArrayStatisticsCompute { - /// Compute the requested statistic. Can return additional stats. - fn compute_statistics(&self, _stat: Stat) -> VortexResult> { - Ok(HashMap::new()) - } -} - -pub trait Statistics { - fn compute(&self, stat: Stat) -> Option; - fn get(&self, stat: Stat) -> Option; - fn set(&self, stat: Stat, value: Scalar); - fn to_map(&self) -> HashMap; -} - -impl dyn Statistics + '_ { - pub fn compute_as>(&self, stat: Stat) -> Option { - self.compute(stat).and_then(|s| T::try_from(s).ok()) - } - - pub fn get_as>(&self, stat: Stat) -> Option { - self.get(stat).and_then(|s| T::try_from(s).ok()) - } -} - -pub struct EmptyStatistics; -impl Statistics for EmptyStatistics { - fn compute(&self, _stat: Stat) -> Option { - None - } - fn get(&self, _stat: Stat) -> Option { - None - } - fn set(&self, _stat: Stat, _value: Scalar) {} - fn to_map(&self) -> HashMap { - HashMap::default() - } -} diff --git a/vortex-array2/src/typed.rs b/vortex-array2/src/typed.rs deleted file mode 100644 index 46dc133160..0000000000 --- a/vortex-array2/src/typed.rs +++ /dev/null @@ -1,203 +0,0 @@ -use std::collections::HashMap; -use std::sync::Arc; - -use vortex::scalar::Scalar; -use vortex_error::{vortex_err, VortexError, VortexResult}; -use vortex_schema::DType; - -use crate::buffer::{Buffer, OwnedBuffer}; -use crate::encoding::{ArrayEncodingRef, EncodingRef}; -use crate::stats::{ArrayStatistics, Stat, Statistics}; -use crate::visitor::ArrayVisitor; -use crate::{ - Array, ArrayDType, ArrayData, ArrayDef, IntoArray, IntoArrayData, ToArray, ToArrayData, - ToStatic, TryDeserializeArrayMetadata, -}; - -#[derive(Debug)] -pub struct TypedArray<'a, D: ArrayDef> { - array: Array<'a>, - metadata: D::Metadata, -} - -impl TypedArray<'_, D> { - pub fn try_from_parts( - dtype: DType, - metadata: D::Metadata, - buffers: Arc<[OwnedBuffer]>, - children: Arc<[ArrayData]>, - stats: HashMap, - ) -> VortexResult { - let array = Array::Data(ArrayData::try_new( - D::ENCODING, - dtype, - Arc::new(metadata.clone()), - buffers, - children, - stats, - )?); - Ok(Self { array, metadata }) - } - - pub fn len(&self) -> usize { - self.array.with_dyn(|a| a.len()) - } - - pub fn is_empty(&self) -> bool { - self.array.with_dyn(|a| a.is_empty()) - } - - pub fn dtype(&self) -> &DType { - self.array.dtype() - } - - pub fn metadata(&self) -> &D::Metadata { - &self.metadata - } -} - -impl<'a, 'b, D: ArrayDef> TypedArray<'b, D> { - pub fn array(&'a self) -> &'a Array<'b> { - &self.array - } -} - -impl Clone for TypedArray<'_, D> { - fn clone(&self) -> Self { - Self { - array: self.array.clone(), - metadata: self.metadata.clone(), - } - } -} - -impl<'a, D: ArrayDef> TryFrom> for TypedArray<'a, D> { - type Error = VortexError; - - fn try_from(array: Array<'a>) -> Result { - if array.encoding().id() != D::ENCODING.id() { - return Err(vortex_err!("incorrect encoding")); - } - let metadata = match &array { - Array::Data(d) => d - .metadata() - .as_any() - .downcast_ref::() - .unwrap() - .clone(), - Array::DataRef(d) => d - .metadata() - .as_any() - .downcast_ref::() - .unwrap() - .clone(), - Array::View(v) => D::Metadata::try_deserialize_metadata(v.metadata())?, - }; - Ok(TypedArray { array, metadata }) - } -} - -impl<'a, D: ArrayDef> TryFrom<&'a Array<'a>> for TypedArray<'a, D> { - type Error = VortexError; - - fn try_from(value: &'a Array<'a>) -> Result { - value.clone().try_into() - } -} - -impl ArrayDType for TypedArray<'_, D> { - fn dtype(&self) -> &DType { - self.array().dtype() - } -} - -impl ArrayEncodingRef for TypedArray<'_, D> { - fn encoding(&self) -> EncodingRef { - self.array().encoding() - } -} - -impl ArrayStatistics for TypedArray<'_, D> { - fn statistics(&self) -> &(dyn Statistics + '_) { - match self.array() { - Array::Data(d) => d.statistics(), - Array::DataRef(d) => d.statistics(), - Array::View(v) => v.statistics(), - } - } -} - -impl ToStatic for TypedArray<'_, D> { - type Static = TypedArray<'static, D>; - - fn to_static(&self) -> Self::Static { - TypedArray { - array: Array::Data(self.to_array_data()), - metadata: self.metadata.clone(), - } - } -} - -impl<'a, D: ArrayDef> AsRef> for TypedArray<'a, D> { - fn as_ref(&self) -> &Array<'a> { - &self.array - } -} - -impl ToArray for TypedArray<'_, D> { - fn to_array(&self) -> Array { - self.array.clone() - } -} - -impl<'a, D: ArrayDef> IntoArray<'a> for TypedArray<'a, D> { - fn into_array(self) -> Array<'a> { - self.array - } -} - -impl IntoArrayData for TypedArray<'_, D> { - fn into_array_data(self) -> ArrayData { - match self.array { - Array::Data(d) => d, - Array::DataRef(d) => d.clone(), - Array::View(_) => { - struct Visitor { - buffers: Vec, - children: Vec, - } - impl ArrayVisitor for Visitor { - fn visit_child(&mut self, _name: &str, array: &Array) -> VortexResult<()> { - self.children.push(array.to_array_data()); - Ok(()) - } - - fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { - self.buffers.push(buffer.to_static()); - Ok(()) - } - } - let mut visitor = Visitor { - buffers: vec![], - children: vec![], - }; - self.array().with_dyn(|a| a.accept(&mut visitor).unwrap()); - ArrayData::try_new( - self.encoding(), - self.array().dtype().clone(), - Arc::new(self.metadata().clone()), - visitor.buffers.into(), - visitor.children.into(), - self.statistics().to_map(), - ) - .unwrap() - } - } - } -} - -impl ToArrayData for TypedArray<'_, D> { - fn to_array_data(&self) -> ArrayData { - self.clone().into_array_data() - } -} diff --git a/vortex-array2/src/view.rs b/vortex-array2/src/view.rs deleted file mode 100644 index 44fa3af2a2..0000000000 --- a/vortex-array2/src/view.rs +++ /dev/null @@ -1,154 +0,0 @@ -use std::fmt::{Debug, Formatter}; - -use vortex::flatbuffers::array as fb; -use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult}; -use vortex_schema::DType; - -use crate::buffer::Buffer; -use crate::encoding::EncodingRef; -use crate::stats::{EmptyStatistics, Statistics}; -use crate::SerdeContext; -use crate::{Array, IntoArray, ToArray}; - -#[derive(Clone)] -pub struct ArrayView<'v> { - encoding: EncodingRef, - dtype: &'v DType, - array: fb::Array<'v>, - buffers: &'v [Buffer<'v>], - ctx: &'v SerdeContext, - // TODO(ngates): a store a Projection. A projected ArrayView contains the full fb::Array - // metadata, but only the buffers from the selected columns. Therefore we need to know - // which fb:Array children to skip when calculating how to slice into buffers. -} - -impl<'a> Debug for ArrayView<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("ArrayView") - .field("encoding", &self.encoding) - .field("dtype", self.dtype) - // .field("array", &self.array) - .field("buffers", &self.buffers) - .field("ctx", &self.ctx) - .finish() - } -} - -impl<'v> ArrayView<'v> { - pub fn try_new( - ctx: &'v SerdeContext, - dtype: &'v DType, - array: fb::Array<'v>, - buffers: &'v [Buffer], - ) -> VortexResult { - let encoding = ctx - .find_encoding(array.encoding()) - .ok_or_else(|| vortex_err!(InvalidSerde: "Encoding ID out of bounds"))?; - - if buffers.len() != Self::cumulative_nbuffers(array) { - vortex_bail!(InvalidSerde: - "Incorrect number of buffers {}, expected {}", - buffers.len(), - Self::cumulative_nbuffers(array) - ) - } - - let view = Self { - encoding, - dtype, - array, - buffers, - ctx, - }; - - // Validate here that the metadata correctly parses, so that an encoding can infallibly - // implement Encoding::with_view(). - // FIXME(ngates): validate the metadata - view.to_array().with_dyn(|_| Ok::<(), VortexError>(()))?; - - Ok(view) - } - - pub fn encoding(&self) -> EncodingRef { - self.encoding - } - - pub fn dtype(&self) -> &DType { - self.dtype - } - - pub fn metadata(&self) -> Option<&'v [u8]> { - self.array.metadata().map(|m| m.bytes()) - } - - // TODO(ngates): should we separate self and DType lifetimes? Should DType be cloned? - pub fn child(&'v self, idx: usize, dtype: &'v DType) -> Option> { - let child = self.array_child(idx)?; - - // Figure out how many buffers to skip... - // We store them depth-first. - let buffer_offset = self - .array - .children()? - .iter() - .take(idx) - .map(|child| Self::cumulative_nbuffers(child)) - .sum(); - let buffer_count = Self::cumulative_nbuffers(child); - - Some( - Self::try_new( - self.ctx, - dtype, - child, - &self.buffers[buffer_offset..][0..buffer_count], - ) - .unwrap(), - ) - } - - fn array_child(&self, idx: usize) -> Option> { - let children = self.array.children()?; - if idx < children.len() { - Some(children.get(idx)) - } else { - None - } - } - - /// The number of buffers used by the current Array. - pub fn nbuffers(&self) -> usize { - self.array.nbuffers() as usize - } - - /// The number of buffers used by the current Array and all its children. - fn cumulative_nbuffers(array: fb::Array) -> usize { - let mut nbuffers = array.nbuffers() as usize; - for child in array.children().unwrap_or_default() { - nbuffers += Self::cumulative_nbuffers(child) - } - nbuffers - } - - pub fn buffers(&self) -> &'v [Buffer] { - // This is only true for the immediate current node? - self.buffers[0..self.nbuffers()].as_ref() - } - - pub fn statistics(&self) -> &dyn Statistics { - // TODO(ngates): store statistics in FlatBuffers - &EmptyStatistics - } -} - -impl ToArray for ArrayView<'_> { - fn to_array(&self) -> Array { - Array::View(self.clone()) - } -} - -impl<'v> IntoArray<'v> for ArrayView<'v> { - fn into_array(self) -> Array<'v> { - Array::View(self) - } -} diff --git a/vortex-datetime/Cargo.toml b/vortex-datetime-parts/Cargo.toml similarity index 77% rename from vortex-datetime/Cargo.toml rename to vortex-datetime-parts/Cargo.toml index 7fbad3a160..699a4f77e0 100644 --- a/vortex-datetime/Cargo.toml +++ b/vortex-datetime-parts/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "vortex-datetime" +name = "vortex-datetime-parts" version = "0.1.0" edition = "2021" @@ -12,3 +12,4 @@ vortex-error = { path = "../vortex-error" } vortex-schema = { "path" = "../vortex-schema" } linkme = { workspace = true } log = { workspace = true } +serde = { workspace = true, features = ["derive"] } diff --git a/vortex-datetime-parts/src/array.rs b/vortex-datetime-parts/src/array.rs new file mode 100644 index 0000000000..8042eea392 --- /dev/null +++ b/vortex-datetime-parts/src/array.rs @@ -0,0 +1,128 @@ +use serde::{Deserialize, Serialize}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, ToArrayData}; +use vortex_error::{vortex_bail, VortexResult}; + +impl_encoding!("vortex.datetimeparts", DateTimeParts); + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct DateTimePartsMetadata { + days_dtype: DType, + seconds_dtype: DType, + subseconds_dtype: DType, + validity: ValidityMetadata, +} + +impl DateTimePartsArray<'_> { + pub fn try_new( + dtype: DType, + days: Array, + seconds: Array, + subsecond: Array, + validity: Validity, + ) -> VortexResult { + if !matches!(days.dtype(), DType::Int(_, _, _)) { + vortex_bail!(MismatchedTypes: "any integer", days.dtype()); + } + if !matches!(seconds.dtype(), DType::Int(_, _, _)) { + vortex_bail!(MismatchedTypes: "any integer", seconds.dtype()); + } + if !matches!(subsecond.dtype(), DType::Int(_, _, _)) { + vortex_bail!(MismatchedTypes: "any integer", subsecond.dtype()); + } + + let length = days.len(); + if length != seconds.len() || length != subsecond.len() { + vortex_bail!( + "Mismatched lengths {} {} {}", + days.len(), + seconds.len(), + subsecond.len() + ); + } + + let mut children = Vec::with_capacity(4); + children.extend([ + days.to_array_data(), + seconds.to_array_data(), + subsecond.to_array_data(), + ]); + let validity_metadata = validity.to_metadata(length)?; + if let Some(validity) = validity.into_array_data() { + children.push(validity); + } + + Self::try_from_parts( + dtype, + DateTimePartsMetadata { + days_dtype: days.dtype().clone(), + seconds_dtype: seconds.dtype().clone(), + subseconds_dtype: subsecond.dtype().clone(), + validity: validity_metadata, + }, + children.into(), + HashMap::new(), + ) + } + + pub fn days(&self) -> Array { + self.array() + .child(0, &self.metadata().days_dtype) + .expect("Missing days array") + } + + pub fn seconds(&self) -> Array { + self.array() + .child(1, &self.metadata().seconds_dtype) + .expect("Missing seconds array") + } + + pub fn subsecond(&self) -> Array { + self.array() + .child(2, &self.metadata().subseconds_dtype) + .expect("Missing subsecond array") + } + + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(3, &Validity::DTYPE)) + } +} + +impl ArrayFlatten for DateTimePartsArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + todo!() + } +} + +impl ArrayValidity for DateTimePartsArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) + } + + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) + } +} + +impl AcceptArrayVisitor for DateTimePartsArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("days", &self.days())?; + visitor.visit_child("seconds", &self.seconds())?; + visitor.visit_child("subsecond", &self.subsecond()) + } +} + +impl ArrayStatisticsCompute for DateTimePartsArray<'_> {} + +impl ArrayTrait for DateTimePartsArray<'_> { + fn len(&self) -> usize { + self.days().len() + } +} diff --git a/vortex-datetime-parts/src/compress.rs b/vortex-datetime-parts/src/compress.rs new file mode 100644 index 0000000000..c949791a1a --- /dev/null +++ b/vortex-datetime-parts/src/compress.rs @@ -0,0 +1,92 @@ +use vortex::array::composite::{Composite, CompositeArray}; +use vortex::array::datetime::{LocalDateTimeArray, LocalDateTimeExtension, TimeUnit}; +use vortex::array::primitive::PrimitiveArray; +use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::compute::cast::cast; +use vortex::ptype::PType; +use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray}; +use vortex_error::VortexResult; + +use crate::{DateTimePartsArray, DateTimePartsEncoding}; + +impl EncodingCompression for DateTimePartsEncoding { + fn can_compress( + &self, + array: &Array, + _config: &CompressConfig, + ) -> Option<&dyn EncodingCompression> { + if array.encoding().id() != Composite::ID { + return None; + } + + let composite = CompositeArray::try_from(array).unwrap(); + if !matches!(composite.id(), LocalDateTimeExtension::ID) { + return None; + } + + Some(self) + } + + fn compress( + &self, + array: &Array, + like: Option<&Array>, + ctx: CompressCtx, + ) -> VortexResult { + let array = CompositeArray::try_from(array)?; + match array.id() { + LocalDateTimeExtension::ID => compress_localdatetime( + array + .as_typed() + .expect("Can only compress LocalDateTimeArray"), + like.map(|l| DateTimePartsArray::try_from(l).unwrap()), + ctx, + ), + _ => panic!("Unsupported composite ID {}", array.id()), + } + } +} + +fn compress_localdatetime( + array: LocalDateTimeArray, + like: Option, + ctx: CompressCtx, +) -> VortexResult { + let underlying = cast(array.underlying(), PType::I64.into())?.flatten_primitive()?; + + let divisor = match array.underlying_metadata().time_unit() { + TimeUnit::Ns => 1_000_000_000, + TimeUnit::Us => 1_000_000, + TimeUnit::Ms => 1_000, + TimeUnit::S => 1, + }; + + let length = underlying.len(); + let mut days = Vec::with_capacity(length); + let mut seconds = Vec::with_capacity(length); + let mut subsecond = Vec::with_capacity(length); + + for &t in underlying.typed_data::().iter() { + days.push(t / (86_400 * divisor)); + seconds.push((t % (86_400 * divisor)) / divisor); + subsecond.push((t % (86_400 * divisor)) % divisor); + } + + Ok(DateTimePartsArray::try_new( + LocalDateTimeExtension::dtype(underlying.dtype().nullability()), + ctx.named("days").compress( + &PrimitiveArray::from(days).into_array(), + like.as_ref().map(|l| l.days()).as_ref(), + )?, + ctx.named("seconds").compress( + &PrimitiveArray::from(seconds).into_array(), + like.as_ref().map(|l| l.seconds()).as_ref(), + )?, + ctx.named("subsecond").compress( + &PrimitiveArray::from(subsecond).into_array(), + like.as_ref().map(|l| l.subsecond()).as_ref(), + )?, + ctx.compress_validity(underlying.validity())?, + )? + .into_array()) +} diff --git a/vortex-datetime-parts/src/compute.rs b/vortex-datetime-parts/src/compute.rs new file mode 100644 index 0000000000..8a4b1c9de6 --- /dev/null +++ b/vortex-datetime-parts/src/compute.rs @@ -0,0 +1,43 @@ +use vortex::compute::slice::{slice, SliceFn}; +use vortex::compute::take::{take, TakeFn}; +use vortex::compute::ArrayCompute; +use vortex::{Array, ArrayDType, IntoArray, OwnedArray}; +use vortex_error::VortexResult; + +use crate::DateTimePartsArray; + +impl ArrayCompute for DateTimePartsArray<'_> { + fn slice(&self) -> Option<&dyn SliceFn> { + Some(self) + } + + fn take(&self) -> Option<&dyn TakeFn> { + Some(self) + } +} + +impl TakeFn for DateTimePartsArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + Ok(DateTimePartsArray::try_new( + self.dtype().clone(), + take(&self.days(), indices)?, + take(&self.seconds(), indices)?, + take(&self.subsecond(), indices)?, + self.validity(), + )? + .into_array()) + } +} + +impl SliceFn for DateTimePartsArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + Ok(DateTimePartsArray::try_new( + self.dtype().clone(), + slice(&self.days(), start, stop)?, + slice(&self.seconds(), start, stop)?, + slice(&self.subsecond(), start, stop)?, + self.validity().slice(start, stop)?, + )? + .into_array()) + } +} diff --git a/vortex-datetime-parts/src/lib.rs b/vortex-datetime-parts/src/lib.rs new file mode 100644 index 0000000000..87b19dff5e --- /dev/null +++ b/vortex-datetime-parts/src/lib.rs @@ -0,0 +1,5 @@ +pub use array::*; + +mod array; +mod compress; +mod compute; diff --git a/vortex-datetime/src/compress.rs b/vortex-datetime/src/compress.rs deleted file mode 100644 index 895c2c960a..0000000000 --- a/vortex-datetime/src/compress.rs +++ /dev/null @@ -1,88 +0,0 @@ -use vortex::array::composite::CompositeEncoding; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef}; -use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::compute::cast::cast; -use vortex::compute::flatten::flatten_primitive; -use vortex::datetime::{LocalDateTime, LocalDateTimeArray, LocalDateTimeExtension, TimeUnit}; -use vortex::ptype::PType; -use vortex::validity::OwnedValidity; -use vortex_error::VortexResult; - -use crate::{DateTimeArray, DateTimeEncoding}; - -impl EncodingCompression for DateTimeEncoding { - fn can_compress( - &self, - array: &dyn Array, - _config: &CompressConfig, - ) -> Option<&dyn EncodingCompression> { - if array.encoding().id() != CompositeEncoding::ID { - return None; - } - - let composite = array.as_composite(); - if !matches!(composite.id(), LocalDateTimeExtension::ID) { - return None; - } - - Some(self) - } - - fn compress( - &self, - array: &dyn Array, - like: Option<&dyn Array>, - ctx: CompressCtx, - ) -> VortexResult { - let array = array.as_composite(); - match array.id() { - LocalDateTimeExtension::ID => compress_localdatetime( - array.as_typed::(), - like.map(|l| l.as_any().downcast_ref::().unwrap()), - ctx, - ), - _ => panic!("Unsupported composite ID {}", array.id()), - } - } -} - -fn compress_localdatetime( - array: LocalDateTimeArray, - like: Option<&DateTimeArray>, - ctx: CompressCtx, -) -> VortexResult { - let underlying = flatten_primitive(cast(array.underlying(), PType::I64.into())?.as_ref())?; - - let divisor = match array.metadata().time_unit() { - TimeUnit::Ns => 1_000_000_000, - TimeUnit::Us => 1_000_000, - TimeUnit::Ms => 1_000, - TimeUnit::S => 1, - }; - - let mut days = Vec::with_capacity(underlying.len()); - let mut seconds = Vec::with_capacity(underlying.len()); - let mut subsecond = Vec::with_capacity(underlying.len()); - - for &t in underlying.typed_data::().iter() { - days.push(t / (86_400 * divisor)); - seconds.push((t % (86_400 * divisor)) / divisor); - subsecond.push((t % (86_400 * divisor)) % divisor); - } - - Ok(DateTimeArray::new( - ctx.named("days") - .compress(&PrimitiveArray::from(days), like.map(|l| l.days()))?, - ctx.named("seconds") - .compress(&PrimitiveArray::from(seconds), like.map(|l| l.seconds()))?, - ctx.named("subsecond").compress( - &PrimitiveArray::from(subsecond), - like.map(|l| l.subsecond()), - )?, - ctx.compress_validity(underlying.validity())?, - LocalDateTimeExtension::dtype(underlying.validity().is_some().into()), - ) - .into_array()) -} diff --git a/vortex-datetime/src/compute.rs b/vortex-datetime/src/compute.rs deleted file mode 100644 index 443094196d..0000000000 --- a/vortex-datetime/src/compute.rs +++ /dev/null @@ -1,45 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::compute::slice::{slice, SliceFn}; -use vortex::compute::take::{take, TakeFn}; -use vortex::compute::ArrayCompute; -use vortex::validity::OwnedValidity; -use vortex::view::ToOwnedView; -use vortex_error::VortexResult; - -use crate::DateTimeArray; - -impl ArrayCompute for DateTimeArray { - fn slice(&self) -> Option<&dyn SliceFn> { - Some(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - Some(self) - } -} - -impl TakeFn for DateTimeArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - Ok(DateTimeArray::new( - take(self.days(), indices)?, - take(self.seconds(), indices)?, - take(self.subsecond(), indices)?, - self.validity().to_owned_view(), - self.dtype().clone(), - ) - .into_array()) - } -} - -impl SliceFn for DateTimeArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(DateTimeArray::new( - slice(self.days(), start, stop)?, - slice(self.seconds(), start, stop)?, - slice(self.subsecond(), start, stop)?, - self.validity().map(|v| v.slice(start, stop)).transpose()?, - self.dtype().clone(), - ) - .into_array()) - } -} diff --git a/vortex-datetime/src/datetime.rs b/vortex-datetime/src/datetime.rs deleted file mode 100644 index 03356b3748..0000000000 --- a/vortex-datetime/src/datetime.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsCompute, StatsSet}; -use vortex::validity::Validity; -use vortex::validity::{OwnedValidity, ValidityView}; -use vortex::view::AsView; -use vortex::{impl_array, ArrayWalker}; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; - -/// An array that decomposes a datetime into days, seconds, and nanoseconds. -#[derive(Debug, Clone)] -pub struct DateTimeArray { - days: ArrayRef, - seconds: ArrayRef, - subsecond: ArrayRef, - validity: Option, - dtype: DType, - stats: Arc>, -} - -impl DateTimeArray { - pub fn new( - days: ArrayRef, - seconds: ArrayRef, - subsecond: ArrayRef, - validity: Option, - dtype: DType, - ) -> Self { - Self::try_new(days, seconds, subsecond, validity, dtype).unwrap() - } - - pub fn try_new( - days: ArrayRef, - seconds: ArrayRef, - subsecond: ArrayRef, - validity: Option, - dtype: DType, - ) -> VortexResult { - if !matches!(days.dtype(), DType::Int(_, _, _)) { - vortex_bail!(MismatchedTypes: "any integer", days.dtype()); - } - if !matches!(seconds.dtype(), DType::Int(_, _, _)) { - vortex_bail!(MismatchedTypes: "any integer", seconds.dtype()); - } - if !matches!(subsecond.dtype(), DType::Int(_, _, _)) { - vortex_bail!(MismatchedTypes: "any integer", subsecond.dtype()); - } - - Ok(Self { - days, - seconds, - subsecond, - validity, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - - #[inline] - pub fn days(&self) -> &ArrayRef { - &self.days - } - - #[inline] - pub fn seconds(&self) -> &ArrayRef { - &self.seconds - } - - #[inline] - pub fn subsecond(&self) -> &ArrayRef { - &self.subsecond - } -} - -impl Array for DateTimeArray { - impl_array!(); - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn len(&self) -> usize { - self.days.len() - } - - fn is_empty(&self) -> bool { - self.days.is_empty() - } - - fn dtype(&self) -> &DType { - &self.dtype - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - fn encoding(&self) -> EncodingRef { - &DateTimeEncoding - } - - fn nbytes(&self) -> usize { - self.days().nbytes() + self.seconds().nbytes() + self.subsecond().nbytes() - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.days())?; - walker.visit_child(self.seconds())?; - walker.visit_child(self.subsecond()) - } -} - -impl OwnedValidity for DateTimeArray { - fn validity(&self) -> Option { - self.validity.as_view() - } -} - -impl StatsCompute for DateTimeArray {} - -impl ArrayDisplay for DateTimeArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("days", self.days())?; - f.child("seconds", self.seconds())?; - f.child("subsecond", self.subsecond()) - } -} - -#[derive(Debug)] -pub struct DateTimeEncoding; - -impl DateTimeEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.datetime"); -} - -impl Encoding for DateTimeEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } -} diff --git a/vortex-datetime/src/lib.rs b/vortex-datetime/src/lib.rs deleted file mode 100644 index 24a59ca0b6..0000000000 --- a/vortex-datetime/src/lib.rs +++ /dev/null @@ -1,11 +0,0 @@ -pub use datetime::*; -use linkme::distributed_slice; -use vortex::encoding::{EncodingRef, ENCODINGS}; - -mod compress; -mod compute; -mod datetime; -mod serde; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_DATETIME: EncodingRef = &DateTimeEncoding; diff --git a/vortex-datetime/src/serde.rs b/vortex-datetime/src/serde.rs deleted file mode 100644 index 137f366885..0000000000 --- a/vortex-datetime/src/serde.rs +++ /dev/null @@ -1,42 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex::validity::OwnedValidity; -use vortex_error::VortexResult; - -use crate::{DateTimeArray, DateTimeEncoding}; - -impl ArraySerde for DateTimeArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.dtype(self.days().dtype())?; - ctx.write(self.days())?; - ctx.dtype(self.seconds().dtype())?; - ctx.write(self.seconds())?; - ctx.dtype(self.subsecond().dtype())?; - ctx.write(self.subsecond())?; - ctx.write_validity(self.validity()) - } - - fn metadata(&self) -> VortexResult>> { - // FIXME(ngates): I think we need child dtypes? - Ok(None) - } -} - -impl EncodingSerde for DateTimeEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let days_dtype = ctx.dtype()?; - let days = ctx.with_schema(&days_dtype).read()?; - let seconds_dtype = ctx.dtype()?; - let seconds = ctx.with_schema(&seconds_dtype).read()?; - let subseconds_dtype = ctx.dtype()?; - let subsecs = ctx.with_schema(&subseconds_dtype).read()?; - Ok(DateTimeArray::new( - days, - seconds, - subsecs, - ctx.read_validity()?, - ctx.schema().clone(), - ) - .into_array()) - } -} diff --git a/vortex-dict/Cargo.toml b/vortex-dict/Cargo.toml index 30f91c022a..be3faf814f 100644 --- a/vortex-dict/Cargo.toml +++ b/vortex-dict/Cargo.toml @@ -18,6 +18,8 @@ hashbrown = { workspace = true } linkme = { workspace = true } log = { workspace = true } num-traits = { workspace = true } +paste = { workspace = true } +serde = { workspace = true } vortex-array = { path = "../vortex-array" } vortex-error = { path = "../vortex-error" } vortex-schema = { path = "../vortex-schema" } diff --git a/vortex-dict/benches/dict_compress.rs b/vortex-dict/benches/dict_compress.rs index 0d5ea2c199..872dc71bf4 100644 --- a/vortex-dict/benches/dict_compress.rs +++ b/vortex-dict/benches/dict_compress.rs @@ -4,11 +4,10 @@ use rand::prelude::SliceRandom; use rand::{thread_rng, Rng}; use vortex::array::primitive::PrimitiveArray; use vortex::array::varbin::VarBinArray; -use vortex::array::Array; -use vortex::match_each_native_ptype; +use vortex::{match_each_native_ptype, ArrayTrait}; use vortex_dict::dict_encode_typed_primitive; -fn gen_primitive_dict(len: usize, uniqueness: f64) -> PrimitiveArray { +fn gen_primitive_dict<'a>(len: usize, uniqueness: f64) -> PrimitiveArray<'a> { let mut rng = thread_rng(); let value_range = len as f64 * uniqueness; let range = Uniform::new(-(value_range / 2.0) as i32, (value_range / 2.0) as i32); @@ -17,7 +16,7 @@ fn gen_primitive_dict(len: usize, uniqueness: f64) -> PrimitiveArray { PrimitiveArray::from(data) } -fn gen_varbin_dict(len: usize, uniqueness: f64) -> VarBinArray { +fn gen_varbin_dict<'a>(len: usize, uniqueness: f64) -> VarBinArray<'a> { let mut rng = thread_rng(); let uniq_cnt = (len as f64 * uniqueness) as usize; let dict: Vec = (0..uniq_cnt) diff --git a/vortex-dict/src/compress.rs b/vortex-dict/src/compress.rs index 7f911b0722..dd124bcc04 100644 --- a/vortex-dict/src/compress.rs +++ b/vortex-dict/src/compress.rs @@ -4,38 +4,38 @@ use ahash::RandomState; use hashbrown::hash_map::{Entry, RawEntryMut}; use hashbrown::HashMap; use num_traits::AsPrimitive; -use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::varbin::{VarBinArray, VarBinEncoding}; -use vortex::array::{Array, ArrayKind, ArrayRef}; +use vortex::accessor::ArrayAccessor; +use vortex::array::primitive::{Primitive, PrimitiveArray}; +use vortex::array::varbin::{VarBin, VarBinArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::match_each_native_ptype; use vortex::ptype::NativePType; use vortex::scalar::AsBytes; -use vortex::stats::Stat; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::validity::Validity; +use vortex::{ + match_each_native_ptype, Array, ArrayDType, ArrayDef, IntoArray, OwnedArray, ToArray, +}; use vortex_error::VortexResult; use vortex_schema::DType; use crate::dict::{DictArray, DictEncoding}; -use crate::downcast::DowncastDict; impl EncodingCompression for DictEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // TODO(robert): Add support for VarBinView - if array.encoding().id() != PrimitiveEncoding::ID - && array.encoding().id() != VarBinEncoding::ID - { + if array.encoding().id() != Primitive::ID && array.encoding().id() != VarBin::ID { return None; }; // No point dictionary coding if the array is unique. // We don't have a unique stat yet, but strict-sorted implies unique. if array - .stats() - .get_or_compute_as(&Stat::IsStrictSorted) + .statistics() + .compute_as(Stat::IsStrictSorted) .unwrap_or(false) { return None; @@ -46,42 +46,49 @@ impl EncodingCompression for DictEncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { - let dict_like = like.map(|like_arr| like_arr.as_dict()); + ) -> VortexResult { + let dict_like = like.map(|like_arr| DictArray::try_from(like_arr).unwrap()); + let dict_like_ref = dict_like.as_ref(); - let (codes, dict) = match ArrayKind::from(array) { - ArrayKind::Primitive(p) => { + let (codes, dict) = match array.encoding().id() { + Primitive::ID => { + let p = PrimitiveArray::try_from(array)?; let (codes, dict) = match_each_native_ptype!(p.ptype(), |$P| { - dict_encode_typed_primitive::<$P>(p) + dict_encode_typed_primitive::<$P>(&p) }); ( - ctx.auxiliary("codes") - .excluding(&DictEncoding) - .compress(&codes, dict_like.map(|dict| dict.codes()))?, - ctx.named("values") - .excluding(&DictEncoding) - .compress(&dict, dict_like.map(|dict| dict.values()))?, + ctx.auxiliary("codes").excluding(&DictEncoding).compress( + &codes.to_array(), + dict_like_ref.map(|dict| dict.codes()).as_ref(), + )?, + ctx.named("values").excluding(&DictEncoding).compress( + &dict.to_array(), + dict_like_ref.map(|dict| dict.values()).as_ref(), + )?, ) } - ArrayKind::VarBin(vb) => { - let (codes, dict) = dict_encode_varbin(vb); + VarBin::ID => { + let vb = VarBinArray::try_from(array).unwrap(); + let (codes, dict) = dict_encode_varbin(&vb); ( - ctx.auxiliary("codes") - .excluding(&DictEncoding) - .compress(&codes, dict_like.map(|dict| dict.codes()))?, - ctx.named("values") - .excluding(&DictEncoding) - .compress(&dict, dict_like.map(|dict| dict.values()))?, + ctx.auxiliary("codes").excluding(&DictEncoding).compress( + &codes.to_array(), + dict_like_ref.map(|dict| dict.codes()).as_ref(), + )?, + ctx.named("values").excluding(&DictEncoding).compress( + &dict.to_array(), + dict_like_ref.map(|dict| dict.values()).as_ref(), + )?, ) } _ => unreachable!("This array kind should have been filtered out"), }; - Ok(DictArray::new(codes, dict).into_array()) + DictArray::try_new(codes, dict).map(|a| a.into_array()) } } @@ -104,9 +111,9 @@ impl Eq for Value {} /// Dictionary encode primitive array with given PType. /// Null values in the original array are encoded in the dictionary. -pub fn dict_encode_typed_primitive( - array: &PrimitiveArray, -) -> (PrimitiveArray, PrimitiveArray) { +pub fn dict_encode_typed_primitive<'a, T: NativePType>( + array: &PrimitiveArray<'a>, +) -> (PrimitiveArray<'a>, PrimitiveArray<'a>) { let mut lookup_dict: HashMap, u64> = HashMap::new(); let mut codes: Vec = Vec::new(); let mut values: Vec = Vec::new(); @@ -115,46 +122,47 @@ pub fn dict_encode_typed_primitive( values.push(T::zero()); } - for ov in array.iter() { - match ov { - None => codes.push(0), - Some(v) => { - let code = match lookup_dict.entry(Value(v)) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(vac) => { - let next_code = values.len() as u64; - vac.insert(next_code.as_()); - values.push(v); - next_code - } - }; - codes.push(code); + ArrayAccessor::::with_iterator(array, |iter| { + for ov in iter { + match ov { + None => codes.push(0), + Some(&v) => { + let code = match lookup_dict.entry(Value(v)) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(vac) => { + let next_code = values.len() as u64; + vac.insert(next_code.as_()); + values.push(v); + next_code + } + }; + codes.push(code); + } } } - } + }) + .unwrap(); let values_validity = if array.dtype().is_nullable() { - let mut validity = Vec::with_capacity(values.len()); - validity.push(false); - validity.extend(vec![true; values.len() - 1]); + let mut validity = vec![true; values.len()]; + validity[0] = false; - Some(validity.into()) + validity.into() } else { - None + Validity::NonNullable }; ( PrimitiveArray::from(codes), - PrimitiveArray::from_nullable(values, values_validity), + PrimitiveArray::from_vec(values, values_validity), ) } /// Dictionary encode varbin array. Specializes for primitive byte arrays to avoid double copying -pub fn dict_encode_varbin(array: &VarBinArray) -> (PrimitiveArray, VarBinArray) { +pub fn dict_encode_varbin<'a>(array: &'a VarBinArray) -> (PrimitiveArray<'a>, VarBinArray<'a>) { array - .iter_primitive() - .map(|prim_iter| dict_encode_typed_varbin(array.dtype().clone(), prim_iter)) - .unwrap_or_else(|_| dict_encode_typed_varbin(array.dtype().clone(), array.iter())) + .with_iterator(|iter| dict_encode_typed_varbin(array.dtype().clone(), iter)) + .unwrap() } fn lookup_bytes<'a, T: NativePType + AsPrimitive>( @@ -167,7 +175,10 @@ fn lookup_bytes<'a, T: NativePType + AsPrimitive>( &bytes[begin..end] } -fn dict_encode_typed_varbin(dtype: DType, values: I) -> (PrimitiveArray, VarBinArray) +fn dict_encode_typed_varbin<'a, I, U>( + dtype: DType, + values: I, +) -> (PrimitiveArray<'a>, VarBinArray<'a>) where I: Iterator>, U: AsRef<[u8]>, @@ -220,19 +231,20 @@ where validity.push(false); validity.extend(vec![true; offsets.len() - 2]); - Some(validity.into()) + validity.into() } else { - None + Validity::NonNullable }; ( PrimitiveArray::from(codes), - VarBinArray::new( + VarBinArray::try_new( PrimitiveArray::from(offsets).into_array(), PrimitiveArray::from(bytes).into_array(), dtype, values_validity, - ), + ) + .unwrap(), ) } @@ -240,6 +252,7 @@ where mod test { use std::str; + use vortex::accessor::ArrayAccessor; use vortex::array::primitive::PrimitiveArray; use vortex::array::varbin::VarBinArray; use vortex::compute::scalar_at::scalar_at; @@ -257,7 +270,7 @@ mod test { #[test] fn encode_primitive_nulls() { - let arr = PrimitiveArray::from_iter(vec![ + let arr = PrimitiveArray::from_nullable_vec(vec![ Some(1), Some(1), None, @@ -273,15 +286,15 @@ mod test { &[1, 1, 0, 2, 2, 0, 2, 0] ); assert_eq!( - scalar_at(&values, 0).unwrap(), + scalar_at(values.array(), 0).unwrap(), PrimitiveScalar::nullable::(None).into() ); assert_eq!( - scalar_at(&values, 1).unwrap(), + scalar_at(values.array(), 1).unwrap(), PrimitiveScalar::nullable(Some(1)).into() ); assert_eq!( - scalar_at(&values, 2).unwrap(), + scalar_at(values.array(), 2).unwrap(), PrimitiveScalar::nullable(Some(3)).into() ); } @@ -291,15 +304,16 @@ mod test { let arr = VarBinArray::from(vec!["hello", "world", "hello", "again", "world"]); let (codes, values) = dict_encode_varbin(&arr); assert_eq!(codes.buffer().typed_data::(), &[0, 1, 0, 2, 1]); - assert_eq!( - values - .iter_primitive() - .unwrap() - .flatten() - .map(|b| unsafe { str::from_utf8_unchecked(b) }) - .collect::>(), - vec!["hello", "world", "again"] - ); + values + .with_iterator(|iter| { + assert_eq!( + iter.flatten() + .map(|b| unsafe { str::from_utf8_unchecked(b) }) + .collect::>(), + vec!["hello", "world", "again"] + ); + }) + .unwrap(); } #[test] @@ -322,29 +336,31 @@ mod test { &[1, 0, 2, 1, 0, 3, 2, 0] ); assert_eq!(String::from_utf8(values.bytes_at(0).unwrap()).unwrap(), ""); - assert_eq!( - values - .iter_primitive() - .unwrap() - .map(|b| b.map(|bv| unsafe { str::from_utf8_unchecked(bv) })) - .collect::>(), - vec![None, Some("hello"), Some("world"), Some("again")] - ); + values + .with_iterator(|iter| { + assert_eq!( + iter.map(|b| b.map(|v| unsafe { str::from_utf8_unchecked(v) })) + .collect::>(), + vec![None, Some("hello"), Some("world"), Some("again")] + ); + }) + .unwrap(); } #[test] fn repeated_values() { let arr = VarBinArray::from(vec!["a", "a", "b", "b", "a", "b", "a", "b"]); let (codes, values) = dict_encode_varbin(&arr); - assert_eq!( - values - .iter_primitive() - .unwrap() - .flatten() - .map(|b| unsafe { str::from_utf8_unchecked(b) }) - .collect::>(), - vec!["a", "b"] - ); + values + .with_iterator(|iter| { + assert_eq!( + iter.flatten() + .map(|b| unsafe { str::from_utf8_unchecked(b) }) + .collect::>(), + vec!["a", "b"] + ); + }) + .unwrap(); assert_eq!(codes.typed_data::(), &[0u64, 0, 1, 1, 0, 1, 0, 1]); } } diff --git a/vortex-dict/src/compute.rs b/vortex-dict/src/compute.rs index ecea2ff376..24f8ef8e8a 100644 --- a/vortex-dict/src/compute.rs +++ b/vortex-dict/src/compute.rs @@ -1,19 +1,14 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{flatten, FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; use vortex::scalar::Scalar; +use vortex::{Array, IntoArray, OwnedArray}; use vortex_error::VortexResult; use crate::DictArray; -impl ArrayCompute for DictArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - +impl ArrayCompute for DictArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -27,54 +22,53 @@ impl ArrayCompute for DictArray { } } -impl FlattenFn for DictArray { - fn flatten(&self) -> VortexResult { - flatten(&take(self.values(), self.codes())?) - } -} - -impl ScalarAtFn for DictArray { +impl ScalarAtFn for DictArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - let dict_index: usize = scalar_at(self.codes(), index)?.try_into()?; - scalar_at(self.values(), dict_index) + let dict_index: usize = scalar_at(&self.codes(), index)?.try_into()?; + scalar_at(&self.values(), dict_index) } } -impl TakeFn for DictArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for DictArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { // Dict // codes: 0 0 1 // dict: a b c d e f g h - let codes = take(self.codes(), indices)?; - Ok(DictArray::new(codes, self.values().clone()).into_array()) + let codes = take(&self.codes(), indices)?; + DictArray::try_new(codes, self.values()).map(|a| a.into_array()) } } -impl SliceFn for DictArray { +impl SliceFn for DictArray<'_> { // TODO(robert): Add function to trim the dictionary - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(DictArray::new(slice(self.codes(), start, stop)?, self.values().clone()).into_array()) + fn slice(&self, start: usize, stop: usize) -> VortexResult { + DictArray::try_new(slice(&self.codes(), start, stop)?, self.values()) + .map(|a| a.into_array()) } } #[cfg(test)] mod test { - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::varbin::VarBinArray; - use vortex::array::Array; - use vortex::compute::flatten::{flatten_primitive, flatten_varbin}; + use vortex::{IntoArray, ToArray}; use vortex_schema::{DType, Nullability}; use crate::{dict_encode_typed_primitive, dict_encode_varbin, DictArray}; #[test] fn flatten_nullable_primitive() { - let reference = - PrimitiveArray::from_iter(vec![Some(42), Some(-9), None, Some(42), None, Some(-9)]); + let reference = PrimitiveArray::from_nullable_vec(vec![ + Some(42), + Some(-9), + None, + Some(42), + None, + Some(-9), + ]); let (codes, values) = dict_encode_typed_primitive::(&reference); - let dict = DictArray::new(codes.into_array(), values.into_array()); - let flattened_dict = flatten_primitive(&dict).unwrap(); + let dict = DictArray::try_new(codes.into_array(), values.into_array()).unwrap(); + let flattened_dict = dict.to_array().flatten_primitive().unwrap(); assert_eq!(flattened_dict.buffer(), reference.buffer()); } @@ -85,15 +79,19 @@ mod test { DType::Utf8(Nullability::Nullable), ); let (codes, values) = dict_encode_varbin(&reference); - let dict = DictArray::new(codes.into_array(), values.into_array()); - let flattened_dict = flatten_varbin(&dict).unwrap(); + let dict = DictArray::try_new(codes.into_array(), values.into_array()).unwrap(); + let flattened_dict = dict.to_array().flatten_varbin().unwrap(); assert_eq!( - flattened_dict.offsets().as_primitive().buffer(), - reference.offsets().as_primitive().buffer() + flattened_dict + .offsets() + .flatten_primitive() + .unwrap() + .buffer(), + reference.offsets().flatten_primitive().unwrap().buffer() ); assert_eq!( - flattened_dict.bytes().as_primitive().buffer(), - reference.bytes().as_primitive().buffer() + flattened_dict.bytes().flatten_primitive().unwrap().buffer(), + reference.bytes().flatten_primitive().unwrap().buffer() ); } } diff --git a/vortex-dict/src/dict.rs b/vortex-dict/src/dict.rs index 20bec0550e..127113dddc 100644 --- a/vortex-dict/src/dict.rs +++ b/vortex-dict/src/dict.rs @@ -1,131 +1,92 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; +use serde::{Deserialize, Serialize}; +use vortex::accessor::ArrayAccessor; +use vortex::array::bool::BoolArray; +use vortex::compute::scalar_at::scalar_at; +use vortex::compute::take::take; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::IntoArrayData; +use vortex::{impl_encoding, match_each_integer_ptype, ArrayDType, ArrayFlatten, ToArrayData}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::{DType, Signedness}; +use vortex_schema::Signedness; -#[derive(Debug, Clone)] -pub struct DictArray { - codes: ArrayRef, - values: ArrayRef, - stats: Arc>, -} +impl_encoding!("vortex.dict", Dict); -impl DictArray { - pub fn new(codes: ArrayRef, dict: ArrayRef) -> Self { - Self::try_new(codes, dict).unwrap() - } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DictMetadata { + codes_dtype: DType, +} - pub fn try_new(codes: ArrayRef, dict: ArrayRef) -> VortexResult { +impl DictArray<'_> { + pub fn try_new(codes: Array, values: Array) -> VortexResult { if !matches!(codes.dtype(), DType::Int(_, Signedness::Unsigned, _)) { vortex_bail!(MismatchedTypes: "unsigned int", codes.dtype()); } - Ok(Self { - codes, - values: dict, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) + Self::try_from_parts( + values.dtype().clone(), + DictMetadata { + codes_dtype: codes.dtype().clone(), + }, + vec![values.to_array_data(), codes.to_array_data()].into(), + HashMap::new(), + ) } #[inline] - pub fn values(&self) -> &ArrayRef { - &self.values + pub fn values(&self) -> Array { + self.array().child(0, self.dtype()).expect("Missing values") } #[inline] - pub fn codes(&self) -> &ArrayRef { - &self.codes + pub fn codes(&self) -> Array { + self.array() + .child(1, &self.metadata().codes_dtype) + .expect("Missing codes") } } -impl Array for DictArray { - impl_array!(); - - fn len(&self) -> usize { - self.codes.len() - } - - fn is_empty(&self) -> bool { - self.codes.is_empty() - } - - fn dtype(&self) -> &DType { - self.values.dtype() - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - fn encoding(&self) -> EncodingRef { - &DictEncoding - } - - fn nbytes(&self) -> usize { - self.codes().nbytes() + self.values().nbytes() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.values())?; - walker.visit_child(self.codes()) +impl ArrayFlatten for DictArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + take(&self.values(), &self.codes())?.flatten() } } -impl ArrayValidity for DictArray { - fn logical_validity(&self) -> Validity { - todo!() - } - - fn is_valid(&self, _index: usize) -> bool { - todo!() +impl ArrayValidity for DictArray<'_> { + fn is_valid(&self, index: usize) -> bool { + let values_index = scalar_at(&self.codes(), index).unwrap().try_into().unwrap(); + self.values().with_dyn(|a| a.is_valid(values_index)) + } + + fn logical_validity(&self) -> LogicalValidity { + if self.dtype().is_nullable() { + let primitive_codes = self.codes().flatten_primitive().unwrap(); + match_each_integer_ptype!(primitive_codes.ptype(), |$P| { + ArrayAccessor::<$P>::with_iterator(&primitive_codes, |iter| { + LogicalValidity::Array( + BoolArray::from(iter.flatten().map(|c| *c != 0).collect::>()) + .into_array_data(), + ) + }) + .unwrap() + }) + } else { + LogicalValidity::AllValid(self.len()) + } } } -impl ArrayDisplay for DictArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("values", self.values())?; - f.child("codes", self.codes()) +impl AcceptArrayVisitor for DictArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("values", &self.values())?; + visitor.visit_child("codes", &self.codes()) } } -#[derive(Debug)] -pub struct DictEncoding; - -impl DictEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.dict"); -} - -impl Encoding for DictEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for DictArray<'_> { + fn len(&self) -> usize { + self.codes().len() } } diff --git a/vortex-dict/src/downcast.rs b/vortex-dict/src/downcast.rs deleted file mode 100644 index ea45de1701..0000000000 --- a/vortex-dict/src/downcast.rs +++ /dev/null @@ -1,31 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::DictArray; - -mod private { - pub trait Sealed {} -} - -pub trait DowncastDict: private::Sealed { - fn maybe_dict(&self) -> Option<&DictArray>; - - fn as_dict(&self) -> &DictArray { - self.maybe_dict().unwrap() - } -} - -impl private::Sealed for dyn Array + '_ {} - -impl DowncastDict for dyn Array + '_ { - fn maybe_dict(&self) -> Option<&DictArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastDict for ArrayRef { - fn maybe_dict(&self) -> Option<&DictArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-dict/src/lib.rs b/vortex-dict/src/lib.rs index f615806a7c..d8c8ab3922 100644 --- a/vortex-dict/src/lib.rs +++ b/vortex-dict/src/lib.rs @@ -1,14 +1,7 @@ pub use compress::*; pub use dict::*; -use linkme::distributed_slice; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod compress; mod compute; mod dict; -mod downcast; -mod serde; mod stats; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_DICT: EncodingRef = &DictEncoding; diff --git a/vortex-dict/src/serde.rs b/vortex-dict/src/serde.rs deleted file mode 100644 index bcb7280a4c..0000000000 --- a/vortex-dict/src/serde.rs +++ /dev/null @@ -1,77 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{DictArray, DictEncoding}; - -impl ArraySerde for DictArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write(self.values())?; - // TODO(robert): Stop writing this - ctx.dtype(self.codes().dtype())?; - ctx.write(self.codes()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for DictEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let dict = ctx.read()?; - let codes_dtype = ctx.dtype()?; - let codes = ctx.with_schema(&codes_dtype).read()?; - Ok(DictArray::new(codes, dict).into_array()) - } -} - -#[cfg(test)] -mod test { - use vortex::array::downcast::DowncastArrayBuiltin; - use vortex::array::IntoArray; - use vortex::array::{Array, ArrayRef}; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - use crate::downcast::DowncastDict; - use crate::DictArray; - - fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } - - #[test] - fn roundtrip() { - let arr = DictArray::new( - vec![0u8, 0, 1, 2, 3].into_array(), - vec![-7i64, -13, 17, 23].into_array(), - ); - let read_arr = roundtrip_array(&arr).unwrap(); - - assert_eq!( - arr.codes().as_primitive().buffer().typed_data::(), - read_arr - .as_dict() - .codes() - .as_primitive() - .buffer() - .typed_data::() - ); - - assert_eq!( - arr.values().as_primitive().buffer().typed_data::(), - read_arr - .as_dict() - .values() - .as_primitive() - .buffer() - .typed_data::() - ); - } -} diff --git a/vortex-dict/src/stats.rs b/vortex-dict/src/stats.rs index 33bb493a07..7002c0d3f2 100644 --- a/vortex-dict/src/stats.rs +++ b/vortex-dict/src/stats.rs @@ -1,49 +1,50 @@ -use vortex::stats::{Stat, StatsCompute, StatsSet}; +use std::collections::HashMap; + +use vortex::scalar::Scalar; +use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat}; use vortex_error::VortexResult; use crate::dict::DictArray; -impl StatsCompute for DictArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - let mut stats = StatsSet::new(); +impl ArrayStatisticsCompute for DictArray<'_> { + fn compute_statistics(&self, _stat: Stat) -> VortexResult> { + let mut stats = HashMap::new(); - if let Some(rc) = self.codes().stats().get_or_compute(&Stat::RunCount) { - stats.set(Stat::RunCount, rc); + if let Some(rc) = self.codes().statistics().compute_as(Stat::RunCount) { + stats.insert(Stat::RunCount, rc); } - if let Some(min) = self.values().stats().get_or_compute(&Stat::Min) { - stats.set(Stat::Min, min); + if let Some(min) = self.values().statistics().compute_as(Stat::Min) { + stats.insert(Stat::Min, min); } - if let Some(max) = self.values().stats().get_or_compute(&Stat::Max) { - stats.set(Stat::Max, max); + if let Some(max) = self.values().statistics().compute_as(Stat::Max) { + stats.insert(Stat::Max, max); } - if let Some(is_constant) = self.codes().stats().get_or_compute(&Stat::IsConstant) { - stats.set(Stat::IsConstant, is_constant); + if let Some(is_constant) = self.codes().statistics().compute_as(Stat::IsConstant) { + stats.insert(Stat::IsConstant, is_constant); } - if let Some(null_count) = self.codes().stats().get_or_compute(&Stat::NullCount) { - stats.set(Stat::NullCount, null_count); + if let Some(null_count) = self.codes().statistics().compute_as(Stat::NullCount) { + stats.insert(Stat::NullCount, null_count); } // if dictionary is sorted if self .values() - .stats() - .get_or_compute_as::(&Stat::IsSorted) + .statistics() + .compute_as(Stat::IsSorted) .unwrap_or(false) { - if let Some(codes_are_sorted) = self - .codes() - .stats() - .get_or_compute_as::(&Stat::IsSorted) + if let Some(codes_are_sorted) = + self.codes().statistics().compute_as::(Stat::IsSorted) { - stats.set(Stat::IsSorted, codes_are_sorted.into()); + stats.insert(Stat::IsSorted, codes_are_sorted.into()); } if let Some(codes_are_strict_sorted) = self .codes() - .stats() - .get_or_compute_as::(&Stat::IsStrictSorted) + .statistics() + .compute_as::(Stat::IsStrictSorted) { - stats.set(Stat::IsStrictSorted, codes_are_strict_sorted.into()); + stats.insert(Stat::IsStrictSorted, codes_are_strict_sorted.into()); } } diff --git a/vortex-fastlanes/Cargo.toml b/vortex-fastlanes/Cargo.toml index 24a265a240..edac5af797 100644 --- a/vortex-fastlanes/Cargo.toml +++ b/vortex-fastlanes/Cargo.toml @@ -16,14 +16,16 @@ workspace = true [dependencies] arrayref = { workspace = true } +fastlanez = { path = "../fastlanez" } +itertools = { workspace = true } +linkme = { workspace = true } +log = { workspace = true } +num-traits = { workspace = true } +paste = { workspace = true } +serde = { workspace = true } vortex-array = { path = "../vortex-array" } vortex-error = { path = "../vortex-error" } vortex-schema = { path = "../vortex-schema" } -linkme = { workspace = true } -itertools = { workspace = true } -num-traits = { workspace = true } -fastlanez = { path = "../fastlanez" } -log = { workspace = true } [dev-dependencies] criterion = { workspace = true } diff --git a/vortex-fastlanes/benches/bitpacking_take.rs b/vortex-fastlanes/benches/bitpacking_take.rs index 670347de8b..609ba835f5 100644 --- a/vortex-fastlanes/benches/bitpacking_take.rs +++ b/vortex-fastlanes/benches/bitpacking_take.rs @@ -4,12 +4,12 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use itertools::Itertools; use rand::distributions::Uniform; use rand::{thread_rng, Rng}; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; +use vortex::array::sparse::SparseArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::compute::take::take; use vortex::encoding::EncodingRef; -use vortex_fastlanes::{BitPackedEncoding, DowncastFastlanes}; +use vortex_fastlanes::{BitPackedArray, BitPackedEncoding}; fn values(len: usize, bits: usize) -> Vec { let rng = thread_rng(); @@ -24,17 +24,17 @@ fn bench_take(c: &mut Criterion) { let values = values(1_000_000, 8); let uncompressed = PrimitiveArray::from(values.clone()); let packed = BitPackedEncoding {} - .compress(&uncompressed, None, ctx) + .compress(uncompressed.array(), None, ctx) .unwrap(); let stratified_indices: PrimitiveArray = (0..10).map(|i| i * 10_000).collect::>().into(); c.bench_function("take_10_stratified", |b| { - b.iter(|| black_box(take(&packed, &stratified_indices).unwrap())); + b.iter(|| black_box(take(&packed, stratified_indices.array()).unwrap())); }); let contiguous_indices: PrimitiveArray = (0..10).collect::>().into(); c.bench_function("take_10_contiguous", |b| { - b.iter(|| black_box(take(&packed, &contiguous_indices).unwrap())); + b.iter(|| black_box(take(&packed, contiguous_indices.array()).unwrap())); }); let rng = thread_rng(); @@ -46,12 +46,12 @@ fn bench_take(c: &mut Criterion) { .collect_vec() .into(); c.bench_function("take_10K_random", |b| { - b.iter(|| black_box(take(&packed, &random_indices).unwrap())); + b.iter(|| black_box(take(&packed, random_indices.array()).unwrap())); }); let contiguous_indices: PrimitiveArray = (0..10_000).collect::>().into(); c.bench_function("take_10K_contiguous", |b| { - b.iter(|| black_box(take(&packed, &contiguous_indices).unwrap())); + b.iter(|| black_box(take(&packed, contiguous_indices.array()).unwrap())); }); } @@ -65,23 +65,26 @@ fn bench_patched_take(c: &mut Criterion) { let uncompressed = PrimitiveArray::from(values.clone()); let packed = BitPackedEncoding {} - .compress(&uncompressed, None, ctx) + .compress(uncompressed.array(), None, ctx) .unwrap(); - let packed = packed.as_bitpacked(); + let packed = BitPackedArray::try_from(packed).unwrap(); assert!(packed.patches().is_some()); assert_eq!( - packed.patches().unwrap().as_sparse().values().len(), + SparseArray::try_from(packed.patches().unwrap()) + .unwrap() + .values() + .len(), num_exceptions as usize ); let stratified_indices: PrimitiveArray = (0..10).map(|i| i * 10_000).collect::>().into(); c.bench_function("patched_take_10_stratified", |b| { - b.iter(|| black_box(take(packed, &stratified_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), stratified_indices.array()).unwrap())); }); let contiguous_indices: PrimitiveArray = (0..10).collect::>().into(); c.bench_function("patched_take_10_contiguous", |b| { - b.iter(|| black_box(take(packed, &contiguous_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), contiguous_indices.array()).unwrap())); }); let rng = thread_rng(); @@ -93,7 +96,7 @@ fn bench_patched_take(c: &mut Criterion) { .collect_vec() .into(); c.bench_function("patched_take_10K_random", |b| { - b.iter(|| black_box(take(packed, &random_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), random_indices.array()).unwrap())); }); let not_patch_indices: PrimitiveArray = (0u32..num_exceptions) @@ -102,7 +105,7 @@ fn bench_patched_take(c: &mut Criterion) { .collect_vec() .into(); c.bench_function("patched_take_10K_contiguous_not_patches", |b| { - b.iter(|| black_box(take(packed, ¬_patch_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), not_patch_indices.array()).unwrap())); }); let patch_indices: PrimitiveArray = (big_base2..big_base2 + num_exceptions) @@ -111,7 +114,7 @@ fn bench_patched_take(c: &mut Criterion) { .collect_vec() .into(); c.bench_function("patched_take_10K_contiguous_patches", |b| { - b.iter(|| black_box(take(packed, &patch_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), patch_indices.array()).unwrap())); }); // There are currently 2 magic parameters of note: @@ -135,7 +138,7 @@ fn bench_patched_take(c: &mut Criterion) { .collect_vec() .into(); c.bench_function("patched_take_10K_adversarial", |b| { - b.iter(|| black_box(take(packed, &adversarial_indices).unwrap())); + b.iter(|| black_box(take(packed.array(), adversarial_indices.array()).unwrap())); }); } diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index 550040fb4a..b7e5ec7c72 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -1,23 +1,20 @@ use arrayref::array_ref; use fastlanez::TryBitPack; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::sparse::{SparseArray, SparseEncoding}; -use vortex::array::IntoArray; -use vortex::array::{Array, ArrayRef}; +use vortex::array::sparse::{Sparse, SparseArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::compute::cast::cast; -use vortex::compute::flatten::flatten_primitive; -use vortex::match_each_integer_ptype; use vortex::ptype::PType::U8; use vortex::ptype::{NativePType, PType}; use vortex::scalar::{ListScalarVec, Scalar}; -use vortex::stats::Stat; -use vortex::validity::OwnedValidity; -use vortex::view::ToOwnedView; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::validity::Validity; +use vortex::{ + match_each_integer_ptype, Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray, + ToStatic, +}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::downcast::DowncastFastlanes; use crate::{match_integers_by_width, BitPackedArray, BitPackedEncoding}; impl EncodingCompression for BitPackedEncoding { @@ -27,11 +24,11 @@ impl EncodingCompression for BitPackedEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive arrays - let parray = array.maybe_primitive()?; + let parray = PrimitiveArray::try_from(array).ok()?; // Only supports ints if !parray.ptype().is_int() { @@ -40,8 +37,8 @@ impl EncodingCompression for BitPackedEncoding { let bytes_per_exception = bytes_per_exception(parray.ptype()); let bit_width_freq = parray - .stats() - .get_or_compute_as::>(&Stat::BitWidthFreq)? + .statistics() + .compute_as::>(Stat::BitWidthFreq)? .0; let bit_width = best_bit_width(&bit_width_freq, bytes_per_exception); @@ -55,40 +52,40 @@ impl EncodingCompression for BitPackedEncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { + ) -> VortexResult { let parray = array.as_primitive(); let bit_width_freq = parray - .stats() - .get_or_compute_as::>(&Stat::BitWidthFreq) + .statistics() + .compute_as::>(Stat::BitWidthFreq) .unwrap() .0; - let like_bp = like.map(|l| l.as_bitpacked()); + let like_bp = like.map(|l| BitPackedArray::try_from(l).unwrap()); let bit_width = best_bit_width(&bit_width_freq, bytes_per_exception(parray.ptype())); let num_exceptions = count_exceptions(bit_width, &bit_width_freq); if bit_width == parray.ptype().bit_width() { // Nothing we can do - return Ok(parray.clone().into_array()); + return Ok(array.to_static()); } - let packed = bitpack(parray, bit_width)?; + let packed = bitpack(&parray, bit_width)?; let validity = ctx.compress_validity(parray.validity())?; let patches = if num_exceptions > 0 { Some(ctx.auxiliary("patches").compress( - &bitpack_patches(parray, bit_width, num_exceptions), - like_bp.and_then(|bp| bp.patches()), + &bitpack_patches(&parray, bit_width, num_exceptions), + like_bp.as_ref().and_then(|bp| bp.patches()).as_ref(), )?) } else { None }; - Ok(BitPackedArray::try_new( + BitPackedArray::try_new( packed, validity, patches, @@ -96,12 +93,11 @@ impl EncodingCompression for BitPackedEncoding { parray.dtype().clone(), parray.len(), ) - .unwrap() - .into_array()) + .map(|a| a.into_array()) } } -fn bitpack(parray: &PrimitiveArray, bit_width: usize) -> VortexResult { +fn bitpack(parray: &PrimitiveArray, bit_width: usize) -> VortexResult { // We know the min is > 0, so it's safe to re-interpret signed integers as unsigned. // TODO(ngates): we should implement this using a vortex cast to centralize this hack. let bytes = match_integers_by_width!(parray.ptype(), |$P| { @@ -144,7 +140,7 @@ fn bitpack_patches( parray: &PrimitiveArray, bit_width: usize, num_exceptions_hint: usize, -) -> ArrayRef { +) -> OwnedArray { match_each_integer_ptype!(parray.ptype(), |$T| { let mut indices: Vec = Vec::with_capacity(num_exceptions_hint); let mut values: Vec<$T> = Vec::with_capacity(num_exceptions_hint); @@ -154,21 +150,26 @@ fn bitpack_patches( values.push(*v); } } - SparseArray::new(indices.into_array(), values.into_array(), parray.len(), Scalar::null(&parray.dtype().as_nullable())).into_array() + SparseArray::try_new( + indices.into_array(), + PrimitiveArray::from_vec(values, Validity::AllValid).into_array(), + parray.len(), + Scalar::null(&parray.dtype().as_nullable()), + ).unwrap().into_array() }) } -pub fn unpack(array: &BitPackedArray) -> VortexResult { +pub fn unpack<'a>(array: BitPackedArray) -> VortexResult> { let bit_width = array.bit_width(); let length = array.len(); let offset = array.offset(); - let encoded = flatten_primitive(&cast(array.encoded(), U8.into())?)?; + let encoded = cast(&array.packed(), U8.into())?.flatten_primitive()?; let ptype: PType = array.dtype().try_into()?; let mut unpacked = match_integers_by_width!(ptype, |$P| { - PrimitiveArray::from_nullable( + PrimitiveArray::from_vec( unpack_primitive::<$P>(encoded.typed_data::(), bit_width, offset, length), - array.validity().to_owned_view(), + array.validity(), ) }); @@ -178,19 +179,23 @@ pub fn unpack(array: &BitPackedArray) -> VortexResult { } if let Some(patches) = array.patches() { - patch_unpacked(unpacked, patches) + patch_unpacked(unpacked, &patches) } else { Ok(unpacked) } } -fn patch_unpacked(array: PrimitiveArray, patches: &dyn Array) -> VortexResult { +fn patch_unpacked<'a>( + array: PrimitiveArray<'a>, + patches: &Array, +) -> VortexResult> { match patches.encoding().id() { - SparseEncoding::ID => { + Sparse::ID => { match_each_integer_ptype!(array.ptype(), |$T| { + let typed_patches = SparseArray::try_from(patches).unwrap(); array.patch( - &patches.as_sparse().resolved_indices(), - flatten_primitive(patches.as_sparse().values())?.typed_data::<$T>()) + &typed_patches.resolved_indices(), + typed_patches.values().flatten_primitive()?.typed_data::<$T>()) }) } _ => panic!("can't patch bitpacked array with {}", patches), @@ -259,7 +264,7 @@ pub fn unpack_primitive( pub(crate) fn unpack_single(array: &BitPackedArray, index: usize) -> VortexResult { let bit_width = array.bit_width(); - let encoded = flatten_primitive(&cast(array.encoded(), U8.into())?)?; + let encoded = cast(&array.packed(), U8.into())?.flatten_primitive()?; let ptype: PType = array.dtype().try_into()?; let index_in_encoded = index + array.offset(); @@ -331,7 +336,8 @@ fn count_exceptions(bit_width: usize, bit_width_freq: &[usize]) -> usize { mod test { use std::sync::Arc; - use vortex::encoding::{Encoding, EncodingRef}; + use vortex::encoding::{ArrayEncoding, EncodingRef}; + use vortex::ToArray; use super::*; @@ -350,12 +356,12 @@ mod test { let compressed = ctx .compress( - &PrimitiveArray::from(Vec::from_iter((0..10_000).map(|i| (i % 63) as u8))), + PrimitiveArray::from(Vec::from_iter((0..10_000).map(|i| (i % 63) as u8))).array(), None, ) .unwrap(); assert_eq!(compressed.encoding().id(), BitPackedEncoding.id()); - assert_eq!(compressed.as_bitpacked().bit_width(), 6); + assert_eq!(BitPackedArray::try_from(compressed).unwrap().bit_width(), 6); } #[test] @@ -371,9 +377,9 @@ mod test { let ctx = CompressCtx::new(Arc::new(cfg)); let values = PrimitiveArray::from(Vec::from_iter((0..n).map(|i| (i % 2047) as u16))); - let compressed = ctx.compress(&values, None).unwrap(); - let compressed = compressed.as_bitpacked(); - let decompressed = flatten_primitive(compressed).unwrap(); + let compressed = ctx.compress(values.array(), None).unwrap(); + let compressed = BitPackedArray::try_from(compressed).unwrap(); + let decompressed = compressed.to_array().flatten_primitive().unwrap(); assert_eq!(decompressed.typed_data::(), values.typed_data::()); values @@ -382,7 +388,7 @@ mod test { .enumerate() .for_each(|(i, v)| { let scalar_at: u16 = - if let Scalar::Primitive(pscalar) = unpack_single(compressed, i).unwrap() { + if let Scalar::Primitive(pscalar) = unpack_single(&compressed, i).unwrap() { pscalar.value().unwrap().try_into().unwrap() } else { panic!("expected u8 scalar") diff --git a/vortex-fastlanes/src/bitpacking/compute/mod.rs b/vortex-fastlanes/src/bitpacking/compute/mod.rs index c99b9c8176..cf26e0c7e7 100644 --- a/vortex-fastlanes/src/bitpacking/compute/mod.rs +++ b/vortex-fastlanes/src/bitpacking/compute/mod.rs @@ -1,33 +1,25 @@ -mod slice; - use std::cmp::min; use fastlanez::TryBitPack; use itertools::Itertools; use vortex::array::constant::ConstantArray; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{flatten_primitive, FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; -use vortex::match_each_integer_ptype; use vortex::ptype::NativePType; use vortex::scalar::Scalar; -use vortex::validity::OwnedValidity; +use vortex::{match_each_integer_ptype, Array, ArrayDType, ArrayTrait, IntoArray, OwnedArray}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::bitpacking::compress::{unpack, unpack_single}; +use crate::bitpacking::compress::unpack_single; use crate::{match_integers_by_width, unpack_single_primitive, BitPackedArray}; -impl ArrayCompute for BitPackedArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } +mod slice; +impl ArrayCompute for BitPackedArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -41,38 +33,30 @@ impl ArrayCompute for BitPackedArray { } } -impl FlattenFn for BitPackedArray { - fn flatten(&self) -> VortexResult { - unpack(self).map(FlattenedArray::Primitive) - } -} - -impl ScalarAtFn for BitPackedArray { +impl ScalarAtFn for BitPackedArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { if index >= self.len() { return Err(vortex_err!(OutOfBounds: index, 0, self.len())); } if let Some(patches) = self.patches() { // NB: All non-null values are considered patches - if self.bit_width == 0 || patches.is_valid(index) { - return scalar_at(patches, index)?.cast(self.dtype()); + if self.bit_width() == 0 || patches.with_dyn(|a| a.is_valid(index)) { + return scalar_at(&patches, index)?.cast(self.dtype()); } } unpack_single(self, index)?.cast(self.dtype()) } } -impl TakeFn for BitPackedArray { - fn take(&self, indices: &dyn Array) -> VortexResult { +impl TakeFn for BitPackedArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { let ptype = self.dtype().try_into()?; - let taken_validity = self.validity().map(|v| v.take(indices)).transpose()?; + let validity = self.validity(); + let taken_validity = validity.take(indices)?; if self.bit_width() == 0 { return if let Some(patches) = self.patches() { - let primitive_patches = flatten_primitive(&take(patches, indices)?)?; - Ok( - PrimitiveArray::new(ptype, primitive_patches.buffer().clone(), taken_validity) - .into_array(), - ) + let primitive_patches = take(&patches, indices)?.flatten_primitive()?; + Ok(primitive_patches.into_array()) } else { Ok( ConstantArray::new(Scalar::null(&self.dtype().as_nullable()), indices.len()) @@ -81,9 +65,9 @@ impl TakeFn for BitPackedArray { }; } - let indices = flatten_primitive(indices)?; + let indices = indices.clone().flatten_primitive()?; let taken = match_integers_by_width!(ptype, |$T| { - PrimitiveArray::from_nullable(take_primitive::<$T>(self, &indices)?, taken_validity) + PrimitiveArray::from_vec(take_primitive::<$T>(self, &indices)?, taken_validity) }); Ok(taken.reinterpret_cast(ptype).into_array()) } @@ -105,16 +89,10 @@ fn take_primitive( }); let bit_width = array.bit_width(); - let packed = flatten_primitive(array.encoded())?; + let packed = array.packed().flatten_primitive()?; let packed = packed.typed_data::(); - let patches = array - .patches() - .map(|p| { - p.maybe_sparse() - .ok_or_else(|| vortex_err!("Only sparse patches are currently supported!")) - }) - .transpose()?; + let patches = array.patches().map(SparseArray::try_from).transpose()?; // if we have a small number of relatively large batches, we gain by slicing and then patching inside the loop // if we have a large number of relatively small batches, the overhead isn't worth it, and we're better off with a bulk patch @@ -148,23 +126,21 @@ fn take_primitive( } if !prefer_bulk_patch { - if let Some(patches) = patches { + if let Some(ref patches) = patches { let patches_slice = slice( - patches, + patches.array(), chunk * 1024, min((chunk + 1) * 1024, patches.len()), )?; - let patches_slice = patches_slice - .maybe_sparse() - .ok_or_else(|| vortex_err!("Only sparse patches are currently supported!"))?; + let patches_slice = SparseArray::try_from(patches_slice)?; let offsets = PrimitiveArray::from(offsets); - do_patch_for_take_primitive(patches_slice, &offsets, &mut output)?; + do_patch_for_take_primitive(&patches_slice, &offsets, &mut output)?; } } } if prefer_bulk_patch { - if let Some(patches) = patches { + if let Some(ref patches) = patches { do_patch_for_take_primitive(patches, indices, &mut output)?; } } @@ -177,13 +153,14 @@ fn do_patch_for_take_primitive( indices: &PrimitiveArray, output: &mut [T], ) -> VortexResult<()> { - let taken_patches = take(patches, indices)?; - let taken_patches = taken_patches - .maybe_sparse() - .ok_or_else(|| vortex_err!("Only sparse patches are currently supported!"))?; + let taken_patches = take(patches.array(), indices.array())?; + let taken_patches = SparseArray::try_from(taken_patches)?; let base_index = output.len() - indices.len(); - let output_patches = flatten_primitive(taken_patches.values())?.reinterpret_cast(T::PTYPE); + let output_patches = taken_patches + .values() + .flatten_primitive()? + .reinterpret_cast(T::PTYPE); taken_patches .resolved_indices() .iter() @@ -203,17 +180,15 @@ mod test { use itertools::Itertools; use rand::distributions::Uniform; use rand::{thread_rng, Rng}; - use vortex::array::downcast::DowncastArrayBuiltin; - use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; - use vortex::array::Array; + use vortex::array::primitive::{Primitive, PrimitiveArray}; + use vortex::array::sparse::SparseArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::compute::scalar_at::scalar_at; use vortex::compute::take::take; use vortex::encoding::EncodingRef; - use vortex::scalar::Scalar; + use vortex::{ArrayDef, IntoArray}; - use crate::downcast::DowncastFastlanes; - use crate::BitPackedEncoding; + use crate::{BitPackedArray, BitPackedEncoding}; #[test] fn take_indices() { @@ -222,10 +197,11 @@ mod test { let indices = PrimitiveArray::from(vec![0, 125, 2047, 2049, 2151, 2790]); let unpacked = PrimitiveArray::from((0..4096).map(|i| (i % 63) as u8).collect::>()); - let bitpacked = ctx.compress(&unpacked, None).unwrap(); - let result = take(&bitpacked, &indices).unwrap(); - assert_eq!(result.encoding().id(), PrimitiveEncoding::ID); - let res_bytes = result.as_primitive().typed_data::(); + let bitpacked = ctx.compress(unpacked.array(), None).unwrap(); + let result = take(&bitpacked, indices.array()).unwrap(); + assert_eq!(result.encoding().id(), Primitive::ID); + let primitive_result = result.flatten_primitive().unwrap(); + let res_bytes = primitive_result.typed_data::(); assert_eq!(res_bytes, &[0, 62, 31, 33, 9, 18]); } @@ -238,12 +214,12 @@ mod test { let values = (0..u16::MAX as u32 + num_patches as u32).collect::>(); let uncompressed = PrimitiveArray::from(values.clone()); let packed = BitPackedEncoding {} - .compress(&uncompressed, None, ctx) + .compress(uncompressed.array(), None, ctx) .unwrap(); - let packed = packed.as_bitpacked(); + let packed = BitPackedArray::try_from(packed).unwrap(); assert!(packed.patches().is_some()); - let patches = packed.patches().unwrap().as_sparse(); + let patches = SparseArray::try_from(packed.patches().unwrap()).unwrap(); assert_eq!( patches.resolved_indices(), ((values.len() + 1 - num_patches)..values.len()).collect_vec() @@ -257,7 +233,7 @@ mod test { .map(|i| i as u32) .collect_vec() .into(); - let taken = take(packed, &random_indices).unwrap(); + let taken = take(packed.array(), random_indices.array()).unwrap(); // sanity check random_indices @@ -266,12 +242,12 @@ mod test { .enumerate() .for_each(|(ti, i)| { assert_eq!( - scalar_at(packed, *i as usize).unwrap(), - Scalar::from(values[*i as usize]) + u32::try_from(scalar_at(packed.array(), *i as usize).unwrap()).unwrap(), + values[*i as usize] ); assert_eq!( - scalar_at(&taken, ti).unwrap(), - Scalar::from(values[*i as usize]) + u32::try_from(scalar_at(&taken, ti).unwrap()).unwrap(), + values[*i as usize] ); }); } @@ -286,14 +262,17 @@ mod test { let packed = BitPackedEncoding .compress(&uncompressed, None, ctx) .unwrap(); - let packed = packed.as_bitpacked(); + let packed = BitPackedArray::try_from(packed).unwrap(); assert!(packed.patches().is_some()); - let patches = packed.patches().unwrap().as_sparse(); + let patches = SparseArray::try_from(packed.patches().unwrap()).unwrap(); assert_eq!(patches.resolved_indices(), vec![256]); values.iter().enumerate().for_each(|(i, v)| { - assert_eq!(scalar_at(packed, i).unwrap(), Scalar::from(*v)); + assert_eq!( + u32::try_from(scalar_at(packed.array(), i).unwrap()).unwrap(), + *v + ); }); } } diff --git a/vortex-fastlanes/src/bitpacking/compute/slice.rs b/vortex-fastlanes/src/bitpacking/compute/slice.rs index af81a71257..1950972641 100644 --- a/vortex-fastlanes/src/bitpacking/compute/slice.rs +++ b/vortex-fastlanes/src/bitpacking/compute/slice.rs @@ -1,29 +1,28 @@ use std::cmp::max; -use vortex::array::{Array, ArrayRef}; use vortex::compute::slice::{slice, SliceFn}; -use vortex::validity::OwnedValidity; +use vortex::{ArrayDType, IntoArray, OwnedArray, ToStatic}; use vortex_error::VortexResult; use crate::BitPackedArray; -impl SliceFn for BitPackedArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for BitPackedArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { let offset = start % 1024; let block_start = max(0, start - offset); let block_stop = ((stop + 1023) / 1024) * 1024; - let encoded_start = (block_start / 8) * self.bit_width; - let encoded_stop = (block_stop / 8) * self.bit_width; - Ok(Self::try_new_from_offset( - slice(self.encoded(), encoded_start, encoded_stop)?, - self.validity().map(|v| v.slice(start, stop)).transpose()?, - self.patches().map(|p| slice(p, start, stop)).transpose()?, + let encoded_start = (block_start / 8) * self.bit_width(); + let encoded_stop = (block_stop / 8) * self.bit_width(); + Self::try_new_from_offset( + slice(&self.packed(), encoded_start, encoded_stop)?, + self.validity().slice(start, stop)?, + self.patches().map(|p| slice(&p, start, stop)).transpose()?, self.bit_width(), self.dtype().clone(), stop - start, offset, - )? - .into_array()) + ) + .map(|a| a.into_array().to_static()) } } diff --git a/vortex-fastlanes/src/bitpacking/mod.rs b/vortex-fastlanes/src/bitpacking/mod.rs index 0fb14662b9..8a3d256294 100644 --- a/vortex-fastlanes/src/bitpacking/mod.rs +++ b/vortex-fastlanes/src/bitpacking/mod.rs @@ -1,209 +1,171 @@ -use std::sync::{Arc, RwLock}; - +use ::serde::{Deserialize, Serialize}; pub use compress::*; -use vortex::array::{Array, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; -use vortex::validity::Validity; -use vortex::validity::{OwnedValidity, ValidityView}; -use vortex::view::AsView; -use vortex::{impl_array, ArrayWalker}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, IntoArrayData}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::{DType, IntWidth, Nullability, Signedness}; +use vortex_schema::{IntWidth, Nullability, Signedness}; mod compress; mod compute; -mod serde; -/// NB: All non-null values in the patches array are considered patches -#[derive(Debug, Clone)] -pub struct BitPackedArray { - encoded: ArrayRef, - validity: Option, - patches: Option, - offset: usize, - len: usize, +impl_encoding!("fastlanes.bitpacked", BitPacked); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BitPackedMetadata { + validity: ValidityMetadata, + patches_dtype: Option, bit_width: usize, - dtype: DType, - stats: Arc>, + offset: usize, + length: usize, } -impl BitPackedArray { +/// NB: All non-null values in the patches array are considered patches +impl BitPackedArray<'_> { const ENCODED_DTYPE: DType = DType::Int(IntWidth::_8, Signedness::Unsigned, Nullability::NonNullable); pub fn try_new( - encoded: ArrayRef, - validity: Option, - patches: Option, + packed: Array, + validity: Validity, + patches: Option, bit_width: usize, dtype: DType, len: usize, ) -> VortexResult { - Self::try_new_from_offset(encoded, validity, patches, bit_width, dtype, len, 0) + Self::try_new_from_offset(packed, validity, patches, bit_width, dtype, len, 0) } pub(crate) fn try_new_from_offset( - encoded: ArrayRef, - validity: Option, - patches: Option, + packed: Array, + validity: Validity, + patches: Option, bit_width: usize, dtype: DType, - len: usize, + length: usize, offset: usize, ) -> VortexResult { - if encoded.dtype() != &Self::ENCODED_DTYPE { - vortex_bail!(MismatchedTypes: Self::ENCODED_DTYPE, encoded.dtype()); - } - if let Some(v) = &validity { - assert_eq!(v.len(), len); + if packed.dtype() != &Self::ENCODED_DTYPE { + vortex_bail!(MismatchedTypes: Self::ENCODED_DTYPE, packed.dtype()); } if bit_width > 64 { - return Err(vortex_err!("Unsupported bit width {}", bit_width)); + vortex_bail!("Unsupported bit width {}", bit_width); } if !matches!(dtype, DType::Int(_, _, _)) { - return Err(vortex_err!(MismatchedTypes: "int", dtype)); + vortex_bail!(MismatchedTypes: "int", dtype); } - let expected_packed_size = ((len + 1023) / 1024) * 128 * bit_width; - if encoded.len() != expected_packed_size { + let expected_packed_size = ((length + 1023) / 1024) * 128 * bit_width; + if packed.len() != expected_packed_size { return Err(vortex_err!( "Expected {} packed bytes, got {}", expected_packed_size, - encoded.len() + packed.len() )); } - Ok(Self { - encoded, - validity, - patches, + let metadata = BitPackedMetadata { + validity: validity.to_metadata(length)?, + patches_dtype: patches.as_ref().map(|p| p.dtype().as_nullable()), offset, - len, + length, bit_width, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - - #[inline] - pub fn encoded(&self) -> &ArrayRef { - &self.encoded - } - - #[inline] - pub fn bit_width(&self) -> usize { - self.bit_width - } - - #[inline] - pub fn patches(&self) -> Option<&ArrayRef> { - self.patches.as_ref() - } - - #[inline] - pub fn offset(&self) -> usize { - self.offset - } -} + }; -impl Array for BitPackedArray { - impl_array!(); - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - #[inline] - fn len(&self) -> usize { - self.len - } + let mut children = Vec::with_capacity(3); + children.push(packed.into_array_data()); + if let Some(p) = patches { + children.push(p.into_array_data()); + } + if let Some(a) = validity.into_array_data() { + children.push(a) + } - #[inline] - fn is_empty(&self) -> bool { - self.len == 0 + Self::try_from_parts(dtype, metadata, children.into(), HashMap::new()) } #[inline] - fn dtype(&self) -> &DType { - &self.dtype + pub fn packed(&self) -> Array { + self.array() + .child(0, &DType::BYTES) + .expect("Missing packed array") } #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) + pub fn bit_width(&self) -> usize { + self.metadata().bit_width } #[inline] - fn encoding(&self) -> EncodingRef { - &BitPackedEncoding + pub fn patches(&self) -> Option { + self.metadata().patches_dtype.as_ref().map(|pd| { + self.array() + .child(1, pd) + .expect("Missing patches with present metadata flag") + }) } #[inline] - fn nbytes(&self) -> usize { - // Ignore any overheads like padding or the bit-width flag. - let packed_size = ((self.bit_width * self.len()) + 7) / 8; - packed_size - + self.patches().map(|p| p.nbytes()).unwrap_or(0) - + self.validity().map(|v| v.nbytes()).unwrap_or(0) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) + pub fn offset(&self) -> usize { + self.metadata().offset } - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.encoded()) + pub fn validity(&self) -> Validity { + self.metadata().validity.to_validity(self.array().child( + if self.metadata().patches_dtype.is_some() { + 2 + } else { + 1 + }, + &Validity::DTYPE, + )) } } -impl OwnedValidity for BitPackedArray { - fn validity(&self) -> Option { - self.validity.as_view() +impl ArrayFlatten for BitPackedArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + unpack(self).map(Flattened::Primitive) } } -impl ArrayDisplay for BitPackedArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("offset", self.offset)?; - f.property("packed", format!("u{}", self.bit_width()))?; - f.child("encoded", self.encoded())?; - f.maybe_child("patches", self.patches())?; - f.validity(self.validity()) +impl ArrayValidity for BitPackedArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) } -} -impl StatsCompute for BitPackedArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - Ok(StatsSet::default()) + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) } } -#[derive(Debug)] -pub struct BitPackedEncoding; - -impl BitPackedEncoding { - pub const ID: EncodingId = EncodingId::new("fastlanes.bitpacked"); +impl AcceptArrayVisitor for BitPackedArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("packed", &self.packed())?; + if self.metadata().patches_dtype.is_some() { + visitor.visit_child( + "patches", + &self.patches().expect("Expected patches to be present "), + )?; + } + visitor.visit_validity(&self.validity()) + } } -impl Encoding for BitPackedEncoding { - fn id(&self) -> EncodingId { - Self::ID - } +impl ArrayStatisticsCompute for BitPackedArray<'_> {} - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) +impl ArrayTrait for BitPackedArray<'_> { + fn len(&self) -> usize { + self.metadata().length } - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) + fn nbytes(&self) -> usize { + // Ignore any overheads like padding or the bit-width flag. + let packed_size = ((self.bit_width() * self.len()) + 7) / 8; + packed_size + self.patches().map(|p| p.nbytes()).unwrap_or(0) } } @@ -228,7 +190,6 @@ mod test { use std::sync::Arc; use vortex::array::primitive::PrimitiveArray; - use vortex::array::Array; use vortex::compress::{CompressConfig, CompressCtx}; use vortex::compute::scalar_at::scalar_at; use vortex::compute::slice::slice; @@ -243,7 +204,8 @@ mod test { let compressed = slice( &ctx.compress( - &PrimitiveArray::from((0..10_000).map(|i| (i % 63) as u8).collect::>()), + PrimitiveArray::from((0..10_000).map(|i| (i % 63) as u8).collect::>()) + .array(), None, ) .unwrap(), @@ -268,7 +230,8 @@ mod test { let compressed = slice( &ctx.compress( - &PrimitiveArray::from((0..10_000).map(|i| (i % 63) as u8).collect::>()), + PrimitiveArray::from((0..10_000).map(|i| (i % 63) as u8).collect::>()) + .array(), None, ) .unwrap(), diff --git a/vortex-fastlanes/src/bitpacking/serde.rs b/vortex-fastlanes/src/bitpacking/serde.rs deleted file mode 100644 index 3474f09166..0000000000 --- a/vortex-fastlanes/src/bitpacking/serde.rs +++ /dev/null @@ -1,47 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex::validity::OwnedValidity; -use vortex_error::VortexResult; - -use crate::{BitPackedArray, BitPackedEncoding}; - -impl ArraySerde for BitPackedArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write(self.encoded())?; - ctx.write_validity(self.validity())?; - ctx.write_optional_array(self.patches())?; - ctx.write_usize(self.bit_width())?; - ctx.write_usize(self.len())?; - ctx.write_usize(self.offset()) - } - - fn metadata(&self) -> VortexResult>> { - let mut vec = Vec::new(); - let mut ctx = WriteCtx::new(&mut vec); - ctx.write_usize(self.bit_width())?; - ctx.write_usize(self.len())?; - ctx.write_usize(self.offset())?; - Ok(Some(vec)) - } -} - -impl EncodingSerde for BitPackedEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let encoded = ctx.bytes().read()?; - let validity = ctx.read_validity()?; - let patches = ctx.read_optional_array()?; - let bit_width = ctx.read_usize()?; - let len = ctx.read_usize()?; - let offset = ctx.read_usize()?; - Ok(BitPackedArray::try_new_from_offset( - encoded, - validity, - patches, - bit_width, - ctx.schema().clone(), - len, - offset, - )? - .into_array()) - } -} diff --git a/vortex-fastlanes/src/delta/compress.rs b/vortex-fastlanes/src/delta/compress.rs index cb3ff2d951..64c3c9b02b 100644 --- a/vortex-fastlanes/src/delta/compress.rs +++ b/vortex-fastlanes/src/delta/compress.rs @@ -3,30 +3,25 @@ use std::mem::size_of; use arrayref::array_ref; use fastlanez::{transpose, untranspose_into, Delta}; use num_traits::{WrappingAdd, WrappingSub}; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::compute::fill::fill_forward; -use vortex::compute::flatten::flatten_primitive; -use vortex::match_each_integer_ptype; use vortex::ptype::NativePType; -use vortex::validity::OwnedValidity; use vortex::validity::Validity; -use vortex::view::ToOwnedView; +use vortex::{match_each_integer_ptype, Array, IntoArray, OwnedArray}; use vortex_error::VortexResult; +use vortex_schema::Nullability; -use crate::downcast::DowncastFastlanes; use crate::{DeltaArray, DeltaEncoding}; impl EncodingCompression for DeltaEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive arrays - let parray = array.maybe_primitive()?; + let parray = PrimitiveArray::try_from(array).ok()?; // Only supports ints if !parray.ptype().is_int() { @@ -38,39 +33,45 @@ impl EncodingCompression for DeltaEncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { - let parray = array.as_primitive(); - let like_delta = like.map(|l| l.as_delta()); + ) -> VortexResult { + let parray = PrimitiveArray::try_from(array)?; + let like_delta = like.map(|l| DeltaArray::try_from(l).unwrap()); let validity = ctx.compress_validity(parray.validity())?; // Fill forward nulls - let filled = fill_forward(array)?; + let filled = fill_forward(array)?.flatten_primitive()?; // Compress the filled array let (bases, deltas) = match_each_integer_ptype!(parray.ptype(), |$T| { - let (bases, deltas) = compress_primitive(filled.as_primitive().typed_data::<$T>()); - let base_validity = validity.is_some().then(|| Validity::Valid(bases.len())); - let delta_validity = validity.is_some().then(|| Validity::Valid(deltas.len())); + let (bases, deltas) = compress_primitive(filled.typed_data::<$T>()); + let base_validity = (validity.nullability() != Nullability::NonNullable) + .then(|| Validity::AllValid) + .unwrap_or(Validity::NonNullable); + let delta_validity = (validity.nullability() != Nullability::NonNullable) + .then(|| Validity::AllValid) + .unwrap_or(Validity::NonNullable); ( // To preserve nullability, we include Validity - PrimitiveArray::from_nullable(bases, base_validity), - PrimitiveArray::from_nullable(deltas, delta_validity), + PrimitiveArray::from_vec(bases, base_validity), + PrimitiveArray::from_vec(deltas, delta_validity), ) }); // Recursively compress the bases and deltas - let bases = ctx - .named("bases") - .compress(&bases, like_delta.map(|d| d.bases()))?; - let deltas = ctx - .named("deltas") - .compress(&deltas, like_delta.map(|d| d.deltas()))?; - - Ok(DeltaArray::try_new(array.len(), bases, deltas, validity)?.into_array()) + let bases = ctx.named("bases").compress( + bases.array(), + like_delta.as_ref().map(|d| d.bases()).as_ref(), + )?; + let deltas = ctx.named("deltas").compress( + deltas.array(), + like_delta.as_ref().map(|d| d.deltas()).as_ref(), + )?; + + DeltaArray::try_new(array.len(), bases, deltas, validity).map(|a| a.into_array()) } } @@ -129,13 +130,13 @@ where (bases, deltas) } -pub fn decompress(array: &DeltaArray) -> VortexResult { - let bases = flatten_primitive(array.bases())?; - let deltas = flatten_primitive(array.deltas())?; +pub fn decompress(array: DeltaArray) -> VortexResult { + let bases = array.bases().flatten_primitive()?; + let deltas = array.deltas().flatten_primitive()?; let decoded = match_each_integer_ptype!(deltas.ptype(), |$T| { - PrimitiveArray::from_nullable( + PrimitiveArray::from_vec( decompress_primitive::<$T>(bases.typed_data(), deltas.typed_data()), - array.validity().to_owned_view() + array.validity() ) }); Ok(decoded) @@ -192,7 +193,7 @@ where mod test { use std::sync::Arc; - use vortex::encoding::{Encoding, EncodingRef}; + use vortex::encoding::{ArrayEncoding, EncodingRef}; use super::*; @@ -216,11 +217,11 @@ mod test { fn do_roundtrip_test(input: Vec) { let ctx = compress_ctx(); let compressed = DeltaEncoding {} - .compress(&PrimitiveArray::from(input.clone()), None, ctx) + .compress(PrimitiveArray::from(input.clone()).array(), None, ctx) .unwrap(); assert_eq!(compressed.encoding().id(), DeltaEncoding.id()); - let delta = compressed.as_delta(); + let delta = DeltaArray::try_from(compressed).unwrap(); let decompressed = decompress(delta).unwrap(); let decompressed_slice = decompressed.typed_data::(); diff --git a/vortex-fastlanes/src/delta/compute.rs b/vortex-fastlanes/src/delta/compute.rs index 4fca1c9cc1..5b99651012 100644 --- a/vortex-fastlanes/src/delta/compute.rs +++ b/vortex-fastlanes/src/delta/compute.rs @@ -1,18 +1,5 @@ -use vortex::compute::flatten::{FlattenFn, FlattenedArray}; use vortex::compute::ArrayCompute; -use vortex_error::VortexResult; -use crate::delta::compress::decompress; use crate::DeltaArray; -impl ArrayCompute for DeltaArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } -} - -impl FlattenFn for DeltaArray { - fn flatten(&self) -> VortexResult { - decompress(self).map(FlattenedArray::Primitive) - } -} +impl ArrayCompute for DeltaArray<'_> {} diff --git a/vortex-fastlanes/src/delta/mod.rs b/vortex-fastlanes/src/delta/mod.rs index a984e0995e..7dd25a0cb0 100644 --- a/vortex-fastlanes/src/delta/mod.rs +++ b/vortex-fastlanes/src/delta/mod.rs @@ -1,38 +1,30 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; -use vortex::validity::Validity; -use vortex::validity::{OwnedValidity, ValidityView}; -use vortex::view::AsView; -use vortex::{impl_array, match_each_integer_ptype, ArrayWalker}; +use serde::{Deserialize, Serialize}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::ValidityMetadata; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, match_each_integer_ptype, ArrayDType, ArrayFlatten, IntoArrayData}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; + +use crate::delta::compress::decompress; mod compress; mod compute; -mod serde; -#[derive(Debug, Clone)] -pub struct DeltaArray { +impl_encoding!("fastlanes.delta", Delta); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DeltaMetadata { + validity: ValidityMetadata, len: usize, - bases: ArrayRef, - deltas: ArrayRef, - validity: Option, - stats: Arc>, } -impl DeltaArray { +impl DeltaArray<'_> { pub fn try_new( len: usize, - bases: ArrayRef, - deltas: ArrayRef, - validity: Option, + bases: Array, + deltas: Array, + validity: Validity, ) -> VortexResult { if bases.dtype() != deltas.dtype() { vortex_bail!( @@ -49,23 +41,25 @@ impl DeltaArray { ); } - let delta = Self { - len, - bases, - deltas, - validity, - stats: Arc::new(RwLock::new(StatsSet::new())), - }; + let delta = Self::try_from_parts( + bases.dtype().clone(), + DeltaMetadata { + validity: validity.to_metadata(len)?, + len, + }, + vec![bases.into_array_data(), deltas.into_array_data()].into(), + HashMap::new(), + )?; let expected_bases_len = { let num_chunks = len / 1024; let remainder_base_size = if len % 1024 > 0 { 1 } else { 0 }; num_chunks * delta.lanes() + remainder_base_size }; - if delta.bases.len() != expected_bases_len { + if delta.bases().len() != expected_bases_len { vortex_bail!( "DeltaArray: bases.len() ({}) != expected_bases_len ({}), based on len ({}) and lane count ({})", - delta.bases.len(), + delta.bases().len(), expected_bases_len, len, delta.lanes() @@ -75,13 +69,13 @@ impl DeltaArray { } #[inline] - pub fn bases(&self) -> &ArrayRef { - &self.bases + pub fn bases(&self) -> Array { + self.array().child(0, self.dtype()).expect("Missing bases") } #[inline] - pub fn deltas(&self) -> &ArrayRef { - &self.deltas + pub fn deltas(&self) -> Array { + self.array().child(1, self.dtype()).expect("Missing deltas") } #[inline] @@ -91,103 +85,44 @@ impl DeltaArray { <$T as fastlanez::Delta>::lanes() }) } -} - -impl Array for DeltaArray { - impl_array!(); - #[inline] - fn len(&self) -> usize { - self.len - } - #[inline] - fn is_empty(&self) -> bool { - self.bases.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - self.bases.dtype() - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &DeltaEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.bases().nbytes() - + self.deltas().nbytes() - + self.validity().map(|v| v.nbytes()).unwrap_or(0) - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.bases())?; - walker.visit_child(self.deltas()) + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(2, &Validity::DTYPE)) } } -impl<'arr> AsRef<(dyn Array + 'arr)> for DeltaArray { - fn as_ref(&self) -> &(dyn Array + 'arr) { - self +impl ArrayFlatten for DeltaArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + decompress(self).map(Flattened::Primitive) } } -impl OwnedValidity for DeltaArray { - fn validity(&self) -> Option { - self.validity.as_view() +impl ArrayValidity for DeltaArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) } -} -impl ArrayDisplay for DeltaArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("bases", self.bases())?; - f.child("deltas", self.deltas())?; - f.validity(self.validity()) + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) } } -impl StatsCompute for DeltaArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - Ok(StatsSet::default()) +impl AcceptArrayVisitor for DeltaArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("bases", &self.bases())?; + visitor.visit_child("deltas", &self.deltas()) } } -#[derive(Debug)] -pub struct DeltaEncoding; +impl ArrayStatisticsCompute for DeltaArray<'_> {} -impl DeltaEncoding { - pub const ID: EncodingId = EncodingId::new("fastlanes.delta"); -} - -impl Encoding for DeltaEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for DeltaArray<'_> { + fn len(&self) -> usize { + self.metadata().len } } diff --git a/vortex-fastlanes/src/delta/serde.rs b/vortex-fastlanes/src/delta/serde.rs deleted file mode 100644 index 0c811bab81..0000000000 --- a/vortex-fastlanes/src/delta/serde.rs +++ /dev/null @@ -1,31 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex::validity::OwnedValidity; -use vortex_error::VortexResult; - -use crate::{DeltaArray, DeltaEncoding}; - -impl ArraySerde for DeltaArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - ctx.write(self.bases())?; - ctx.write(self.deltas())?; - ctx.write_validity(self.validity()) - } - - fn metadata(&self) -> VortexResult>> { - todo!() - } -} - -impl EncodingSerde for DeltaEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let bases = ctx.read()?; - let deltas = ctx.read()?; - let validity = ctx.read_validity()?; - Ok(DeltaArray::try_new(len, bases, deltas, validity) - .unwrap() - .into_array()) - } -} diff --git a/vortex-fastlanes/src/downcast.rs b/vortex-fastlanes/src/downcast.rs deleted file mode 100644 index 69b7d6747c..0000000000 --- a/vortex-fastlanes/src/downcast.rs +++ /dev/null @@ -1,59 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::{BitPackedArray, DeltaArray, FoRArray}; - -mod private { - pub trait Sealed {} -} - -pub trait DowncastFastlanes: private::Sealed { - fn maybe_for(&self) -> Option<&FoRArray>; - - fn as_for(&self) -> &FoRArray { - self.maybe_for().unwrap() - } - - fn maybe_delta(&self) -> Option<&DeltaArray>; - - fn as_delta(&self) -> &DeltaArray { - self.maybe_delta().unwrap() - } - - fn maybe_bitpacked(&self) -> Option<&BitPackedArray>; - - fn as_bitpacked(&self) -> &BitPackedArray { - self.maybe_bitpacked().unwrap() - } -} - -impl private::Sealed for dyn Array + '_ {} - -impl DowncastFastlanes for dyn Array + '_ { - fn maybe_for(&self) -> Option<&FoRArray> { - self.as_any().downcast_ref() - } - - fn maybe_delta(&self) -> Option<&DeltaArray> { - self.as_any().downcast_ref() - } - - fn maybe_bitpacked(&self) -> Option<&BitPackedArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastFastlanes for ArrayRef { - fn maybe_for(&self) -> Option<&FoRArray> { - self.as_any().downcast_ref() - } - - fn maybe_delta(&self) -> Option<&DeltaArray> { - self.as_any().downcast_ref() - } - - fn maybe_bitpacked(&self) -> Option<&BitPackedArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-fastlanes/src/for/compress.rs b/vortex-fastlanes/src/for/compress.rs index 8a7559e3f5..f7049efc45 100644 --- a/vortex-fastlanes/src/for/compress.rs +++ b/vortex-fastlanes/src/for/compress.rs @@ -1,20 +1,14 @@ use itertools::Itertools; use num_traits::{PrimInt, WrappingAdd, WrappingSub}; use vortex::array::constant::ConstantArray; -use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::compute::flatten::flatten_primitive; -use vortex::match_each_integer_ptype; use vortex::ptype::{NativePType, PType}; use vortex::scalar::ListScalarVec; -use vortex::stats::Stat; -use vortex::validity::OwnedValidity; -use vortex::view::ToOwnedView; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::{match_each_integer_ptype, Array, ArrayDType, ArrayTrait, IntoArray, OwnedArray}; use vortex_error::{vortex_err, VortexResult}; -use crate::downcast::DowncastFastlanes; use crate::{FoRArray, FoREncoding}; impl EncodingCompression for FoREncoding { @@ -24,11 +18,11 @@ impl EncodingCompression for FoREncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive arrays - let parray = array.maybe_primitive()?; + let parray = PrimitiveArray::try_from(array).ok()?; // Only supports integers if !parray.ptype().is_int() { @@ -36,8 +30,8 @@ impl EncodingCompression for FoREncoding { } // Nothing for us to do if the min is already zero and tz == 0 - let shift = trailing_zeros(parray); - let min = parray.stats().get_or_compute_cast::(&Stat::Min)?; + let shift = trailing_zeros(parray.array()); + let min = parray.statistics().compute_as_cast::(Stat::Min)?; if min == 0 && shift == 0 { return None; } @@ -47,15 +41,15 @@ impl EncodingCompression for FoREncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { - let parray = array.as_primitive(); - let shift = trailing_zeros(parray); + ) -> VortexResult { + let parray = PrimitiveArray::try_from(array)?; + let shift = trailing_zeros(array); let min = parray - .stats() - .get_or_compute(&Stat::Min) + .statistics() + .compute(Stat::Min) .ok_or_else(|| vortex_err!("Min stat not found"))?; let child = match_each_integer_ptype!(parray.ptype(), |$T| { @@ -65,17 +59,18 @@ impl EncodingCompression for FoREncoding { compress_primitive::<$T>(parray, shift, $T::try_from(min.clone())?).into_array() } }); + let for_like = like.map(|like_arr| FoRArray::try_from(like_arr).unwrap()); let compressed_child = ctx .named("for") .excluding(&FoREncoding) - .compress(&child, like.map(|l| l.as_for().encoded()))?; - Ok(FoRArray::try_new(compressed_child, min, shift)?.into_array()) + .compress(&child, for_like.as_ref().map(|l| l.encoded()).as_ref())?; + FoRArray::try_new(compressed_child, min, shift).map(|a| a.into_array()) } } fn compress_primitive( - parray: &PrimitiveArray, + parray: PrimitiveArray, shift: u8, min: T, ) -> PrimitiveArray { @@ -96,18 +91,18 @@ fn compress_primitive( .collect_vec() }; - PrimitiveArray::from_nullable(values, parray.validity().to_owned_view()) + PrimitiveArray::from_vec(values, parray.validity()) } -pub fn decompress(array: &FoRArray) -> VortexResult { +pub fn decompress(array: FoRArray) -> VortexResult { let shift = array.shift(); let ptype: PType = array.dtype().try_into()?; - let encoded = flatten_primitive(array.encoded())?; + let encoded = array.encoded().flatten_primitive()?; Ok(match_each_integer_ptype!(ptype, |$T| { let reference: $T = array.reference().try_into()?; - PrimitiveArray::from_nullable( + PrimitiveArray::from_vec( decompress_primitive(encoded.typed_data::<$T>(), reference, shift), - encoded.validity().to_owned_view(), + encoded.validity(), ) })) } @@ -132,10 +127,10 @@ fn decompress_primitive( } } -fn trailing_zeros(array: &dyn Array) -> u8 { +fn trailing_zeros(array: &Array) -> u8 { let tz_freq = array - .stats() - .get_or_compute_as::>(&Stat::TrailingZeroFreq) + .statistics() + .compute_as::>(Stat::TrailingZeroFreq) .map(|v| v.0) .unwrap_or(vec![0]); tz_freq @@ -151,7 +146,7 @@ mod test { use std::sync::Arc; use vortex::compute::scalar_at::ScalarAtFn; - use vortex::encoding::{Encoding, EncodingRef}; + use vortex::encoding::{ArrayEncoding, EncodingRef}; use super::*; use crate::BitPackedEncoding; @@ -170,10 +165,10 @@ mod test { // Create a range offset by a million let array = PrimitiveArray::from((0u32..10_000).map(|v| v + 1_000_000).collect_vec()); - let compressed = ctx.compress(&array, None).unwrap(); + let compressed = ctx.compress(array.array(), None).unwrap(); assert_eq!(compressed.encoding().id(), FoREncoding.id()); assert_eq!( - u32::try_from(compressed.as_for().reference()).unwrap(), + u32::try_from(FoRArray::try_from(compressed).unwrap().reference()).unwrap(), 1_000_000u32 ); } @@ -184,10 +179,10 @@ mod test { // Create a range offset by a million let array = PrimitiveArray::from((0u32..10_000).map(|v| v + 1_000_000).collect_vec()); - let compressed = ctx.compress(&array, None).unwrap(); + let compressed = ctx.compress(array.array(), None).unwrap(); assert_eq!(compressed.encoding().id(), FoREncoding.id()); - let decompressed = flatten_primitive(compressed.as_ref()).unwrap(); + let decompressed = compressed.flatten_primitive().unwrap(); assert_eq!(decompressed.typed_data::(), array.typed_data::()); } @@ -197,16 +192,16 @@ mod test { // Create a range offset by a million let array = PrimitiveArray::from((i8::MIN..i8::MAX).collect_vec()); - let compressed = FoREncoding {}.compress(&array, None, ctx).unwrap(); - let compressed = compressed.as_for(); + let compressed = FoREncoding {}.compress(array.array(), None, ctx).unwrap(); + let compressed = FoRArray::try_from(compressed).unwrap(); assert_eq!(i8::MIN, compressed.reference().try_into().unwrap()); - let encoded = flatten_primitive(compressed.encoded()).unwrap(); + let encoded = compressed.encoded().flatten_primitive().unwrap(); let bitcast: &[u8] = unsafe { std::mem::transmute(encoded.typed_data::()) }; let unsigned: Vec = (0..u8::MAX).collect_vec(); assert_eq!(bitcast, unsigned.as_slice()); - let decompressed = flatten_primitive(compressed).unwrap(); + let decompressed = compressed.array().clone().flatten_primitive().unwrap(); assert_eq!(decompressed.typed_data::(), array.typed_data::()); array .typed_data::() diff --git a/vortex-fastlanes/src/for/compute.rs b/vortex-fastlanes/src/for/compute.rs index 790e45c373..35080b0726 100644 --- a/vortex-fastlanes/src/for/compute.rs +++ b/vortex-fastlanes/src/for/compute.rs @@ -1,21 +1,14 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; -use vortex::match_each_integer_ptype; use vortex::scalar::{PrimitiveScalar, Scalar}; +use vortex::{match_each_integer_ptype, Array, IntoArray, OwnedArray}; use vortex_error::VortexResult; -use crate::r#for::compress::decompress; use crate::FoRArray; -impl ArrayCompute for FoRArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - +impl ArrayCompute for FoRArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -29,26 +22,20 @@ impl ArrayCompute for FoRArray { } } -impl FlattenFn for FoRArray { - fn flatten(&self) -> VortexResult { - decompress(self).map(FlattenedArray::Primitive) - } -} - -impl TakeFn for FoRArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - Ok(FoRArray::try_new( - take(self.encoded(), indices)?, - self.reference.clone(), - self.shift, - )? - .into_array()) +impl TakeFn for FoRArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + FoRArray::try_new( + take(&self.encoded(), indices)?, + self.reference().clone(), + self.shift(), + ) + .map(|a| a.into_array()) } } -impl ScalarAtFn for FoRArray { +impl ScalarAtFn for FoRArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - let encoded_scalar = scalar_at(self.encoded(), index)?; + let encoded_scalar = scalar_at(&self.encoded(), index)?; match (&encoded_scalar, self.reference()) { (Scalar::Primitive(p), Scalar::Primitive(r)) => match p.value() { @@ -66,14 +53,14 @@ impl ScalarAtFn for FoRArray { } } -impl SliceFn for FoRArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(FoRArray::try_new( - slice(self.encoded(), start, stop)?, +impl SliceFn for FoRArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + FoRArray::try_new( + slice(&self.encoded(), start, stop)?, self.reference().clone(), self.shift(), - )? - .into_array()) + ) + .map(|a| a.into_array()) } } @@ -89,7 +76,7 @@ mod test { fn for_scalar_at() { let forarr = FoREncoding .compress( - &PrimitiveArray::from(vec![11, 15, 19]), + PrimitiveArray::from(vec![11, 15, 19]).array(), None, CompressCtx::default(), ) diff --git a/vortex-fastlanes/src/for/mod.rs b/vortex-fastlanes/src/for/mod.rs index d98cc48cea..4a055482f7 100644 --- a/vortex-fastlanes/src/for/mod.rs +++ b/vortex-fastlanes/src/for/mod.rs @@ -1,151 +1,88 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::scalar::Scalar; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; +use serde::{Deserialize, Serialize}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, ToArrayData}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; + +use crate::r#for::compress::decompress; mod compress; mod compute; -mod serde; -#[derive(Debug, Clone)] -pub struct FoRArray { - encoded: ArrayRef, +impl_encoding!("fastlanes.for", FoR); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FoRMetadata { reference: Scalar, shift: u8, - stats: Arc>, } -impl FoRArray { - pub fn try_new(child: ArrayRef, reference: Scalar, shift: u8) -> VortexResult { +impl FoRArray<'_> { + pub fn try_new(child: Array, reference: Scalar, shift: u8) -> VortexResult { if reference.is_null() { vortex_bail!("Reference value cannot be null",); } let reference = reference.cast(child.dtype())?; - Ok(Self { - encoded: child, - reference, - shift, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) + Self::try_from_parts( + child.dtype().clone(), + FoRMetadata { reference, shift }, + vec![child.to_array_data()].into(), + HashMap::new(), + ) } #[inline] - pub fn encoded(&self) -> &ArrayRef { - &self.encoded + pub fn encoded(&self) -> Array { + self.array() + .child(0, self.dtype()) + .expect("Missing FoR child") } #[inline] pub fn reference(&self) -> &Scalar { - &self.reference + &self.metadata().reference } #[inline] pub fn shift(&self) -> u8 { - self.shift + self.metadata().shift } } -impl Array for FoRArray { - impl_array!(); - - #[inline] - fn len(&self) -> usize { - self.encoded.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.encoded.is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - self.encoded.dtype() - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &FoREncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.encoded.nbytes() + self.reference.nbytes() - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.encoded()) - } - - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } -} - -impl ArrayValidity for FoRArray { - fn logical_validity(&self) -> Validity { - self.encoded().logical_validity() - } - +impl ArrayValidity for FoRArray<'_> { fn is_valid(&self, index: usize) -> bool { - self.encoded().is_valid(index) + self.encoded().with_dyn(|a| a.is_valid(index)) } -} -impl ArrayDisplay for FoRArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("reference", self.reference())?; - f.property("shift", self.shift())?; - f.child("encoded", self.encoded()) + fn logical_validity(&self) -> LogicalValidity { + self.encoded().with_dyn(|a| a.logical_validity()) } } -impl StatsCompute for FoRArray { - fn compute(&self, _stat: &Stat) -> VortexResult { - Ok(StatsSet::default()) +impl ArrayFlatten for FoRArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + decompress(self).map(Flattened::Primitive) } } -#[derive(Debug)] -pub struct FoREncoding; - -impl FoREncoding { - pub const ID: EncodingId = EncodingId::new("fastlanes.for"); +impl AcceptArrayVisitor for FoRArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("encoded", &self.encoded()) + } } -impl Encoding for FoREncoding { - fn id(&self) -> EncodingId { - Self::ID - } +impl ArrayStatisticsCompute for FoRArray<'_> {} - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) +impl ArrayTrait for FoRArray<'_> { + fn len(&self) -> usize { + self.encoded().len() } - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) + fn nbytes(&self) -> usize { + self.reference().nbytes() + self.encoded().nbytes() } } diff --git a/vortex-fastlanes/src/for/serde.rs b/vortex-fastlanes/src/for/serde.rs deleted file mode 100644 index 23aacd10dd..0000000000 --- a/vortex-fastlanes/src/for/serde.rs +++ /dev/null @@ -1,64 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{FoRArray, FoREncoding}; - -impl ArraySerde for FoRArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.scalar(self.reference())?; - ctx.write_usize(self.shift() as usize)?; - ctx.write(self.encoded()) - } - - fn metadata(&self) -> VortexResult>> { - let mut vec = Vec::new(); - let mut ctx = WriteCtx::new(&mut vec); - ctx.scalar(self.reference())?; - ctx.write_usize(self.shift() as usize)?; - Ok(Some(vec)) - } -} - -impl EncodingSerde for FoREncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let reference = ctx.scalar()?; - let shift = ctx.read_usize()? as u8; - let child = ctx.read()?; - Ok(FoRArray::try_new(child, reference, shift) - .unwrap() - .into_array()) - } -} - -#[cfg(test)] -mod test { - - use vortex::array::IntoArray; - use vortex::array::{Array, ArrayRef}; - use vortex::scalar::Scalar; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - use crate::FoRArray; - - fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } - - #[test] - fn roundtrip() { - let arr = FoRArray::try_new( - vec![-7i64, -13, 17, 23].into_array(), - >::into(-7i64), - 2, - ) - .unwrap(); - roundtrip_array(&arr).unwrap(); - } -} diff --git a/vortex-fastlanes/src/lib.rs b/vortex-fastlanes/src/lib.rs index 198d1c6a31..fecc6d3c31 100644 --- a/vortex-fastlanes/src/lib.rs +++ b/vortex-fastlanes/src/lib.rs @@ -3,21 +3,8 @@ pub use bitpacking::*; pub use delta::*; -pub use downcast::*; -use linkme::distributed_slice; pub use r#for::*; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod bitpacking; mod delta; -mod downcast; mod r#for; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_FL_BITPACKING: EncodingRef = &BitPackedEncoding; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_FL_DELTA: EncodingRef = &DeltaEncoding; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_FL_FOR: EncodingRef = &FoREncoding; diff --git a/vortex-flatbuffers/src/lib.rs b/vortex-flatbuffers/src/lib.rs index 6928de8ce8..8278d68274 100644 --- a/vortex-flatbuffers/src/lib.rs +++ b/vortex-flatbuffers/src/lib.rs @@ -1,11 +1,11 @@ -mod serde; - use std::io; use std::io::{Read, Write}; use flatbuffers::{root, FlatBufferBuilder, Follow, Verifiable, WIPOffset}; use vortex_error::{vortex_err, VortexResult}; +pub trait FlatBufferRoot {} + pub trait ReadFlatBuffer: Sized { type Source<'a>; type Error; @@ -23,17 +23,15 @@ pub trait WriteFlatBuffer { } pub trait FlatBufferToBytes { - fn flatbuffer_to_bytes(&self) -> (Vec, usize); + fn with_flatbuffer_bytes R>(&self, f: Fn) -> R; } -pub trait FlatBufferRoot {} - impl FlatBufferToBytes for F { - fn flatbuffer_to_bytes(&self) -> (Vec, usize) { + fn with_flatbuffer_bytes R>(&self, f: Fn) -> R { let mut fbb = FlatBufferBuilder::new(); let root_offset = self.write_flatbuffer(&mut fbb); fbb.finish_minimal(root_offset); - fbb.collapse() + f(fbb.finished_data()) } } diff --git a/vortex-flatbuffers/src/serde.rs b/vortex-flatbuffers/src/serde.rs deleted file mode 100644 index 0e132fe66a..0000000000 --- a/vortex-flatbuffers/src/serde.rs +++ /dev/null @@ -1,32 +0,0 @@ -#![cfg(feature = "serde")] - -use flatbuffers::{root, FlatBufferBuilder}; -use serde::de::{DeserializeSeed, Visitor}; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use vortex_flatbuffers::{ReadFlatBuffer, WriteFlatBuffer}; - -use crate::{flatbuffers as fb, DTypeSerdeContext, ReadFlatBuffer}; -use crate::{DType, WriteFlatBuffer}; - -/// Implement the `Serialize` trait by writing to a byte array. -impl Serialize for F { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let mut fbb = FlatBufferBuilder::new(); - let root = self.write_flatbuffer(&mut fbb); - fbb.finish_minimal(root); - serializer.serialize_bytes(fbb.finished_data()) - } -} - -impl<'de, F: ReadFlatBuffer> Deserialize<'de> for F { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let ctx = DTypeSerdeContext::new(vec![]); - deserializer.deserialize_bytes(DTypeDeserializer(ctx)) - } -} diff --git a/vortex-ipc/Cargo.toml b/vortex-ipc/Cargo.toml index d9f9b3dbb8..445c3460b3 100644 --- a/vortex-ipc/Cargo.toml +++ b/vortex-ipc/Cargo.toml @@ -19,7 +19,6 @@ lending-iterator = "0.1.7" nougat = "0.2.4" streaming-iterator = "0.1.9" vortex-array = { path = "../vortex-array" } -vortex-array2 = { path = "../vortex-array2" } vortex-error = { path = "../vortex-error" } vortex-flatbuffers = { path = "../vortex-flatbuffers" } vortex-schema = { path = "../vortex-schema" } diff --git a/vortex-ipc/benches/ipc_take.rs b/vortex-ipc/benches/ipc_take.rs index 279746111e..c156ec794b 100644 --- a/vortex-ipc/benches/ipc_take.rs +++ b/vortex-ipc/benches/ipc_take.rs @@ -1,9 +1,9 @@ use std::io::Cursor; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use vortex_array2::array::primitive::PrimitiveArray; -use vortex_array2::compute::take::take; -use vortex_array2::{IntoArray, SerdeContext}; +use vortex::array::primitive::PrimitiveArray; +use vortex::compute::take::take; +use vortex::{IntoArray, SerdeContext}; use vortex_ipc::iter::FallibleLendingIterator; use vortex_ipc::reader::StreamReader; use vortex_ipc::writer::StreamWriter; @@ -21,7 +21,7 @@ fn ipc_take(c: &mut Criterion) { { let mut cursor = Cursor::new(&mut buffer); let mut writer = StreamWriter::try_new(&mut cursor, SerdeContext::default()).unwrap(); - data.with_dyn(|a| writer.write(a)).unwrap(); + writer.write_array(&data).unwrap(); } c.bench_function("take_view", |b| { @@ -29,34 +29,11 @@ fn ipc_take(c: &mut Criterion) { let mut cursor = Cursor::new(&buffer); let mut reader = StreamReader::try_new(&mut cursor).unwrap(); let mut array_reader = reader.next().unwrap().unwrap(); - let array_view = array_reader.next().unwrap().unwrap().into_array(); + let array_view = array_reader.next().unwrap().unwrap(); black_box(take(&array_view, &indices)) }); }); } -// -// #[allow(dead_code)] -// fn ipc_take_old(c: &mut Criterion) { -// // Try the old way of taking data. -// let arr = PrimitiveArray::from(vec![5; 3_000_000]); -// let indices = PrimitiveArray::from(vec![10, 11, 12, 13, 100_000, 2_999_999]); -// -// let mut buffer = vec![]; -// { -// let mut cursor = Cursor::new(&mut buffer); -// let mut ctx = WriteCtx::new(&mut cursor); -// arr.serde().unwrap().write(&mut ctx).unwrap(); -// } -// -// c.bench_function("take_old", |b| { -// b.iter(|| { -// let mut cursor = Cursor::new(&buffer); -// let mut ctx = ReadCtx::new(arr.dtype(), &mut cursor); -// let arr = ctx.read().unwrap(); -// black_box(vortex::compute::take::take(arr.as_ref(), &indices).unwrap()) -// }); -// }); -// } criterion_group!(benches, ipc_take); criterion_main!(benches); diff --git a/vortex-ipc/src/chunked.rs b/vortex-ipc/src/chunked.rs deleted file mode 100644 index a088b47d20..0000000000 --- a/vortex-ipc/src/chunked.rs +++ /dev/null @@ -1,15 +0,0 @@ -use lending_iterator::prelude::*; -use vortex::array::{Array, ArrayRef}; -use vortex_error::VortexResult; -use vortex_schema::DType; - -/// Stream chunks of a Vortex array. -#[allow(dead_code)] -pub trait ArrayChunkReader: Iterator> { - fn dtype(&self) -> &DType; -} - -#[allow(dead_code)] -pub trait ArrayViewChunkReader: LendingIteratorDyn)> { - fn dtype(&self) -> &DType; -} diff --git a/vortex-ipc/src/lib.rs b/vortex-ipc/src/lib.rs index 560f9c54ad..04c31298d1 100644 --- a/vortex-ipc/src/lib.rs +++ b/vortex-ipc/src/lib.rs @@ -25,7 +25,6 @@ pub mod flatbuffers { } } -mod chunked; pub mod iter; mod messages; pub mod reader; @@ -40,10 +39,10 @@ mod tests { use std::io::{Cursor, Write}; use std::sync::Arc; - use vortex_array2::array::primitive::PrimitiveArray; - use vortex_array2::array::r#struct::StructArray; - use vortex_array2::{IntoArray, IntoArrayData}; - use vortex_array2::{SerdeContext, ToArray}; + use vortex::array::primitive::PrimitiveArray; + use vortex::array::r#struct::StructArray; + use vortex::SerdeContext; + use vortex::{IntoArray, IntoArrayData}; use crate::iter::FallibleLendingIterator; use crate::reader::StreamReader; @@ -51,7 +50,7 @@ mod tests { #[test] fn test_write_flatbuffer() { - let col = PrimitiveArray::from(vec![0, 1, 2]).into_array_data(); + let col = PrimitiveArray::from(vec![0, 1, 2]).into_array(); let nested_struct = StructArray::try_new( vec![Arc::new("x".into()), Arc::new("y".into())], vec![col.clone(), col.clone()], @@ -61,7 +60,7 @@ mod tests { let arr = StructArray::try_new( vec![Arc::new("a".into()), Arc::new("b".into())], - vec![col.clone(), nested_struct.into_array_data()], + vec![col.clone(), nested_struct.into_array()], 3, ) .unwrap() @@ -72,7 +71,7 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let ctx = SerdeContext::default(); let mut writer = StreamWriter::try_new_unbuffered(&mut cursor, ctx).unwrap(); - arr.with_dyn(|a| writer.write(a)).unwrap(); + writer.write_array(&arr).unwrap(); cursor.flush().unwrap(); cursor.set_position(0); @@ -85,7 +84,7 @@ mod tests { // Read some number of chunks from the stream. while let Some(chunk) = array_reader.next().unwrap() { println!("VIEW: {:?}", &chunk); - let _data = chunk.to_array().into_array_data(); + let _data = chunk.into_array_data(); // let taken = take(&chunk, &PrimitiveArray::from(vec![0, 3, 0, 1])).unwrap(); // let taken = taken.as_primitive().typed_data::(); // println!("Taken: {:?}", &taken); diff --git a/vortex-ipc/src/messages.rs b/vortex-ipc/src/messages.rs index 646de10da1..cec423ee6a 100644 --- a/vortex-ipc/src/messages.rs +++ b/vortex-ipc/src/messages.rs @@ -1,8 +1,8 @@ use flatbuffers::{FlatBufferBuilder, WIPOffset}; use itertools::Itertools; +use vortex::encoding::find_encoding; use vortex::flatbuffers::array as fba; -use vortex_array2::encoding::find_encoding; -use vortex_array2::{ArrayData, SerdeContext}; +use vortex::{ArrayData, SerdeContext}; use vortex_error::{vortex_err, VortexError}; use vortex_flatbuffers::{FlatBufferRoot, WriteFlatBuffer}; use vortex_schema::DType; @@ -117,14 +117,14 @@ impl<'a> WriteFlatBuffer for IPCChunk<'a> { &self, fbb: &mut FlatBufferBuilder<'fb>, ) -> WIPOffset> { - let col_data = self.1; - let array = Some(IPCArray(self.0, col_data).write_flatbuffer(fbb)); + let array_data = self.1; + let array = Some(IPCArray(self.0, array_data).write_flatbuffer(fbb)); // Walk the ColumnData depth-first to compute the buffer offsets. - let mut buffers = Vec::with_capacity(col_data.buffers().len()); + let mut buffers = vec![]; let mut offset = 0; - for col_data in col_data.depth_first_traversal() { - for buffer in col_data.buffers() { + for array_data in array_data.depth_first_traversal() { + if let Some(buffer) = array_data.buffer() { buffers.push(fb::Buffer::new( offset as u64, buffer.len() as u64, @@ -134,6 +134,8 @@ impl<'a> WriteFlatBuffer for IPCChunk<'a> { offset += aligned_size; } } + println!("CHUNK buffer_size: {}, nbuffers: {}", offset, buffers.len()); + let buffers = Some(fbb.create_vector(&buffers)); fb::Chunk::create( @@ -180,16 +182,14 @@ impl<'a> WriteFlatBuffer for IPCArray<'a> { .collect_vec(); let children = Some(fbb.create_vector(&children)); - let nbuffers = column_data.buffers().len() as u16; // TODO(ngates): checked cast - fba::Array::create( fbb, &fba::ArrayArgs { version: Default::default(), + has_buffer: column_data.buffer().is_some(), encoding, metadata, children, - nbuffers, }, ) } diff --git a/vortex-ipc/src/reader.rs b/vortex-ipc/src/reader.rs index c5cfc0c862..64b3c8e854 100644 --- a/vortex-ipc/src/reader.rs +++ b/vortex-ipc/src/reader.rs @@ -1,12 +1,14 @@ use std::io; use std::io::{BufReader, Read}; -use arrow_buffer::MutableBuffer; +use arrow_buffer::Buffer as ArrowBuffer; use nougat::gat; -use vortex::array::composite::COMPOSITE_EXTENSIONS; -use vortex_array2::buffer::Buffer; -use vortex_array2::{ArrayView, SerdeContext, ToArray}; -use vortex_error::{vortex_err, VortexError, VortexResult}; +use vortex::array::chunked::ChunkedArray; +use vortex::array::composite::VORTEX_COMPOSITE_EXTENSIONS; +use vortex::buffer::Buffer; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::{Array, ArrayView, IntoArray, OwnedArray, SerdeContext, ToArray, ToStatic}; +use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult}; use vortex_flatbuffers::{FlatBufferReader, ReadFlatBuffer}; use vortex_schema::{DType, DTypeSerdeContext}; @@ -19,11 +21,6 @@ pub struct StreamReader { pub(crate) ctx: SerdeContext, // Optionally take a projection? - - // Use replace to swap the scratch buffer. - // std::mem::replace - // We could use a cell to avoid the need for mutable borrow. - scratch: Vec, } impl StreamReader> { @@ -43,22 +40,35 @@ impl StreamReader { )?; let ctx: SerdeContext = fb_ctx.try_into()?; - Ok(Self { - read, - ctx, - scratch: Vec::with_capacity(1024), - }) + Ok(Self { read, ctx }) + } + + /// Read a single array from the IPC stream. + pub fn read_array(&mut self) -> VortexResult { + let mut array_reader = self + .next()? + .ok_or_else(|| vortex_err!(InvalidSerde: "Unexpected EOF"))?; + + let mut chunks = vec![]; + while let Some(chunk) = array_reader.next()? { + chunks.push(chunk.to_static()); + } + + if chunks.len() == 1 { + Ok(chunks[0].clone()) + } else { + ChunkedArray::try_new(chunks.into_iter().collect(), array_reader.dtype().clone()) + .map(|chunked| chunked.into_array()) + } } } -/// We implement a lending iterator here so that each StreamArrayChunkReader can be lent as -/// mutable to the caller. This is necessary because we need a mutable handle to the reader. #[gat] impl FallibleLendingIterator for StreamReader { type Error = VortexError; - type Item<'next> = StreamArrayChunkReader<'next, R> where Self: 'next; + type Item<'next> = StreamArrayReader<'next, R> where Self: 'next; - fn next(&mut self) -> Result>, Self::Error> { + fn next(&mut self) -> Result>, Self::Error> { let mut fb_vec = Vec::new(); let msg = self.read.read_message::(&mut fb_vec)?; if msg.is_none() { @@ -74,7 +84,7 @@ impl FallibleLendingIterator for StreamReader { // TODO(ngates): construct this from the SerdeContext. let dtype_ctx = - DTypeSerdeContext::new(COMPOSITE_EXTENSIONS.iter().map(|e| e.id()).collect()); + DTypeSerdeContext::new(VORTEX_COMPOSITE_EXTENSIONS.iter().map(|e| e.id()).collect()); let dtype = DType::read_flatbuffer( &dtype_ctx, &schema @@ -84,7 +94,7 @@ impl FallibleLendingIterator for StreamReader { .map_err(|e| vortex_err!(InvalidSerde: "Failed to parse DType: {}", e))?; // Figure out how many columns we have and therefore how many buffers there? - Ok(Some(StreamArrayChunkReader { + Ok(Some(StreamArrayReader { read: &mut self.read, ctx: &self.ctx, dtype, @@ -95,7 +105,7 @@ impl FallibleLendingIterator for StreamReader { } #[allow(dead_code)] -pub struct StreamArrayChunkReader<'a, R: Read> { +pub struct StreamArrayReader<'a, R: Read> { read: &'a mut R, ctx: &'a SerdeContext, dtype: DType, @@ -103,18 +113,30 @@ pub struct StreamArrayChunkReader<'a, R: Read> { column_msg_buffer: Vec, } -impl<'a, R: Read> StreamArrayChunkReader<'a, R> { +impl<'a, R: Read> StreamArrayReader<'a, R> { pub fn dtype(&self) -> &DType { &self.dtype } + + pub fn take(&self, indices: &Array<'_>) -> VortexResult { + if !indices + .statistics() + .compute_as::(Stat::IsSorted) + .unwrap_or_default() + { + vortex_bail!("Indices must be sorted to take from IPC stream") + } + todo!() + } } #[gat] -impl<'iter, R: Read> FallibleLendingIterator for StreamArrayChunkReader<'iter, R> { +impl<'iter, R: Read> FallibleLendingIterator for StreamArrayReader<'iter, R> { type Error = VortexError; - type Item<'next> = ArrayView<'next> where Self: 'next; + type Item<'next> = Array<'next> where Self: 'next; - fn next(&mut self) -> Result>, Self::Error> { + fn next(&mut self) -> Result>, Self::Error> { + self.column_msg_buffer.clear(); let msg = self .read .read_message::(&mut self.column_msg_buffer)?; @@ -141,10 +163,20 @@ impl<'iter, R: Read> FallibleLendingIterator for StreamArrayChunkReader<'iter, R let to_kill = buffer.offset() - offset; io::copy(&mut self.read.take(to_kill), &mut io::sink()).unwrap(); - let mut bytes = MutableBuffer::with_capacity(buffer.length() as usize); - unsafe { bytes.set_len(buffer.length() as usize) } - self.read.read_exact(bytes.as_slice_mut()).unwrap(); - self.buffers.push(Buffer::Owned(bytes.into())); + let buffer_length = buffer.length(); + let mut bytes = Vec::with_capacity(buffer_length as usize); + let bytes_read = self + .read + .take(buffer.length()) + .read_to_end(&mut bytes) + .unwrap(); + if bytes_read < buffer_length as usize { + return Err(vortex_err!(InvalidSerde: "Unexpected EOF reading buffer")); + } + + let arrow_buffer = ArrowBuffer::from_vec(bytes); + assert_eq!(arrow_buffer.len(), buffer_length as usize); + self.buffers.push(Buffer::Owned(arrow_buffer)); offset = buffer.offset() + buffer.length(); } @@ -158,19 +190,6 @@ impl<'iter, R: Read> FallibleLendingIterator for StreamArrayChunkReader<'iter, R // Validate it view.to_array().with_dyn(|_| Ok::<(), VortexError>(()))?; - Ok(Some(view)) + Ok(Some(view.into_array())) } } - -/// FIXME(ngates): this exists to detach the lifetimes of the object as read by read_flatbuffer. -/// We should be able to fix that. -pub fn read_into(read: &mut R, buffer: &mut Vec) -> VortexResult<()> { - buffer.clear(); - let mut buffer_len: [u8; 4] = [0; 4]; - // FIXME(ngates): return optional for EOF? - read.read_exact(&mut buffer_len)?; - let buffer_len = u32::from_le_bytes(buffer_len) as usize; - read.take(buffer_len as u64).read_to_end(buffer)?; - - Ok(()) -} diff --git a/vortex-ipc/src/writer.rs b/vortex-ipc/src/writer.rs index 12be942bd4..e8ddeb7229 100644 --- a/vortex-ipc/src/writer.rs +++ b/vortex-ipc/src/writer.rs @@ -1,9 +1,11 @@ use std::io::{BufWriter, Write}; use itertools::Itertools; -use vortex_array2::{ArrayTrait, SerdeContext}; +use vortex::array::chunked::ChunkedArray; +use vortex::{Array, ArrayDType, SerdeContext, ToArrayData}; use vortex_error::VortexResult; use vortex_flatbuffers::FlatBufferWriter; +use vortex_schema::DType; use crate::messages::{IPCChunk, IPCContext, IPCMessage, IPCSchema}; use crate::ALIGNMENT; @@ -27,20 +29,27 @@ impl StreamWriter { Ok(Self { write, ctx }) } - pub fn write(&mut self, array: &dyn ArrayTrait) -> VortexResult<()> { - // First, write a schema message indicating the start of an array. - self.write - .write_message(&IPCMessage::Schema(IPCSchema(array.dtype())), ALIGNMENT)?; - - // Then we write the array in batchs. - // TODO(ngates): should we do any batching ourselves? - // TODO(ngates): If it's a batched array, use those batchs. Else write the whole thing. + pub fn write_array(&mut self, array: &Array) -> VortexResult<()> { + self.write_schema(array.dtype())?; + match ChunkedArray::try_from(array) { + Ok(chunked) => { + for chunk in chunked.chunks() { + self.write_batch(&chunk)?; + } + Ok(()) + } + Err(_) => self.write_batch(array), + } + } - // For now, we write a single batch. - self.write_batch(array) + pub fn write_schema(&mut self, dtype: &DType) -> VortexResult<()> { + Ok(self + .write + .write_message(&IPCMessage::Schema(IPCSchema(dtype)), ALIGNMENT)?) } - fn write_batch(&mut self, array: &dyn ArrayTrait) -> VortexResult<()> { + pub fn write_batch(&mut self, array: &Array) -> VortexResult<()> { + // TODO(ngates): support writing from an ArrayView. let data = array.to_array_data(); let buffer_offsets = data.all_buffer_offsets(ALIGNMENT); @@ -52,13 +61,14 @@ impl StreamWriter { let mut current_offset = 0; for (buffer, &buffer_end) in data .depth_first_traversal() - .flat_map(|data| data.buffers().iter()) + .flat_map(|data| data.buffer().into_iter()) .zip_eq(buffer_offsets.iter().skip(1)) { + let buffer_len = buffer.len(); self.write.write_all(buffer.as_slice())?; - current_offset += buffer.len(); - let padding = (buffer_end as usize) - current_offset; + let padding = (buffer_end as usize) - current_offset - buffer_len; self.write.write_all(&vec![0; padding])?; + current_offset = buffer_end as usize; } Ok(()) diff --git a/vortex-ree/Cargo.toml b/vortex-ree/Cargo.toml index f12878e617..d296932351 100644 --- a/vortex-ree/Cargo.toml +++ b/vortex-ree/Cargo.toml @@ -12,15 +12,17 @@ edition = { workspace = true } rust-version = { workspace = true } [dependencies] -vortex-array = { path = "../vortex-array" } -vortex-error = { path = "../vortex-error" } -vortex-schema = { path = "../vortex-schema" } arrow-array = { workspace = true } arrow-buffer = { workspace = true } -linkme = { workspace = true } half = { workspace = true } -num-traits = { workspace = true } itertools = { workspace = true } +linkme = { workspace = true } +num-traits = { workspace = true } +paste = { workspace = true } +serde = { workspace = true } +vortex-array = { path = "../vortex-array" } +vortex-error = { path = "../vortex-error" } +vortex-schema = { path = "../vortex-schema" } [lints] workspace = true diff --git a/vortex-ree/src/compress.rs b/vortex-ree/src/compress.rs index 5c98ce2460..624ad1475d 100644 --- a/vortex-ree/src/compress.rs +++ b/vortex-ree/src/compress.rs @@ -2,35 +2,34 @@ use std::cmp::min; use itertools::Itertools; use num_traits::AsPrimitive; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::{Array, ArrayRef}; +use vortex::array::primitive::{Primitive, PrimitiveArray}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; -use vortex::encoding::Encoding; -use vortex::match_each_integer_ptype; use vortex::ptype::{match_each_native_ptype, NativePType}; -use vortex::stats::Stat; -use vortex::validity::OwnedValidity; +use vortex::stats::{ArrayStatistics, Stat}; use vortex::validity::Validity; +use vortex::ArrayDType; +use vortex::ArrayTrait; +use vortex::{match_each_integer_ptype, Array, ArrayDef, IntoArray, OwnedArray}; use vortex_error::VortexResult; +use vortex_schema::Nullability; -use crate::downcast::DowncastREE; use crate::{REEArray, REEEncoding}; impl EncodingCompression for REEEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { - if array.encoding().id() != PrimitiveEncoding.id() { + if array.encoding().id() != Primitive::ID { return None; } let avg_run_length = array.len() as f32 / array - .stats() - .get_or_compute_or::(array.len(), &Stat::RunCount) as f32; + .statistics() + .compute_as(Stat::RunCount) + .unwrap_or(array.len()) as f32; if avg_run_length < config.ree_average_run_threshold { return None; } @@ -40,48 +39,55 @@ impl EncodingCompression for REEEncoding { fn compress( &self, - array: &dyn Array, - like: Option<&dyn Array>, + array: &Array, + like: Option<&Array>, ctx: CompressCtx, - ) -> VortexResult { - let ree_like = like.map(|like_arr| like_arr.as_ree()); + ) -> VortexResult { + let ree_like = like.map(|like_arr| REEArray::try_from(like_arr).unwrap()); + let ree_like_ref = ree_like.as_ref(); let primitive_array = array.as_primitive(); - let (ends, values) = ree_encode(primitive_array); + let (ends, values) = ree_encode(&primitive_array); let compressed_ends = ctx .auxiliary("ends") - .compress(&ends, ree_like.map(|ree| ree.ends()))?; - let compressed_values = ctx - .named("values") - .excluding(&REEEncoding) - .compress(&values, ree_like.map(|ree| ree.values()))?; + .compress(ends.array(), ree_like_ref.map(|ree| ree.ends()).as_ref())?; + let compressed_values = ctx.named("values").excluding(&REEEncoding).compress( + values.array(), + ree_like_ref.map(|ree| ree.values()).as_ref(), + )?; - Ok(REEArray::new( + REEArray::try_new( compressed_ends, compressed_values, ctx.compress_validity(primitive_array.validity())?, ) - .into_array()) + .map(|a| a.into_array()) } } -pub fn ree_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) { +pub fn ree_encode<'a>(array: &PrimitiveArray) -> (PrimitiveArray<'a>, PrimitiveArray<'a>) { + let validity = if array.validity().nullability() == Nullability::NonNullable { + Validity::NonNullable + } else { + Validity::AllValid + }; match_each_native_ptype!(array.ptype(), |$P| { let (ends, values) = ree_encode_primitive(array.typed_data::<$P>()); - let mut compressed_values = PrimitiveArray::from(values).into_nullable(array.dtype().nullability()); - compressed_values.stats().set(Stat::IsConstant, false.into()); - compressed_values.stats().set(Stat::RunCount, compressed_values.len().into()); - compressed_values.stats().set_many(&array.stats(), vec![ - &Stat::Min, &Stat::Max, &Stat::IsSorted, &Stat::IsStrictSorted, - ]); + let mut compressed_values = PrimitiveArray::from_vec(values, validity); + compressed_values.statistics().set(Stat::IsConstant, false.into()); + compressed_values.statistics().set(Stat::RunCount, compressed_values.len().into()); + array.statistics().get(Stat::Min).map(|s| compressed_values.statistics().set(Stat::Min, s)); + array.statistics().get(Stat::Max).map(|s| compressed_values.statistics().set(Stat::Max, s)); + array.statistics().get(Stat::IsSorted).map(|s| compressed_values.statistics().set(Stat::IsSorted, s)); + array.statistics().get(Stat::IsStrictSorted).map(|s| compressed_values.statistics().set(Stat::IsStrictSorted, s)); let compressed_ends = PrimitiveArray::from(ends); - compressed_ends.stats().set(Stat::IsSorted, true.into()); - compressed_ends.stats().set(Stat::IsStrictSorted, true.into()); - compressed_ends.stats().set(Stat::IsConstant, false.into()); - compressed_ends.stats().set(Stat::Max, array.len().into()); - compressed_ends.stats().set(Stat::RunCount, compressed_ends.len().into()); + compressed_ends.statistics().set(Stat::IsSorted, true.into()); + compressed_ends.statistics().set(Stat::IsStrictSorted, true.into()); + compressed_ends.statistics().set(Stat::IsConstant, false.into()); + compressed_ends.statistics().set(Stat::Max, array.len().into()); + compressed_ends.statistics().set(Stat::RunCount, compressed_ends.len().into()); assert_eq!(array.dtype(), compressed_values.dtype()); (compressed_ends, compressed_values) @@ -113,16 +119,16 @@ fn ree_encode_primitive(elements: &[T]) -> (Vec, Vec) { (ends, values) } -pub fn ree_decode( +pub fn ree_decode<'a>( ends: &PrimitiveArray, values: &PrimitiveArray, - validity: Option, + validity: Validity, offset: usize, length: usize, -) -> VortexResult { +) -> VortexResult> { match_each_native_ptype!(values.ptype(), |$P| { match_each_integer_ptype!(ends.ptype(), |$E| { - Ok(PrimitiveArray::from_nullable(ree_decode_primitive( + Ok(PrimitiveArray::from_vec(ree_decode_primitive( ends.typed_data::<$E>(), values.typed_data::<$P>(), offset, @@ -154,12 +160,10 @@ pub fn ree_decode_primitive + Ord, T: Native #[cfg(test)] mod test { - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; - use vortex::array::{Array, IntoArray}; + use vortex::validity::ArrayValidity; use vortex::validity::Validity; - use vortex::validity::{ArrayValidity, OwnedValidity}; - use vortex::view::ToOwnedView; + use vortex::{ArrayTrait, IntoArray}; use crate::compress::{ree_decode, ree_encode}; use crate::REEArray; @@ -177,7 +181,7 @@ mod test { fn decode() { let ends = PrimitiveArray::from(vec![2, 5, 10]); let values = PrimitiveArray::from(vec![1i32, 2, 3]); - let decoded = ree_decode(&ends, &values, None, 0, 10).unwrap(); + let decoded = ree_decode(&ends, &values, Validity::NonNullable, 0, 10).unwrap(); assert_eq!( decoded.typed_data::(), @@ -193,16 +197,17 @@ mod test { validity[7] = false; Validity::from(validity) }; - let arr = REEArray::new( + let arr = REEArray::try_new( vec![2u32, 5, 10].into_array(), - vec![1i32, 2, 3].into_array(), - Some(validity), - ); + PrimitiveArray::from_vec(vec![1i32, 2, 3], Validity::AllValid).into_array(), + validity, + ) + .unwrap(); let decoded = ree_decode( - arr.ends().as_primitive(), - arr.values().as_primitive(), - arr.validity().to_owned_view(), + &arr.ends().into_primitive(), + &arr.values().into_primitive(), + arr.validity(), 0, arr.len(), ) @@ -213,7 +218,7 @@ mod test { vec![1i32, 1, 2, 2, 2, 3, 3, 3, 3, 3].as_slice() ); assert_eq!( - decoded.logical_validity(), + decoded.logical_validity().into_validity(), Validity::from(vec![ true, true, false, true, true, true, true, false, true, true, ]) diff --git a/vortex-ree/src/compute.rs b/vortex-ree/src/compute.rs index 4e06724f2e..034598f7bf 100644 --- a/vortex-ree/src/compute.rs +++ b/vortex-ree/src/compute.rs @@ -1,24 +1,15 @@ use vortex::array::primitive::PrimitiveArray; -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{flatten, flatten_primitive, FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; -use vortex::match_each_integer_ptype; use vortex::scalar::Scalar; -use vortex::validity::OwnedValidity; -use vortex::view::ToOwnedView; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; +use vortex::{match_each_integer_ptype, Array, IntoArray, OwnedArray}; +use vortex_error::VortexResult; -use crate::compress::ree_decode; use crate::REEArray; -impl ArrayCompute for REEArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - +impl ArrayCompute for REEArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -32,38 +23,15 @@ impl ArrayCompute for REEArray { } } -impl FlattenFn for REEArray { - fn flatten(&self) -> VortexResult { - let ends = flatten(self.ends())?; - let FlattenedArray::Primitive(pends) = ends else { - vortex_bail!("REE Ends array didn't flatten to primitive",); - }; - - let values = flatten(self.values())?; - if let FlattenedArray::Primitive(pvalues) = values { - ree_decode( - &pends, - &pvalues, - self.validity().to_owned_view(), - self.offset(), - self.len(), - ) - .map(FlattenedArray::Primitive) - } else { - Err(vortex_err!("Cannot yet flatten non-primitive REE array")) - } - } -} - -impl ScalarAtFn for REEArray { +impl ScalarAtFn for REEArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - scalar_at(self.values(), self.find_physical_index(index)?) + scalar_at(&self.values(), self.find_physical_index(index)?) } } -impl TakeFn for REEArray { - fn take(&self, indices: &dyn Array) -> VortexResult { - let primitive_indices = flatten_primitive(indices)?; +impl TakeFn for REEArray<'_> { + fn take(&self, indices: &Array) -> VortexResult { + let primitive_indices = indices.clone().flatten_primitive()?; let physical_indices = match_each_integer_ptype!(primitive_indices.ptype(), |$P| { primitive_indices .typed_data::<$P>() @@ -74,20 +42,21 @@ impl TakeFn for REEArray { }) .collect::>>()? }); - take(self.values(), &PrimitiveArray::from(physical_indices)) + take( + &self.values(), + PrimitiveArray::from(physical_indices).array(), + ) } } -impl SliceFn for REEArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for REEArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { let slice_begin = self.find_physical_index(start)?; let slice_end = self.find_physical_index(stop)?; Ok(REEArray::with_offset_and_size( - slice(self.ends(), slice_begin, slice_end + 1)?, - slice(self.values(), slice_begin, slice_end + 1)?, - self.validity() - .map(|v| v.slice(slice_begin, slice_end + 1)) - .transpose()?, + slice(&self.ends(), slice_begin, slice_end + 1)?, + slice(&self.values(), slice_begin, slice_end + 1)?, + self.validity().slice(slice_begin, slice_end + 1)?, stop - start, start, )? @@ -97,19 +66,22 @@ impl SliceFn for REEArray { #[cfg(test)] mod test { - use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::compute::take::take; + use vortex::ToArray; use crate::REEArray; #[test] fn ree_take() { - let ree = REEArray::encode(&PrimitiveArray::from(vec![ - 1, 1, 1, 4, 4, 4, 2, 2, 5, 5, 5, 5, - ])) + let ree = REEArray::encode( + PrimitiveArray::from(vec![1, 1, 1, 4, 4, 4, 2, 2, 5, 5, 5, 5]).to_array(), + ) .unwrap(); - let taken = take(&ree, &PrimitiveArray::from(vec![9, 8, 1, 3])).unwrap(); - assert_eq!(taken.as_primitive().typed_data::(), &[5, 5, 1, 4]); + let taken = take(ree.array(), PrimitiveArray::from(vec![9, 8, 1, 3]).array()).unwrap(); + assert_eq!( + taken.flatten_primitive().unwrap().typed_data::(), + &[5, 5, 1, 4] + ); } } diff --git a/vortex-ree/src/downcast.rs b/vortex-ree/src/downcast.rs deleted file mode 100644 index 2f5dbd6ec2..0000000000 --- a/vortex-ree/src/downcast.rs +++ /dev/null @@ -1,31 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::REEArray; - -mod private { - pub trait Sealed {} -} - -pub trait DowncastREE: private::Sealed { - fn maybe_ree(&self) -> Option<&REEArray>; - - fn as_ree(&self) -> &REEArray { - self.maybe_ree().unwrap() - } -} - -impl private::Sealed for dyn Array + '_ {} - -impl DowncastREE for dyn Array + '_ { - fn maybe_ree(&self) -> Option<&REEArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastREE for ArrayRef { - fn maybe_ree(&self) -> Option<&REEArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-ree/src/lib.rs b/vortex-ree/src/lib.rs index 5e95e56999..1454f5e018 100644 --- a/vortex-ree/src/lib.rs +++ b/vortex-ree/src/lib.rs @@ -1,12 +1,5 @@ -use linkme::distributed_slice; pub use ree::*; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod compress; mod compute; -mod downcast; mod ree; -mod serde; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_REE: EncodingRef = &REEEncoding; diff --git a/vortex-ree/src/ree.rs b/vortex-ree/src/ree.rs index c87d112d83..971b264da8 100644 --- a/vortex-ree/src/ree.rs +++ b/vortex-ree/src/ree.rs @@ -1,213 +1,162 @@ -use std::sync::{Arc, RwLock}; - -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; +use serde::{Deserialize, Serialize}; +use vortex::array::primitive::{Primitive, PrimitiveArray}; use vortex::compute::scalar_at::scalar_at; use vortex::compute::search_sorted::{search_sorted, SearchSortedSide}; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; -use vortex::validity::Validity; -use vortex::validity::{OwnedValidity, ValidityView}; -use vortex::view::{AsView, ToOwnedView}; -use vortex::{impl_array, ArrayWalker}; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::DType; +use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute}; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, IntoArrayData}; +use vortex_error::{vortex_bail, VortexResult}; + +use crate::compress::{ree_decode, ree_encode}; -use crate::compress::ree_encode; +impl_encoding!("vortex.ree", REE); -#[derive(Debug, Clone)] -pub struct REEArray { - ends: ArrayRef, - values: ArrayRef, - validity: Option, +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct REEMetadata { + validity: ValidityMetadata, + ends_dtype: DType, offset: usize, length: usize, - stats: Arc>, } -impl REEArray { - pub fn new(ends: ArrayRef, values: ArrayRef, validity: Option) -> Self { - Self::try_new(ends, values, validity).unwrap() - } - - pub fn try_new( - ends: ArrayRef, - values: ArrayRef, - validity: Option, - ) -> VortexResult { +impl REEArray<'_> { + pub fn try_new(ends: Array, values: Array, validity: Validity) -> VortexResult { let length: usize = scalar_at(&ends, ends.len() - 1)?.try_into()?; Self::with_offset_and_size(ends, values, validity, length, 0) } pub(crate) fn with_offset_and_size( - ends: ArrayRef, - values: ArrayRef, - validity: Option, + ends: Array, + values: Array, + validity: Validity, length: usize, offset: usize, ) -> VortexResult { - if let Some(v) = &validity { - assert_eq!(v.len(), length); + if values.dtype().is_nullable() == (validity == Validity::NonNullable) { + vortex_bail!("incorrect validity {:?}", validity); } - if !ends.stats().get_as(&Stat::IsStrictSorted).unwrap_or(true) { + if !ends + .statistics() + .get_as(Stat::IsStrictSorted) + .unwrap_or(true) + { vortex_bail!("Ends array must be strictly sorted",); } - - Ok(Self { - ends, - values, - validity, - length, + let dtype = values.dtype().clone(); + let metadata = REEMetadata { + validity: validity.to_metadata(length)?, + ends_dtype: ends.dtype().clone(), offset, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - - pub fn find_physical_index(&self, index: usize) -> VortexResult { - search_sorted(self.ends(), index + self.offset, SearchSortedSide::Right) - .map(|s| s.to_index()) - } + length, + }; - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Primitive(p) => { - let (ends, values) = ree_encode(p); - Ok(REEArray::new( - ends.into_array(), - values.into_array(), - p.validity().to_owned_view(), - ) - .into_array()) - } - _ => Err(vortex_err!("REE can only encode primitive arrays")), + let mut children = Vec::with_capacity(3); + children.push(ends.into_array_data()); + children.push(values.into_array_data()); + if let Some(a) = validity.into_array_data() { + children.push(a) } - } - #[inline] - pub fn offset(&self) -> usize { - self.offset - } - - #[inline] - pub fn ends(&self) -> &ArrayRef { - &self.ends - } - - #[inline] - pub fn values(&self) -> &ArrayRef { - &self.values - } -} - -impl Array for REEArray { - impl_array!(); - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) + Self::try_from_parts(dtype, metadata, children.into(), HashMap::new()) } - #[inline] - fn len(&self) -> usize { - self.length + pub fn find_physical_index(&self, index: usize) -> VortexResult { + search_sorted(&self.ends(), index + self.offset(), SearchSortedSide::Right) + .map(|s| s.to_index()) } - #[inline] - fn is_empty(&self) -> bool { - self.length == 0 + pub fn encode(array: Array) -> VortexResult { + if array.encoding().id() == Primitive::ID { + let primitive = PrimitiveArray::try_from(array)?; + let (ends, values) = ree_encode(&primitive); + REEArray::try_new(ends.into_array(), values.into_array(), primitive.validity()) + } else { + vortex_bail!("REE can only encode primitive arrays") + } } - #[inline] - fn dtype(&self) -> &DType { - self.values.dtype() + pub fn validity(&self) -> Validity { + self.metadata() + .validity + .to_validity(self.array().child(2, &Validity::DTYPE)) } #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) + pub fn offset(&self) -> usize { + self.metadata().offset } #[inline] - fn encoding(&self) -> EncodingRef { - &REEEncoding + pub fn ends(&self) -> Array { + self.array() + .child(0, &self.metadata().ends_dtype) + .expect("missing ends") } #[inline] - // Values and ends have been sliced to the nearest run end value so the size in bytes is accurate - fn nbytes(&self) -> usize { - self.values.nbytes() + self.ends.nbytes() + pub fn values(&self) -> Array { + self.array().child(1, self.dtype()).expect("missing values") } +} - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) +impl ArrayValidity for REEArray<'_> { + fn is_valid(&self, index: usize) -> bool { + self.validity().is_valid(index) } - fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { - walker.visit_child(self.values())?; - walker.visit_child(self.ends()) + fn logical_validity(&self) -> LogicalValidity { + self.validity().to_logical(self.len()) } } -impl OwnedValidity for REEArray { - fn validity(&self) -> Option { - self.validity.as_view() +impl ArrayFlatten for REEArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + let pends = self.ends().flatten_primitive()?; + let pvalues = self.values().flatten_primitive()?; + ree_decode(&pends, &pvalues, self.validity(), self.offset(), self.len()) + .map(Flattened::Primitive) } } -impl StatsCompute for REEArray {} - -#[derive(Debug)] -pub struct REEEncoding; - -impl REEEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.ree"); -} - -impl Encoding for REEEncoding { - fn id(&self) -> EncodingId { - Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl AcceptArrayVisitor for REEArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("ends", &self.ends())?; + visitor.visit_child("values", &self.values())?; + visitor.visit_validity(&self.validity()) } } -impl ArrayDisplay for REEArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.child("values", self.values())?; - f.child("ends", self.ends()) +impl ArrayStatisticsCompute for REEArray<'_> {} + +impl ArrayTrait for REEArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } #[cfg(test)] mod test { - use vortex::array::Array; - use vortex::array::IntoArray; - use vortex::compute::flatten::flatten_primitive; use vortex::compute::scalar_at::scalar_at; use vortex::compute::slice::slice; + use vortex::validity::Validity; + use vortex::{ArrayDType, ArrayTrait, IntoArray}; use vortex_schema::{DType, IntWidth, Nullability, Signedness}; use crate::REEArray; #[test] fn new() { - let arr = REEArray::new( + let arr = REEArray::try_new( vec![2u32, 5, 10].into_array(), vec![1i32, 2, 3].into_array(), - None, - ); + Validity::NonNullable, + ) + .unwrap(); assert_eq!(arr.len(), 10); assert_eq!( arr.dtype(), @@ -217,20 +166,22 @@ mod test { // 0, 1 => 1 // 2, 3, 4 => 2 // 5, 6, 7, 8, 9 => 3 - assert_eq!(scalar_at(&arr, 0).unwrap(), 1.into()); - assert_eq!(scalar_at(&arr, 2).unwrap(), 2.into()); - assert_eq!(scalar_at(&arr, 5).unwrap(), 3.into()); - assert_eq!(scalar_at(&arr, 9).unwrap(), 3.into()); + assert_eq!(scalar_at(arr.array(), 0).unwrap(), 1.into()); + assert_eq!(scalar_at(arr.array(), 2).unwrap(), 2.into()); + assert_eq!(scalar_at(arr.array(), 5).unwrap(), 3.into()); + assert_eq!(scalar_at(arr.array(), 9).unwrap(), 3.into()); } #[test] fn slice_array() { let arr = slice( - &REEArray::new( + REEArray::try_new( vec![2u32, 5, 10].into_array(), vec![1i32, 2, 3].into_array(), - None, - ), + Validity::NonNullable, + ) + .unwrap() + .array(), 3, 8, ) @@ -242,20 +193,24 @@ mod test { assert_eq!(arr.len(), 5); assert_eq!( - flatten_primitive(&arr).unwrap().typed_data::(), + arr.flatten_primitive().unwrap().typed_data::(), vec![2, 2, 3, 3, 3] ); } #[test] fn flatten() { - let arr = REEArray::new( + let arr = REEArray::try_new( vec![2u32, 5, 10].into_array(), vec![1i32, 2, 3].into_array(), - None, - ); + Validity::NonNullable, + ) + .unwrap(); assert_eq!( - flatten_primitive(&arr).unwrap().typed_data::(), + arr.into_array() + .flatten_primitive() + .unwrap() + .typed_data::(), vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3] ); } diff --git a/vortex-ree/src/serde.rs b/vortex-ree/src/serde.rs deleted file mode 100644 index d26c0ca8f6..0000000000 --- a/vortex-ree/src/serde.rs +++ /dev/null @@ -1,75 +0,0 @@ -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex::validity::OwnedValidity; -use vortex_error::VortexResult; - -use crate::{REEArray, REEEncoding}; - -impl ArraySerde for REEArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - ctx.write_usize(self.offset())?; - ctx.write_validity(self.validity())?; - // TODO(robert): Stop writing this - ctx.dtype(self.ends().dtype())?; - ctx.write(self.ends())?; - ctx.write(self.values()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for REEEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let length = ctx.read_usize()?; - let offset = ctx.read_usize()?; - let validity = ctx.read_validity()?; - let ends_dtype = ctx.dtype()?; - let ends = ctx.with_schema(&ends_dtype).read()?; - let values = ctx.read()?; - Ok(REEArray::with_offset_and_size(ends, values, validity, length, offset)?.into_array()) - } -} - -#[cfg(test)] -mod test { - use vortex::array::downcast::DowncastArrayBuiltin; - use vortex::array::IntoArray; - use vortex::array::{Array, ArrayRef}; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - use crate::downcast::DowncastREE; - use crate::REEArray; - - fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } - - #[test] - fn roundtrip() { - let arr = REEArray::new( - vec![0u8, 9, 20, 32, 49].into_array(), - vec![-7i64, -13, 17, 23].into_array(), - None, - ); - let read_arr = roundtrip_array(&arr).unwrap(); - let read_ree = read_arr.as_ree(); - - assert_eq!( - arr.ends().as_primitive().typed_data::(), - read_ree.ends().as_primitive().typed_data::() - ); - assert_eq!( - arr.values().as_primitive().typed_data::(), - read_ree.values().as_primitive().typed_data::() - ); - } -} diff --git a/vortex-roaring/Cargo.toml b/vortex-roaring/Cargo.toml index 68b60f7bfa..8921cef7e6 100644 --- a/vortex-roaring/Cargo.toml +++ b/vortex-roaring/Cargo.toml @@ -20,6 +20,8 @@ linkme = { workspace = true } croaring = { workspace = true } num-traits = { workspace = true } log = { workspace = true } +serde = { workspace = true } +paste = { workspace = true } [lints] workspace = true diff --git a/vortex-roaring/src/boolean/compress.rs b/vortex-roaring/src/boolean/compress.rs index b1bff86b4b..ea5e3ebe15 100644 --- a/vortex-roaring/src/boolean/compress.rs +++ b/vortex-roaring/src/boolean/compress.rs @@ -1,22 +1,22 @@ use croaring::Bitmap; -use vortex::array::bool::{BoolArray, BoolEncoding}; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::{Array, ArrayRef}; +use vortex::array::bool::BoolArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray}; use vortex_error::VortexResult; use vortex_schema::DType; use vortex_schema::Nullability::NonNullable; -use crate::boolean::{RoaringBoolArray, RoaringBoolEncoding}; +use crate::boolean::RoaringBoolArray; +use crate::{OwnedRoaringBoolArray, RoaringBool, RoaringBoolEncoding}; impl EncodingCompression for RoaringBoolEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support bool enc arrays - if array.encoding().id() != BoolEncoding::ID { + if array.encoding().id() != RoaringBool::ID { return None; } @@ -34,19 +34,19 @@ impl EncodingCompression for RoaringBoolEncoding { fn compress( &self, - array: &dyn Array, - _like: Option<&dyn Array>, + array: &Array, + _like: Option<&Array>, _ctx: CompressCtx, - ) -> VortexResult { - Ok(roaring_encode(array.as_bool()).into_array()) + ) -> VortexResult { + roaring_encode(array.clone().flatten_bool()?).map(move |a| a.into_array()) } } -pub fn roaring_encode(bool_array: &BoolArray) -> RoaringBoolArray { +pub fn roaring_encode(bool_array: BoolArray) -> VortexResult { let mut bitmap = Bitmap::new(); bitmap.extend( bool_array - .buffer() + .boolean_buffer() .iter() .enumerate() .filter(|(_, b)| *b) @@ -55,5 +55,5 @@ pub fn roaring_encode(bool_array: &BoolArray) -> RoaringBoolArray { bitmap.run_optimize(); bitmap.shrink_to_fit(); - RoaringBoolArray::new(bitmap, bool_array.buffer().len()) + RoaringBoolArray::try_new(bitmap, bool_array.len()) } diff --git a/vortex-roaring/src/boolean/compute.rs b/vortex-roaring/src/boolean/compute.rs index bc741fe8e3..272a9aff11 100644 --- a/vortex-roaring/src/boolean/compute.rs +++ b/vortex-roaring/src/boolean/compute.rs @@ -1,23 +1,14 @@ -use arrow_buffer::{BooleanBuffer, Buffer}; use croaring::Bitmap; -use vortex::array::bool::BoolArray; -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::slice::SliceFn; use vortex::compute::ArrayCompute; -use vortex::scalar::{AsBytes, Scalar}; -use vortex::validity::Validity; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::Nullability; +use vortex::scalar::Scalar; +use vortex::{IntoArray, OwnedArray}; +use vortex_error::VortexResult; use crate::RoaringBoolArray; -impl ArrayCompute for RoaringBoolArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } - +impl ArrayCompute for RoaringBoolArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -27,30 +18,9 @@ impl ArrayCompute for RoaringBoolArray { } } -impl FlattenFn for RoaringBoolArray { - fn flatten(&self) -> VortexResult { - // TODO(ngates): benchmark the fastest conversion from BitMap. - // Via bitset requires two copies. - let bitset = self - .bitmap - .to_bitset() - .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; - - let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; - let buffer = Buffer::from_slice_ref(bytes); - Ok(FlattenedArray::Bool(BoolArray::new( - BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), - match self.nullability() { - Nullability::NonNullable => None, - Nullability::Nullable => Some(Validity::Valid(self.len())), - }, - ))) - } -} - -impl ScalarAtFn for RoaringBoolArray { +impl ScalarAtFn for RoaringBoolArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - if self.bitmap.contains(index as u32) { + if self.bitmap().contains(index as u32) { Ok(true.into()) } else { Ok(false.into()) @@ -58,11 +28,11 @@ impl ScalarAtFn for RoaringBoolArray { } } -impl SliceFn for RoaringBoolArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for RoaringBoolArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { let slice_bitmap = Bitmap::from_range(start as u32..stop as u32); - let bitmap = self.bitmap.and(&slice_bitmap).add_offset(-(start as i64)); + let bitmap = self.bitmap().and(&slice_bitmap).add_offset(-(start as i64)); - Ok(RoaringBoolArray::new(bitmap, stop - start).into_array()) + RoaringBoolArray::try_new(bitmap, stop - start).map(|a| a.into_array()) } } diff --git a/vortex-roaring/src/boolean/mod.rs b/vortex-roaring/src/boolean/mod.rs index cbef387c7d..80f4b65701 100644 --- a/vortex-roaring/src/boolean/mod.rs +++ b/vortex-roaring/src/boolean/mod.rs @@ -1,99 +1,67 @@ -use std::sync::{Arc, RwLock}; - +use arrow_buffer::BooleanBuffer; +use arrow_buffer::Buffer as ArrowBuffer; use compress::roaring_encode; -use croaring::{Bitmap, Native}; -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::DType; +use croaring::{Bitmap, Portable}; +use serde::{Deserialize, Serialize}; +use vortex::array::bool::{Bool, BoolArray}; +use vortex::buffer::Buffer; +use vortex::scalar::AsBytes; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, OwnedArray}; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; +use vortex_schema::Nullability; use vortex_schema::Nullability::NonNullable; +use Nullability::Nullable; mod compress; mod compute; -mod serde; -mod stats; -#[derive(Debug, Clone)] -pub struct RoaringBoolArray { - bitmap: Bitmap, +impl_encoding!("vortex.roaring_bool", RoaringBool); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RoaringBoolMetadata { length: usize, - stats: Arc>, } -impl RoaringBoolArray { - pub fn new(bitmap: Bitmap, length: usize) -> Self { - Self { - bitmap, - length, - stats: Arc::new(RwLock::new(StatsSet::new())), +impl RoaringBoolArray<'_> { + pub fn try_new(bitmap: Bitmap, length: usize) -> VortexResult { + if length < bitmap.cardinality() as usize { + vortex_bail!("RoaringBoolArray length is less than bitmap cardinality") + } else { + Ok(Self { + typed: TypedArray::try_from_parts( + DType::Bool(NonNullable), + RoaringBoolMetadata { length }, + Some(Buffer::Owned(bitmap.serialize::().into())), + vec![].into(), + HashMap::default(), + )?, + }) } } - pub fn bitmap(&self) -> &Bitmap { - &self.bitmap + pub fn bitmap(&self) -> Bitmap { + //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + Bitmap::deserialize::( + self.array() + .buffer() + .expect("RoaringBoolArray buffer is missing") + .as_slice(), + ) } - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Bool(p) => Ok(roaring_encode(p)), - _ => Err(vortex_err!("RoaringBool can only encode bool arrays")), + pub fn encode(array: OwnedArray) -> VortexResult { + if array.encoding().id() == Bool::ID { + roaring_encode(BoolArray::try_from(array)?).map(|a| a.into_array()) + } else { + Err(vortex_err!("RoaringInt can only encode boolean arrays")) } } } - -impl Array for RoaringBoolArray { - impl_array!(); - #[inline] - fn len(&self) -> usize { - self.length - } - - #[inline] - fn is_empty(&self) -> bool { - self.length == 0 - } - - #[inline] - fn dtype(&self) -> &DType { - &DType::Bool(NonNullable) - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &RoaringBoolEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - // TODO(ngates): do we want Native serializer? Or portable? Or frozen? - self.bitmap.get_serialized_size_in_bytes::() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { +impl AcceptArrayVisitor for RoaringBoolArray<'_> { + fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { // TODO(ngates): should we store a buffer in memory? Or delay serialization? // Or serialize into metadata? The only reason we support buffers is so we can write to // the wire without copying into FlatBuffers. But if we need to allocate to serialize @@ -102,59 +70,64 @@ impl Array for RoaringBoolArray { } } -impl ArrayValidity for RoaringBoolArray { - fn logical_validity(&self) -> Validity { - Validity::Valid(self.len()) - } - - fn is_valid(&self, _index: usize) -> bool { - true - } -} - -impl ArrayDisplay for RoaringBoolArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("bitmap", format!("{:?}", self.bitmap())) +impl ArrayTrait for RoaringBoolArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } -#[derive(Debug)] -pub struct RoaringBoolEncoding; - -impl RoaringBoolEncoding { - pub const ID: EncodingId = EncodingId::new("roaring.bool"); -} +impl ArrayStatisticsCompute for RoaringBoolArray<'_> {} -impl Encoding for RoaringBoolEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl ArrayValidity for RoaringBoolArray<'_> { + fn logical_validity(&self) -> LogicalValidity { + LogicalValidity::AllValid(self.len()) } - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) + fn is_valid(&self, _index: usize) -> bool { + true } +} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayFlatten for RoaringBoolArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + // TODO(ngates): benchmark the fastest conversion from BitMap. + // Via bitset requires two copies. + let bitset = self + .bitmap() + .to_bitset() + .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; + + let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; + let buffer = ArrowBuffer::from_slice_ref(bytes); + Ok(Flattened::Bool(BoolArray::try_new( + BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), + match self.dtype().nullability() { + NonNullable => Validity::NonNullable, + Nullable => Validity::AllValid, + }, + )?)) } } #[cfg(test)] mod test { use vortex::array::bool::BoolArray; - use vortex::array::Array; use vortex::compute::scalar_at::scalar_at; use vortex::scalar::Scalar; + use vortex::IntoArray; use vortex_error::VortexResult; use crate::RoaringBoolArray; #[test] pub fn iter() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, false, true, true]); - let array = RoaringBoolArray::encode(bool)?; - - let values = array.bitmap().to_vec(); + let bool: BoolArray = BoolArray::from(vec![true, false, true, true]); + let array = RoaringBoolArray::encode(bool.into_array())?; + let round_trip = RoaringBoolArray::try_from(array.clone())?; + let values = round_trip.bitmap().to_vec(); assert_eq!(values, vec![0, 2, 3]); Ok(()) @@ -162,8 +135,8 @@ mod test { #[test] pub fn test_scalar_at() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, false, true, true]); - let array = RoaringBoolArray::encode(bool)?; + let bool: BoolArray = BoolArray::from(vec![true, false, true, true]); + let array = RoaringBoolArray::encode(bool.into_array())?; let truthy: Scalar = true.into(); let falsy: Scalar = false.into(); diff --git a/vortex-roaring/src/boolean/serde.rs b/vortex-roaring/src/boolean/serde.rs deleted file mode 100644 index ed246594ad..0000000000 --- a/vortex-roaring/src/boolean/serde.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::io; -use std::io::ErrorKind; - -use croaring::{Bitmap, Portable}; -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{RoaringBoolArray, RoaringBoolEncoding}; - -impl ArraySerde for RoaringBoolArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - let mut data = Vec::new(); - self.bitmap().serialize_into::(&mut data); - ctx.write_slice(data.as_slice()) - } - - fn metadata(&self) -> VortexResult>> { - todo!() - } -} - -impl EncodingSerde for RoaringBoolEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let bitmap_data = ctx.read_slice()?; - Ok(RoaringBoolArray::new( - Bitmap::try_deserialize::(bitmap_data.as_slice()) - .ok_or(io::Error::new(ErrorKind::InvalidData, "invalid bitmap"))?, - len, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use croaring::Bitmap; - - use crate::downcast::DowncastRoaring; - use crate::serde_tests::test::roundtrip_array; - use crate::RoaringBoolArray; - - #[test] - fn roundtrip() { - let arr = RoaringBoolArray::new(Bitmap::from_range(245..63000), 65536); - let read_arr = roundtrip_array(&arr).unwrap(); - - let read_roaring = read_arr.as_roaring_bool(); - assert_eq!(arr.bitmap(), read_roaring.bitmap()); - } -} diff --git a/vortex-roaring/src/boolean/stats.rs b/vortex-roaring/src/boolean/stats.rs deleted file mode 100644 index aefcdbf82c..0000000000 --- a/vortex-roaring/src/boolean/stats.rs +++ /dev/null @@ -1,111 +0,0 @@ -use vortex::array::Array; -use vortex::stats::{Stat, StatsCompute, StatsSet}; -use vortex_error::VortexResult; - -use crate::boolean::RoaringBoolArray; - -impl StatsCompute for RoaringBoolArray { - fn compute(&self, stat: &Stat) -> VortexResult { - let cardinality = self.bitmap().cardinality() as usize; - if let Some(value) = match stat { - Stat::IsConstant => Some((cardinality == self.len() || cardinality == 0).into()), - Stat::Max => { - if self.len() > 0 { - Some((cardinality > 0).into()) - } else { - None - } - } - Stat::Min => { - if self.len() > 0 { - Some((cardinality == self.len()).into()) - } else { - None - } - } - Stat::TrueCount => Some(cardinality.into()), - Stat::NullCount => Some(0.into()), - _ => None, - } { - Ok(StatsSet::of(*stat, value)) - } else { - Ok(StatsSet::default()) - } - } -} - -#[cfg(test)] -mod test { - use vortex::array::bool::BoolArray; - use vortex::array::Array; - use vortex::stats::Stat::*; - use vortex_error::VortexResult; - - use crate::RoaringBoolArray; - - #[test] - pub fn stats_all_true() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, true]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(true) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(true)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(true)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 2 - ); - - Ok(()) - } - - #[test] - pub fn stats_all_false() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![false, false]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(true) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(false)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(false)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 0 - ); - - Ok(()) - } - - #[test] - pub fn stats_mixed() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![false, true, true]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(false) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(false)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(true)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 2 - ); - - Ok(()) - } -} diff --git a/vortex-roaring/src/downcast.rs b/vortex-roaring/src/downcast.rs deleted file mode 100644 index 2ad13158e5..0000000000 --- a/vortex-roaring/src/downcast.rs +++ /dev/null @@ -1,46 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::{RoaringBoolArray, RoaringIntArray}; - -mod private { - pub trait Sealed {} -} - -#[allow(dead_code)] -pub trait DowncastRoaring: private::Sealed { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray>; - - fn as_roaring_int(&self) -> &RoaringIntArray { - self.maybe_roaring_int().unwrap() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray>; - - fn as_roaring_bool(&self) -> &RoaringBoolArray { - self.maybe_roaring_bool().unwrap() - } -} - -impl private::Sealed for dyn Array {} - -impl DowncastRoaring for dyn Array { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray> { - self.as_any().downcast_ref() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastRoaring for ArrayRef { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray> { - self.as_any().downcast_ref() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-roaring/src/integer/compress.rs b/vortex-roaring/src/integer/compress.rs index ec978f3db6..e342c400a9 100644 --- a/vortex-roaring/src/integer/compress.rs +++ b/vortex-roaring/src/integer/compress.rs @@ -1,27 +1,26 @@ use croaring::Bitmap; use log::debug; use num_traits::NumCast; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::{Array, ArrayRef}; +use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::ptype::{NativePType, PType}; -use vortex::stats::Stat; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::{Array, ArrayDType, ArrayDef, IntoArray, OwnedArray, ToStatic}; use vortex_error::VortexResult; use vortex_schema::DType; use vortex_schema::Nullability::NonNullable; use vortex_schema::Signedness::Unsigned; -use crate::{RoaringIntArray, RoaringIntEncoding}; +use crate::{OwnedRoaringIntArray, RoaringInt, RoaringIntArray, RoaringIntEncoding}; impl EncodingCompression for RoaringIntEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive enc arrays - if array.encoding().id() != PrimitiveEncoding::ID { + if array.encoding().id() != RoaringInt::ID { return None; } @@ -33,14 +32,20 @@ impl EncodingCompression for RoaringIntEncoding { // Only support sorted unique arrays if !array - .stats() - .get_or_compute_or(false, &Stat::IsStrictSorted) + .statistics() + .compute_as(Stat::IsStrictSorted) + .unwrap_or(false) { debug!("Skipping roaring int, not strict sorted"); return None; } - if array.stats().get_or_compute_or(0usize, &Stat::Max) > u32::MAX as usize { + if array + .statistics() + .compute_as(Stat::Max) + .map(|s: usize| s > u32::MAX as usize) + .unwrap_or(false) + { debug!("Skipping roaring int, max is larger than {}", u32::MAX); return None; } @@ -51,25 +56,26 @@ impl EncodingCompression for RoaringIntEncoding { fn compress( &self, - array: &dyn Array, - _like: Option<&dyn Array>, + array: &Array, + _like: Option<&Array>, _ctx: CompressCtx, - ) -> VortexResult { - Ok(roaring_encode(array.as_primitive()).into_array()) + ) -> VortexResult { + let parray = array.clone().flatten_primitive()?; + Ok(roaring_encode(parray).into_array().to_static()) } } -pub fn roaring_encode(primitive_array: &PrimitiveArray) -> RoaringIntArray { - match primitive_array.ptype() { - PType::U8 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U16 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U32 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U64 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - _ => panic!("Unsupported ptype {}", primitive_array.ptype()), +pub fn roaring_encode(parray: PrimitiveArray) -> RoaringIntArray { + match parray.ptype() { + PType::U8 => roaring_encode_primitive::(parray.typed_data()), + PType::U16 => roaring_encode_primitive::(parray.typed_data()), + PType::U32 => roaring_encode_primitive::(parray.typed_data()), + PType::U64 => roaring_encode_primitive::(parray.typed_data()), + _ => panic!("Unsupported ptype {}", parray.ptype()), } } -fn roaring_encode_primitive(values: &[T]) -> RoaringIntArray { +fn roaring_encode_primitive(values: &[T]) -> OwnedRoaringIntArray { let mut bitmap = Bitmap::new(); bitmap.extend(values.iter().map(|i| i.to_u32().unwrap())); bitmap.run_optimize(); diff --git a/vortex-roaring/src/integer/compute.rs b/vortex-roaring/src/integer/compute.rs index 9ca725c680..69d87863c5 100644 --- a/vortex-roaring/src/integer/compute.rs +++ b/vortex-roaring/src/integer/compute.rs @@ -6,17 +6,17 @@ use vortex_error::VortexResult; use crate::RoaringIntArray; -impl ArrayCompute for RoaringIntArray { +impl ArrayCompute for RoaringIntArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } } -impl ScalarAtFn for RoaringIntArray { +impl ScalarAtFn for RoaringIntArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { // Unwrap since we know the index is valid - let bitmap_value = self.bitmap.select(index as u32).unwrap(); - let scalar: Scalar = match self.ptype { + let bitmap_value = self.bitmap().select(index as u32).unwrap(); + let scalar: Scalar = match self.metadata().ptype { PType::U8 => (bitmap_value as u8).into(), PType::U16 => (bitmap_value as u16).into(), PType::U32 => bitmap_value.into(), diff --git a/vortex-roaring/src/integer/mod.rs b/vortex-roaring/src/integer/mod.rs index 9a9c7c78a0..af3c3bb34a 100644 --- a/vortex-roaring/src/integer/mod.rs +++ b/vortex-roaring/src/integer/mod.rs @@ -1,34 +1,30 @@ -use std::sync::{Arc, RwLock}; - use compress::roaring_encode; -use croaring::{Bitmap, Native}; -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; +use croaring::{Bitmap, Portable}; +use serde::{Deserialize, Serialize}; +use vortex::array::primitive::{Primitive, PrimitiveArray}; +use vortex::buffer::Buffer; use vortex::ptype::PType; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayFlatten, OwnedArray}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::DType; +use vortex_schema::Nullability::NonNullable; mod compress; mod compute; -mod serde; -mod stats; -#[derive(Debug, Clone)] -pub struct RoaringIntArray { - bitmap: Bitmap, +impl_encoding!("vortex.roaring_int", RoaringInt); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RoaringIntMetadata { ptype: PType, - stats: Arc>, + // NB: this is stored because we want to avoid the overhead of deserializing the bitmap + // on every len() call. It's CRITICAL that this is kept up-to date. + length: usize, } -impl RoaringIntArray { +impl RoaringIntArray<'_> { pub fn new(bitmap: Bitmap, ptype: PType) -> Self { Self::try_new(bitmap, ptype).unwrap() } @@ -37,81 +33,46 @@ impl RoaringIntArray { if !ptype.is_unsigned_int() { vortex_bail!("RoaringInt expected unsigned int"); } - Ok(Self { - bitmap, - ptype, - stats: Arc::new(RwLock::new(StatsSet::new())), + typed: TypedArray::try_from_parts( + DType::Bool(NonNullable), + RoaringIntMetadata { + ptype, + length: bitmap.statistics().cardinality as usize, + }, + Some(Buffer::Owned(bitmap.serialize::().into())), + vec![].into(), + HashMap::default(), + )?, }) } - pub fn bitmap(&self) -> &Bitmap { - &self.bitmap + pub fn bitmap(&self) -> Bitmap { + //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + Bitmap::deserialize::( + self.array() + .buffer() + .expect("RoaringBoolArray buffer is missing") + .as_slice(), + ) } pub fn ptype(&self) -> PType { - self.ptype + self.metadata().ptype } - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Primitive(p) => Ok(roaring_encode(p)), - _ => Err(vortex_err!("RoaringInt can only encode primitive arrays")), + pub fn encode(array: OwnedArray) -> VortexResult { + if array.encoding().id() == Primitive::ID { + Ok(roaring_encode(PrimitiveArray::try_from(array)?).into_array()) + } else { + Err(vortex_err!("RoaringInt can only encode primitive arrays")) } } } -impl Array for RoaringIntArray { - impl_array!(); - #[inline] - fn len(&self) -> usize { - self.bitmap.cardinality() as usize - } - - #[inline] - fn is_empty(&self) -> bool { - self.bitmap().is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - self.ptype.into() - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &RoaringIntEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.bitmap.get_serialized_size_in_bytes::() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - todo!() - } -} - -impl ArrayValidity for RoaringIntArray { - fn logical_validity(&self) -> Validity { - Validity::Valid(self.len()) +impl ArrayValidity for RoaringIntArray<'_> { + fn logical_validity(&self) -> LogicalValidity { + LogicalValidity::AllValid(self.bitmap().iter().count()) } fn is_valid(&self, _index: usize) -> bool { @@ -119,30 +80,26 @@ impl ArrayValidity for RoaringIntArray { } } -impl ArrayDisplay for RoaringIntArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("bitmap", format!("{:?}", self.bitmap())) +impl ArrayFlatten for RoaringIntArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + todo!() } } -#[derive(Debug)] -pub struct RoaringIntEncoding; - -impl RoaringIntEncoding { - pub const ID: EncodingId = EncodingId::new("roaring.int"); -} - -impl Encoding for RoaringIntEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl AcceptArrayVisitor for RoaringIntArray<'_> { + fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + todo!() } +} - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } +impl ArrayStatisticsCompute for RoaringIntArray<'_> {} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for RoaringIntArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } @@ -150,14 +107,15 @@ impl Encoding for RoaringIntEncoding { mod test { use vortex::array::primitive::PrimitiveArray; use vortex::compute::scalar_at::scalar_at; + use vortex::IntoArray; use vortex_error::VortexResult; use crate::RoaringIntArray; #[test] pub fn test_scalar_at() -> VortexResult<()> { - let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]); - let array = RoaringIntArray::encode(&ints)?; + let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); + let array = RoaringIntArray::encode(ints)?; assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); diff --git a/vortex-roaring/src/integer/serde.rs b/vortex-roaring/src/integer/serde.rs deleted file mode 100644 index f30cf0dcb1..0000000000 --- a/vortex-roaring/src/integer/serde.rs +++ /dev/null @@ -1,57 +0,0 @@ -use std::io; -use std::io::ErrorKind; - -use croaring::{Bitmap, Portable}; -use vortex::array::{Array, ArrayRef}; -use vortex::ptype::PType; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{RoaringIntArray, RoaringIntEncoding}; - -impl ArraySerde for RoaringIntArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - let mut data = Vec::new(); - self.bitmap().serialize_into::(&mut data); - ctx.write_slice(data.as_slice()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for RoaringIntEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let bitmap_data = ctx.read_slice()?; - let ptype: PType = ctx - .schema() - .try_into() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - Ok(RoaringIntArray::new( - Bitmap::try_deserialize::(bitmap_data.as_slice()) - .ok_or(io::Error::new(ErrorKind::InvalidData, "invalid bitmap"))?, - ptype, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use croaring::Bitmap; - use vortex::ptype::PType; - - use crate::downcast::DowncastRoaring; - use crate::serde_tests::test::roundtrip_array; - use crate::RoaringIntArray; - - #[test] - fn roundtrip() { - let arr = RoaringIntArray::new(Bitmap::from_range(245..63000), PType::U32); - let read_arr = roundtrip_array(&arr).unwrap(); - let read_roaring = read_arr.as_roaring_int(); - assert_eq!(arr.ptype(), read_roaring.ptype()); - assert_eq!(arr.bitmap(), read_roaring.bitmap()); - } -} diff --git a/vortex-roaring/src/integer/stats.rs b/vortex-roaring/src/integer/stats.rs deleted file mode 100644 index d4c69f4a7d..0000000000 --- a/vortex-roaring/src/integer/stats.rs +++ /dev/null @@ -1,22 +0,0 @@ -use vortex::stats::{Stat, StatsCompute, StatsSet}; -use vortex_error::VortexResult; - -use crate::RoaringIntArray; - -impl StatsCompute for RoaringIntArray { - fn compute(&self, stat: &Stat) -> VortexResult { - if let Some(value) = match stat { - Stat::IsConstant => Some((self.bitmap.cardinality() <= 1).into()), - Stat::IsSorted => Some(true.into()), - Stat::IsStrictSorted => Some(true.into()), - Stat::Max => self.bitmap.minimum().map(|v| v.into()), - Stat::Min => self.bitmap.maximum().map(|v| v.into()), - Stat::NullCount => Some(0.into()), - _ => None, - } { - Ok(StatsSet::of(*stat, value)) - } else { - Ok(StatsSet::default()) - } - } -} diff --git a/vortex-roaring/src/lib.rs b/vortex-roaring/src/lib.rs index dd7e25b519..8f1ab0eed3 100644 --- a/vortex-roaring/src/lib.rs +++ b/vortex-roaring/src/lib.rs @@ -1,15 +1,5 @@ pub use boolean::*; pub use integer::*; -use linkme::distributed_slice; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod boolean; -mod downcast; mod integer; -mod serde_tests; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_ROARING_BOOL: EncodingRef = &RoaringBoolEncoding; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_ROARING_INT: EncodingRef = &RoaringIntEncoding; diff --git a/vortex-roaring/src/serde_tests.rs b/vortex-roaring/src/serde_tests.rs deleted file mode 100644 index 88935c0e11..0000000000 --- a/vortex-roaring/src/serde_tests.rs +++ /dev/null @@ -1,15 +0,0 @@ -#[cfg(test)] -pub mod test { - use vortex::array::{Array, ArrayRef}; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - pub fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } -} diff --git a/vortex-schema/src/dtype.rs b/vortex-schema/src/dtype.rs index 3ee8914916..e1094c6195 100644 --- a/vortex-schema/src/dtype.rs +++ b/vortex-schema/src/dtype.rs @@ -25,6 +25,15 @@ impl From for Nullability { } } +impl From for bool { + fn from(value: Nullability) -> Self { + match value { + Nullability::NonNullable => false, + Nullability::Nullable => true, + } + } +} + impl Display for Nullability { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { diff --git a/vortex-schema/src/serde.rs b/vortex-schema/src/serde.rs index 42301ae1d9..a5bbd242df 100644 --- a/vortex-schema/src/serde.rs +++ b/vortex-schema/src/serde.rs @@ -1,23 +1,19 @@ #![cfg(feature = "serde")] -use flatbuffers::{root, FlatBufferBuilder}; +use flatbuffers::root; use serde::de::{DeserializeSeed, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use vortex_flatbuffers::{ReadFlatBuffer, WriteFlatBuffer}; +use vortex_flatbuffers::{FlatBufferToBytes, ReadFlatBuffer}; use crate::DType; use crate::{flatbuffers as fb, DTypeSerdeContext}; -/// Implement the `Serialize` and trait for `DType` using the flatbuffers. impl Serialize for DType { fn serialize(&self, serializer: S) -> Result where S: Serializer, { - let mut fbb = FlatBufferBuilder::new(); - let root = self.write_flatbuffer(&mut fbb); - fbb.finish_minimal(root); - serializer.serialize_bytes(fbb.finished_data()) + self.with_flatbuffer_bytes(|bytes| serializer.serialize_bytes(bytes)) } } diff --git a/vortex-schema/src/serialize.rs b/vortex-schema/src/serialize.rs index fa421acaae..d7c2bd234b 100644 --- a/vortex-schema/src/serialize.rs +++ b/vortex-schema/src/serialize.rs @@ -176,21 +176,17 @@ impl From<&FloatWidth> for fb::FloatWidth { mod test { use std::sync::Arc; - use flatbuffers::{root, FlatBufferBuilder}; - use vortex_flatbuffers::{ReadFlatBuffer, WriteFlatBuffer}; + use flatbuffers::root; + use vortex_flatbuffers::{FlatBufferToBytes, ReadFlatBuffer}; use crate::flatbuffers as fb; use crate::{DType, DTypeSerdeContext, FloatWidth, IntWidth, Nullability, Signedness}; fn roundtrip_dtype(dtype: DType) { - let mut fbb = FlatBufferBuilder::new(); - let root_offset = dtype.write_flatbuffer(&mut fbb); - fbb.finish_minimal(root_offset); - - let bytes = fbb.finished_data(); + let bytes = dtype.with_flatbuffer_bytes(|bytes| bytes.to_vec()); let deserialized = DType::read_flatbuffer( &DTypeSerdeContext::new(vec![]), - &root::(bytes).unwrap(), + &root::(&bytes).unwrap(), ) .unwrap(); assert_eq!(dtype, deserialized);