diff --git a/Cargo.lock b/Cargo.lock index b9c2e334ae..ea1e366e92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,9 +96,9 @@ checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arc-swap" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b3d0060af21e8d11a926981cc00c6c1541aa91dd64b9f881985c3da1094425f" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" [[package]] name = "arrayref" @@ -552,9 +552,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a116f46a969224200a0a97f29cfd4c50e7534e4b4826bd23ea2c3c533039c82c" +checksum = "86a9249d1447a85f95810c620abea82e001fe58a31713fcce614caf52499f905" dependencies = [ "bzip2", "flate2", @@ -576,7 +576,7 @@ checksum = "30c5ef0ede93efbf733c1a727f3b6b5a1060bbedd5600183e66f6e4be4af0ec5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -587,7 +587,7 @@ checksum = "a507401cad91ec6a857ed5513a2073c82a9b9048762b885bb98655b306964681" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -607,9 +607,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "aws-config" @@ -931,9 +931,9 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.70" +version = "0.3.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95d8e92cac0961e91dbd517496b00f7e9b92363dbe6d42c3198268323798860c" +checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" dependencies = [ "addr2line", "cc", @@ 
-979,7 +979,7 @@ dependencies = [ "log", "parquet 50.0.0", "parquet 51.0.0", - "reqwest 0.12.1", + "reqwest 0.12.2", "simplelog", "tokio", "vortex-alp", @@ -988,6 +988,7 @@ dependencies = [ "vortex-dict", "vortex-error", "vortex-fastlanes", + "vortex-ipc", "vortex-ree", "vortex-roaring", "vortex-schema", @@ -1013,7 +1014,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.53", + "syn 2.0.58", "which", ] @@ -1107,9 +1108,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "bytes-utils" @@ -1206,9 +1207,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.35" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf5903dcbc0a39312feb77df2ff4c76387d591b9fc7b04a238dcf8bb62639a" +checksum = "8a0d04d43504c61aa6c7531f1871dd0d418d91130162063b789da00fd7057a5e" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1279,9 +1280,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.3" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", ] @@ -1842,7 +1843,7 @@ checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -1897,6 +1898,35 @@ dependencies = [ "version_check", ] +[[package]] +name = "ext-trait" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d772df1c1a777963712fb68e014235e80863d6a91a85c4e06ba2d16243a310e5" +dependencies = [ + "ext-trait-proc_macros", +] + +[[package]] +name = "ext-trait-proc_macros" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ab7934152eaf26aa5aa9f7371408ad5af4c31357073c9e84c3b9d7f11ad639a" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "extension-traits" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a296e5a895621edf9fa8329c83aa1cb69a964643e36cf54d8d7a69b789089537" +dependencies = [ + "ext-trait", +] + [[package]] name = "fastlanez" version = "0.1.0" @@ -1923,9 +1953,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" [[package]] name = "fixedbitset" @@ -1962,6 +1992,19 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flexbuffers" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15d14128f06405808ce75bfebe11e9b0f9da18719ede6d7bdb1702d6bfe0f7e8" +dependencies = [ + "bitflags 1.3.2", + "byteorder", + "num_enum 0.5.11", + "serde", + "serde_derive", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2048,7 +2091,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -2447,9 +2490,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.5" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = 
"168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", "hashbrown 0.14.3", @@ -2457,9 +2500,9 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] name = "integer-encoding" @@ -2493,6 +2536,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -2504,9 +2556,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -2528,9 +2580,9 @@ dependencies = [ [[package]] name = "lance" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238458ad84aebcb6dc74c2292bcdd0ff14b72471397137694af72f0773643d53" +checksum = "89669e7ace91c716c253208f5d6c53c405d6da2ebd6c8798f205a773b99635e7" dependencies = [ "arrow 50.0.0", "arrow-arith 50.0.0", @@ -2586,9 +2638,9 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd8edf7bcae899cab87241b5444546ea0a9878a9dcdaf43951e5122d1e6b2cd" +checksum = "f6b5e663bd15a2ea3dd2f998db34d7843526b210b730791d55746baaeafb3e4e" dependencies = [ "arrow-array 50.0.0", "arrow-buffer 50.0.0", @@ -2605,9 +2657,9 @@ dependencies = [ 
[[package]] name = "lance-core" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92b31b3fd19ecd9bbd8bb65e78e1983cf6e80a375e8924a1b98be3ab3a80d816" +checksum = "c2aa28b21beef26fdfc09cf13436306fdf6ed4545972e3268be26974578ef8cb" dependencies = [ "arrow-array 50.0.0", "arrow-buffer 50.0.0", @@ -2640,9 +2692,9 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960068a2d79c6a942d563334dca8ff664863067591f61e9bea85953c80322f44" +checksum = "80afb083dda662157b95adfbd35009d8804c4d4d77b7f62c69cff99eb36b7d15" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", @@ -2663,9 +2715,9 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8b8be219dd422b9becde24fb9c4ff6ac1222dc78ba0987142d1e8c4d2fdbfc" +checksum = "7e462c472fed261cad4e7b3bfff77516d6a368c2848d4ea5a834964d6d4a0d18" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", @@ -2679,9 +2731,9 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "313b8e3b0806b72b4df8445e3621cfbf3756efde471109c21dae00c791473c3e" +checksum = "8cf410862c083785165d615349155474fa91ce3fbcb0da9a41235bf22f6edc75" dependencies = [ "arrow-arith 50.0.0", "arrow-array 50.0.0", @@ -2708,9 +2760,9 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1929cab8905ba5d817bf18d3e6558fa06c354c3d3be4b9f29092b1d9cfe23ca2" +checksum = "bcf64473aae15d2ed1ed6f11887ab9a06667d91e9131020141321f34ee0d860e" dependencies = [ "arrow 50.0.0", "arrow-array 50.0.0", @@ -2754,9 +2806,9 @@ dependencies = [ [[package]] name = "lance-io" 
-version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3ee8ec20156550e63a0e20993963a4f43a09ff225910c66d7c5bd258ab1085" +checksum = "f45be6af4e101c838bcdd29f5a0ab40500bbba675d055d9a4e311fd725544fec" dependencies = [ "arrow 50.0.0", "arrow-arith 50.0.0", @@ -2792,9 +2844,9 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "921107f1c0cbcb838d3fd7ebafbc447a60150b01fe64cfdf93d1d277cb3795ba" +checksum = "c839f75209a94ead48be4bca7fc6e5ac1d6b974410c56cbad89e067fbcb58984" dependencies = [ "arrow-array 50.0.0", "arrow-ord 50.0.0", @@ -2815,9 +2867,9 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.10.5" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1de9ef6433da7d146c25a1e54facd805ba8246b5da83b725b32975e42ece679d" +checksum = "99d7e5691cec82369133ac1f83a0c7ec9d88cfc279052368c67bfbe0f4ad4592" dependencies = [ "arrow-array 50.0.0", "arrow-buffer 50.0.0", @@ -2867,6 +2919,31 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" +[[package]] +name = "lending-iterator" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc07588c853b50689205fb5c00498aa681d89828e0ce8cbd965ebc7a5d8ae260" +dependencies = [ + "extension-traits", + "lending-iterator-proc_macros", + "macro_rules_attribute", + "never-say-never", + "nougat", + "polonius-the-crab", +] + +[[package]] +name = "lending-iterator-proc_macros" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5445dd1c0deb1e97b8a16561d17fc686ca83e8411128fb036e9668a72d51b1d" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "lexical-core" version = "0.8.5" @@ 
-2967,13 +3044,12 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libredox" -version = "0.0.1" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85c833ca1e66078851dba29046874e38f08b2c883700aa29a03ddd3b23814ee8" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags 2.5.0", "libc", - "redox_syscall", ] [[package]] @@ -3005,7 +3081,7 @@ checksum = "adf157a4dc5a29b7b464aa8fe7edeff30076e07e13646a1c3874f58477dc99f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3038,9 +3114,9 @@ checksum = "9106e1d747ffd48e6be5bb2d97fa706ed25b144fbee4d5c02eae110cd8d6badd" [[package]] name = "lz4_flex" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ "twox-hash", ] @@ -3065,6 +3141,22 @@ dependencies = [ "libc", ] +[[package]] +name = "macro_rules_attribute" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" + [[package]] name = "md-5" version = "0.10.6" @@ -3077,15 +3169,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = 
"6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "memoffset" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" dependencies = [ "autocfg", ] @@ -3177,6 +3269,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nom" version = "7.1.3" @@ -3187,6 +3285,27 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nougat" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b57b9ced431322f054fc673f1d3c7fa52d80efd9df74ad2fc759f044742510" +dependencies = [ + "macro_rules_attribute", + "nougat-proc_macros", +] + +[[package]] +name = "nougat-proc_macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c84f77a45e99a2f9b492695d99e1c23844619caa5f3e57647cffacad773ca257" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "num" version = "0.4.1" @@ -3279,13 +3398,34 @@ dependencies = [ "libc", ] +[[package]] +name = "num_enum" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f646caf906c20226733ed5b1374287eb97e3c2a5c227ce668c1f2ce20ae57c9" +dependencies = [ + "num_enum_derive 0.5.11", +] + [[package]] name = "num_enum" version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" dependencies = [ - "num_enum_derive", + "num_enum_derive 0.7.2", +] + +[[package]] +name = "num_enum_derive" +version = "0.5.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbff9bc912032c62bf65ef1d5aea88983b420f4f839db1e9b0c281a25c9c799" +dependencies = [ + "proc-macro-crate 1.3.1", + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] @@ -3294,10 +3434,10 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" dependencies = [ - "proc-macro-crate", + "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3384,7 +3524,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3395,9 +3535,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -3617,14 +3757,14 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -3666,6 +3806,12 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a69ee997a6282f8462abf1e0d8c38c965e968799e912b3bed8c9e8a28c2f9f" + [[package]] name = "portable-atomic" 
version = "1.6.0" @@ -3686,12 +3832,22 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.53", + "syn 2.0.58", +] + +[[package]] +name = "proc-macro-crate" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" +dependencies = [ + "once_cell", + "toml_edit 0.19.15", ] [[package]] @@ -3700,7 +3856,7 @@ version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" dependencies = [ - "toml_edit", + "toml_edit 0.21.1", ] [[package]] @@ -3730,7 +3886,7 @@ checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" dependencies = [ "bytes", "heck", - "itertools 0.10.5", + "itertools 0.11.0", "log", "multimap", "once_cell", @@ -3739,7 +3895,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.53", + "syn 2.0.58", "tempfile", "which", ] @@ -3751,10 +3907,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.11.0", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3835,7 +3991,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3848,7 +4004,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -3957,9 +4113,9 @@ dependencies = [ [[package]] name = "rayon" -version = 
"1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -3986,9 +4142,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a18479200779601e498ada4e8c1e1f50e3ee19deb0259c25825a98b5603b2cb4" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" dependencies = [ "getrandom", "libredox", @@ -3997,9 +4153,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", @@ -4026,9 +4182,9 @@ checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "regress" @@ -4085,9 +4241,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e333b1eb9fe677f6893a9efcb0d277a2d3edd83f358a236b657c32301dc6e5f6" +checksum = "2d66674f2b6fb864665eea7a3c1ac4e3dfacd2fda83cf6f935a612e01b0e3338" dependencies = [ "base64 0.21.7", "bytes", @@ -4245,9 +4401,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.4.0" +version = "1.4.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "868e20fada228fefaf6b652e00cc73623d54f8171e7352c18bb281571f2d92da" +checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" @@ -4340,9 +4496,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.9.2" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" dependencies = [ "bitflags 1.3.2", "core-foundation", @@ -4353,9 +4509,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" dependencies = [ "core-foundation-sys", "libc", @@ -4393,7 +4549,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4409,9 +4565,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", @@ -4427,7 +4583,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4597,7 +4753,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4606,6 +4762,12 @@ version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "strum" version = "0.25.0" @@ -4631,7 +4793,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4644,7 +4806,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4664,7 +4826,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.53", + "syn 2.0.58", "typify", "walkdir", ] @@ -4688,9 +4850,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.53" +version = "2.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" dependencies = [ "proc-macro2", "quote", @@ -4784,7 +4946,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4867,9 +5029,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", @@ -4890,7 +5052,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -4944,6 +5106,17 @@ version = "0.6.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +[[package]] +name = "toml_edit" +version = "0.19.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + [[package]] name = "toml_edit" version = "0.21.1" @@ -5003,7 +5176,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -5066,7 +5239,7 @@ dependencies = [ "regress", "schemars", "serde_json", - "syn 2.0.53", + "syn 2.0.58", "thiserror", "unicode-ident", ] @@ -5083,7 +5256,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.53", + "syn 2.0.58", "typify-impl", ] @@ -5228,6 +5401,9 @@ dependencies = [ "arrow-buffer 51.0.0", "arrow-schema 51.0.0", "criterion", + "flatbuffers", + "flatc", + "flexbuffers", "half", "humansize", "itertools 0.12.1", @@ -5235,13 +5411,15 @@ dependencies = [ "linkme", "log", "num-traits", - "num_enum", + "num_enum 0.7.2", "paste", "rand", "thiserror", "vortex-alloc", "vortex-error", + "vortex-flatbuffers", "vortex-schema", + "walkdir", ] [[package]] @@ -5278,6 +5456,7 @@ name = "vortex-error" version = "0.1.0" dependencies = [ "arrow-schema 51.0.0", + "flatbuffers", "parquet 51.0.0", "thiserror", ] @@ -5298,6 +5477,32 @@ dependencies = [ "vortex-schema", ] +[[package]] +name = "vortex-flatbuffers" +version = "0.1.0" +dependencies = [ + "flatbuffers", + "vortex-error", +] + +[[package]] +name = "vortex-ipc" +version = "0.1.0" +dependencies = [ + "arrow-buffer 51.0.0", + "flatbuffers", + "flatc", + "itertools 0.12.1", + "lending-iterator", + "nougat", + "streaming-iterator", + "vortex-array", + "vortex-error", + "vortex-flatbuffers", + "vortex-schema", + "walkdir", +] + [[package]] name = "vortex-ree" 
version = "0.1.0" @@ -5335,8 +5540,10 @@ dependencies = [ "flatbuffers", "flatc", "itertools 0.12.1", + "linkme", "thiserror", "vortex-error", + "vortex-flatbuffers", "walkdir", ] @@ -5404,7 +5611,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", "wasm-bindgen-shared", ] @@ -5438,7 +5645,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5707,7 +5914,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.58", ] [[package]] @@ -5727,27 +5934,27 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.0.0" +version = "7.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" +version = "2.0.10+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" dependencies = [ "cc", "pkg-config", diff --git a/Cargo.toml b/Cargo.toml index dc0a4009bc..2d1f3f03f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,8 @@ members = [ "vortex-dict", "vortex-error", "vortex-fastlanes", + "vortex-flatbuffers", + "vortex-ipc", "vortex-ree", 
"vortex-roaring", "vortex-schema", @@ -47,6 +49,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] } croaring = "1.0.1" divan = "0.1.14" flatbuffers = "23.5.26" +flexbuffers = "2.0.0" flatc = "0.2.2" half = { version = "^2", features = ["std", "num-traits"] } hashbrown = "0.14.3" diff --git a/README.md b/README.md index 601d0664cd..0d5a249fa0 100644 --- a/README.md +++ b/README.md @@ -159,8 +159,15 @@ without prior discussion infeasible. If you are interested in contributing, plea This repo uses submodules for non-Rust dependencies (e.g., for the zig fastlanez repo). Before building make sure to run -* `git submodule update --init --recursive` -* `./zigup` (this will install the zig version required by fastlanez) +```bash +git submodule update --init --recursive + +# Install the zig version required by fastlanez +./zigup + +# Install Rye from https://rye-up.com, and setup the virtualenv +rye sync +``` ## License @@ -172,24 +179,31 @@ This project is inspired by and--in some cases--directly based upon the existing and OSS developers. In particular, the following academic papers greatly influenced the development: -* Maximilian Kuschewski, David Sauerwein, Adnan Alhomssi, and Viktor Leis. 2023. [BtrBlocks: Efficient Columnar Compression -for Data Lakes](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf). Proc. ACM Manag. Data 1, 2, -Article 118 (June 2023), 14 pages. https://doi.org/10.1145/3589263 -* Azim Afroozeh and Peter Boncz. [The FastLanes Compression Layout: Decoding >100 Billion Integers per Second with Scalar -Code](https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf). PVLDB, 16(9): 2132 - 2144, 2023. -* Peter Boncz, Thomas Neumann, and Viktor Leis. [FSST: Fast Random Access String -Compression](https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf). -PVLDB, 13(11): 2649-2661, 2020. -* Azim Afroozeh, Leonardo X. Kuffo, and Peter Boncz. 2023. 
[ALP: Adaptive Lossless floating-Point -Compression](https://ir.cwi.nl/pub/33334/33334.pdf). Proc. ACM -Manag. Data 1, 4 (SIGMOD), Article 230 (December 2023), 26 pages. https://doi.org/10.1145/3626717 + +* Maximilian Kuschewski, David Sauerwein, Adnan Alhomssi, and Viktor Leis. + 2023. [BtrBlocks: Efficient Columnar Compression + for Data Lakes](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf). Proc. ACM Manag. Data 1, + 2, + Article 118 (June 2023), 14 pages. https://doi.org/10.1145/3589263 +* Azim Afroozeh and Peter + Boncz. [The FastLanes Compression Layout: Decoding >100 Billion Integers per Second with Scalar + Code](https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf). PVLDB, 16(9): 2132 - 2144, 2023. +* Peter Boncz, Thomas Neumann, and Viktor Leis. [FSST: Fast Random Access String + Compression](https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf). + PVLDB, 13(11): 2649-2661, 2020. +* Azim Afroozeh, Leonardo X. Kuffo, and Peter Boncz. 2023. [ALP: Adaptive Lossless floating-Point + Compression](https://ir.cwi.nl/pub/33334/33334.pdf). Proc. ACM + Manag. Data 1, 4 (SIGMOD), Article 230 (December 2023), 26 pages. https://doi.org/10.1145/3626717 Additionally, we benefited greatly from: -* the collected OSS work of [Daniel Lemire](https://github.com/lemire), such as [FastPFor](https://github.com/lemire/FastPFor), -and [StreamVByte](https://github.com/lemire/streamvbyte). -* the [parquet2](https://github.com/jorgecarleitao/parquet2) project by [Jorge Leitao](https://github.com/jorgecarleitao). + +* the collected OSS work of [Daniel Lemire](https://github.com/lemire), such + as [FastPFor](https://github.com/lemire/FastPFor), + and [StreamVByte](https://github.com/lemire/streamvbyte). +* the [parquet2](https://github.com/jorgecarleitao/parquet2) project + by [Jorge Leitao](https://github.com/jorgecarleitao). 
* the public discussions around choices of compression codecs, as well as the C++ implementations thereof, -from [duckdb](https://github.com/duckdb/duckdb). + from [duckdb](https://github.com/duckdb/duckdb). * the existence, ideas, & implementation of the [Apache Arrow](https://arrow.apache.org) project. * the [Velox](https://github.com/facebookincubator/velox) project and discussions with its maintainers. diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index b74c6dc881..2557eaebaa 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -23,6 +23,7 @@ vortex-datetime = { path = "../vortex-datetime" } vortex-dict = { path = "../vortex-dict" } vortex-error = { path = "../vortex-error", features = ["parquet"] } vortex-fastlanes = { path = "../vortex-fastlanes" } +vortex-ipc = { path = "../vortex-ipc" } vortex-ree = { path = "../vortex-ree" } vortex-roaring = { path = "../vortex-roaring" } vortex-schema = { path = "../vortex-schema" } diff --git a/bench-vortex/src/bin/ipc.rs b/bench-vortex/src/bin/ipc.rs new file mode 100644 index 0000000000..0a5a91a158 --- /dev/null +++ b/bench-vortex/src/bin/ipc.rs @@ -0,0 +1,46 @@ +use log::LevelFilter; +use std::fs::File; + +use bench_vortex::reader::open_vortex; +use bench_vortex::setup_logger; +use bench_vortex::taxi_data::taxi_data_vortex; +use vortex::array::primitive::PrimitiveArray; +use vortex::array::Array; +use vortex::compute::take::take; +use vortex::serde::context::SerdeContext; +use vortex_error::VortexResult; +use vortex_ipc::iter::FallibleLendingIterator; +use vortex_ipc::reader::StreamReader; +use vortex_ipc::writer::StreamWriter; + +pub fn main() -> VortexResult<()> { + setup_logger(LevelFilter::Error); + + let array = open_vortex(&taxi_data_vortex())?; + println!("Array {}", &array); + + //let ipc = idempotent("ipc.vortex", |path| { + let ipc = "bench-vortex/data/ipc.vortex"; + let mut write = File::create("bench-vortex/data/ipc.vortex")?; + let ctx = SerdeContext::default(); + let mut 
writer = StreamWriter::try_new(&mut write, ctx)?; + writer.write(&array)?; + //})?; + + // Now try to read from the IPC stream. + let mut read = File::open(ipc)?; + let mut ipc_reader = StreamReader::try_new(&mut read)?; + + // We know we only wrote a single array. + // TODO(ngates): create an option to skip the multi-array reader? + let mut array_reader = ipc_reader.next()?.unwrap(); + println!("DType: {:?}", array_reader.dtype()); + // Read some number of chunks from the stream. + while let Some(chunk) = array_reader.next().unwrap() { + println!("VIEW: {}", (&chunk as &dyn Array)); + let taken = take(&chunk, &PrimitiveArray::from(vec![0, 1, 0, 1])).unwrap(); + println!("Taken: {}", &taken); + } + + Ok(()) +} diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index f7ba9be9e0..cca6d9af8f 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -13,9 +13,10 @@ use crate::taxi_data::taxi_data_parquet; use vortex::array::chunked::ChunkedArray; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::IntoArray; -use vortex::array::{Array, ArrayRef, EncodingRef, ENCODINGS}; +use vortex::array::{Array, ArrayRef}; use vortex::arrow::FromArrowType; use vortex::compress::{CompressConfig, CompressCtx}; +use vortex::encoding::{EncodingRef, ENCODINGS}; use vortex::formatter::display_tree; use vortex_alp::ALPEncoding; use vortex_datetime::DateTimeEncoding; diff --git a/flatbuffers.build.rs b/flatbuffers.build.rs new file mode 100644 index 0000000000..1d885a0ee0 --- /dev/null +++ b/flatbuffers.build.rs @@ -0,0 +1,55 @@ +use flatc::flatc; +use std::env; +use std::ffi::OsStr; +use std::path::{Path, PathBuf}; +use std::process::Command; + +use walkdir::WalkDir; + +fn main() { + let flatbuffers_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()) + .canonicalize() + .expect("Failed to canonicalize CARGO_MANIFEST_DIR") + .join("flatbuffers"); + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()) + .canonicalize() + .expect("Failed 
to canonicalize OUT_DIR"); + + let fbs_files = WalkDir::new(&flatbuffers_dir) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension() == Some(OsStr::new("fbs"))) + .map(|e| { + rerun_if_changed(e.path()); + e.path().to_path_buf() + }) + .collect::>(); + + if !Command::new(flatc()) + .arg("--rust") + .arg("--filename-suffix") + .arg("") + .arg("-I") + .arg(flatbuffers_dir.join("../../")) + .arg("--include-prefix") + .arg("flatbuffers::deps") + .arg("-o") + .arg(out_dir.join("flatbuffers")) + .args(fbs_files) + .status() + .unwrap() + .success() + { + panic!("Failed to run flatc"); + } +} + +fn rerun_if_changed(path: &Path) { + println!( + "cargo:rerun-if-changed={}", + path.canonicalize() + .unwrap_or_else(|_| panic!("failed to canonicalize {}", path.to_str().unwrap())) + .to_str() + .unwrap() + ); +} diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index e9ab7fe353..868270d68e 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -10,7 +10,8 @@ use vortex::array::sparse::{SparseArray, SparseEncoding}; use vortex::array::struct_::{StructArray, StructEncoding}; use vortex::array::varbin::{VarBinArray, VarBinEncoding}; use vortex::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; -use vortex::array::{Array, ArrayKind, ArrayRef, EncodingRef}; +use vortex::array::{Array, ArrayKind, ArrayRef}; +use vortex::encoding::EncodingRef; use vortex_alp::{ALPArray, ALPEncoding}; use vortex_dict::{DictArray, DictEncoding}; use vortex_fastlanes::{ diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index 3846f86b66..5f92e5cfd2 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -1,7 +1,7 @@ use pyo3::types::PyType; use pyo3::{pyclass, pyfunction, pymethods, Py, PyResult, Python}; use std::sync::Arc; -use vortex::array::ENCODINGS; +use vortex::encoding::ENCODINGS; use vortex::compress::{CompressConfig, CompressCtx}; diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index c3cfeb4f00..db3fe9cfeb 
100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -23,7 +23,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { debug!( "Discovered encodings: {:?}", - vortex::array::ENCODINGS + vortex::encoding::ENCODINGS .iter() .map(|e| e.id().to_string()) .collect::>() diff --git a/requirements-dev.lock b/requirements-dev.lock index 39e324aa54..6e964b7647 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -5,54 +5,104 @@ # pre: false # features: [] # all-features: false +# with-sources: false -e file:pyvortex -e file:. babel==2.14.0 + # via mkdocs-material bracex==2.4 + # via wcmatch certifi==2024.2.2 + # via requests charset-normalizer==3.3.2 + # via requests click==8.1.7 + # via mkdocs colorama==0.4.6 + # via mkdocs-material ghp-import==2.1.0 + # via mkdocs idna==3.6 + # via requests importlib-metadata==7.0.1 + # via mike importlib-resources==6.1.2 + # via mike iniconfig==2.0.0 + # via pytest jinja2==3.1.3 + # via mike + # via mkdocs + # via mkdocs-material markdown==3.5.2 + # via mkdocs + # via mkdocs-material + # via pymdown-extensions markupsafe==2.1.5 + # via jinja2 + # via mkdocs maturin==1.4.0 mergedeep==1.3.4 + # via mkdocs mike==2.0.0 mkdocs==1.5.3 + # via mike + # via mkdocs-include-markdown-plugin + # via mkdocs-material mkdocs-include-markdown-plugin==6.0.4 mkdocs-material==9.5.12 mkdocs-material-extensions==1.3.1 + # via mkdocs-material numpy==1.26.4 + # via pyarrow packaging==23.2 + # via mkdocs + # via pytest paginate==0.5.6 + # via mkdocs-material pathspec==0.12.1 + # via mkdocs platformdirs==4.2.0 + # via mkdocs pluggy==1.4.0 + # via pytest py-cpuinfo==9.0.0 + # via pytest-benchmark pyarrow==15.0.0 pygments==2.17.2 + # via mkdocs-material pymdown-extensions==10.7 + # via mkdocs-material pyparsing==3.1.1 + # via mike pytest==7.4.0 + # via pytest-benchmark pytest-benchmark==4.0.0 python-dateutil==2.9.0 + # via ghp-import pyyaml==6.0.1 + # via mike + # via mkdocs + # via pymdown-extensions + # via pyyaml-env-tag 
pyyaml-env-tag==0.1 + # via mkdocs regex==2023.12.25 + # via mkdocs-material requests==2.31.0 + # via mkdocs-material ruff==0.2.2 six==1.16.0 + # via python-dateutil urllib3==2.2.1 + # via requests verspec==0.1.0 + # via mike watchdog==4.0.0 + # via mkdocs wcmatch==8.5.1 + # via mkdocs-include-markdown-plugin zipp==3.17.0 -# The following packages are considered to be unsafe in a requirements file: + # via importlib-metadata pip==24.0 diff --git a/requirements.lock b/requirements.lock index c9283eb85a..8f59ff5e9c 100644 --- a/requirements.lock +++ b/requirements.lock @@ -5,6 +5,7 @@ # pre: false # features: [] # all-features: false +# with-sources: false -e file:pyvortex -e file:. diff --git a/vortex-alp/src/array.rs b/vortex-alp/src/array.rs index e4eaa96b16..acc56c4f43 100644 --- a/vortex-alp/src/array.rs +++ b/vortex-alp/src/array.rs @@ -1,12 +1,14 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayKind, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_schema::{DType, IntWidth, Signedness}; @@ -115,6 +117,10 @@ impl Array for ALPArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.encoded()) + } } impl ArrayDisplay for ALPArray { diff --git a/vortex-alp/src/lib.rs b/vortex-alp/src/lib.rs index f104396cd5..59081e565f 100644 --- a/vortex-alp/src/lib.rs +++ b/vortex-alp/src/lib.rs @@ -2,7 +2,7 @@ pub use alp::*; pub use array::*; use 
linkme::distributed_slice; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; mod alp; mod array; diff --git a/vortex-alp/src/serde.rs b/vortex-alp/src/serde.rs index 9fb2d40366..846ba87125 100644 --- a/vortex-alp/src/serde.rs +++ b/vortex-alp/src/serde.rs @@ -13,6 +13,10 @@ impl ArraySerde for ALPArray { ctx.write_fixed_slice([self.exponents().e, self.exponents().f])?; ctx.write(self.encoded()) } + + fn metadata(&self) -> VortexResult>> { + Ok(Some(vec![self.exponents().e, self.exponents().f])) + } } impl EncodingSerde for ALPEncoding { diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 3a967319e1..11fac1473d 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -23,6 +23,8 @@ allocator-api2 = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } +flatbuffers = { workspace = true } +flexbuffers = { workspace = true } half = { workspace = true } humansize = { workspace = true } itertools = { workspace = true } @@ -36,11 +38,16 @@ rand = { workspace = true } thiserror = { workspace = true } vortex-alloc = { path = "../vortex-alloc" } vortex-error = { path = "../vortex-error" } +vortex-flatbuffers = { path = "../vortex-flatbuffers" } vortex-schema = { path = "../vortex-schema" } +[build-dependencies] +flatc = { workspace = true } +walkdir = { workspace = true } + [dev-dependencies] criterion = { workspace = true } [[bench]] name = "search_sorted" -harness = false \ No newline at end of file +harness = false diff --git a/vortex-array/build.rs b/vortex-array/build.rs new file mode 120000 index 0000000000..7cb528993c --- /dev/null +++ b/vortex-array/build.rs @@ -0,0 +1 @@ +../flatbuffers.build.rs \ No newline at end of file diff --git a/vortex-array/flatbuffers/array.fbs b/vortex-array/flatbuffers/array.fbs new file mode 100644 index 0000000000..2dedd3927c --- /dev/null +++ b/vortex-array/flatbuffers/array.fbs @@ -0,0 
+1,15 @@ +namespace vortex.array; + +enum Version: uint8 { + V0 = 0, +} + +table Array { + version: Version = V0; + encoding: uint16; + metadata: [ubyte]; + children: [Array]; + nbuffers: uint16; +} + +root_type Array; diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs index b45aabd3e3..9aa4543d13 100644 --- a/vortex-array/src/array/bool/mod.rs +++ b/vortex-array/src/array/bool/mod.rs @@ -3,17 +3,18 @@ use std::sync::{Arc, RwLock}; use arrow_buffer::buffer::BooleanBuffer; use linkme::distributed_slice; -use vortex_error::VortexResult; -use vortex_schema::{DType, Nullability}; - use crate::array::IntoArray; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stat, Stats, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; +use vortex_error::VortexResult; +use vortex_schema::{DType, Nullability}; -use super::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS}; +use super::{check_slice_bounds, Array, ArrayRef}; mod compute; mod serde; @@ -115,6 +116,14 @@ impl Array for BoolArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + if let Some(v) = self.validity() { + // FIXME(ngates): Validity to implement Array? 
+ walker.visit_child(&v.to_array())?; + } + walker.visit_buffer(self.buffer.inner()) + } } impl ArrayValidity for BoolArray { diff --git a/vortex-array/src/array/bool/serde.rs b/vortex-array/src/array/bool/serde.rs index 4fc00601eb..12a0a99dbc 100644 --- a/vortex-array/src/array/bool/serde.rs +++ b/vortex-array/src/array/bool/serde.rs @@ -4,7 +4,7 @@ use vortex_error::VortexResult; use crate::array::bool::{BoolArray, BoolEncoding}; use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; +use crate::serde::{ArraySerde, ArrayView, BytesSerde, EncodingSerde, ReadCtx, WriteCtx}; use crate::validity::ArrayValidity; impl ArraySerde for BoolArray { @@ -12,9 +12,17 @@ impl ArraySerde for BoolArray { ctx.write_validity(self.validity())?; ctx.write_buffer(self.len(), &self.buffer().sliced()) } + + fn metadata(&self) -> VortexResult>> { + Ok(Some(self.len().serialize())) + } } impl EncodingSerde for BoolEncoding { + fn len(&self, view: &ArrayView) -> usize { + usize::deserialize(view.metadata().unwrap()).unwrap() + } + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let validity = ctx.read_validity()?; let (logical_len, buf) = ctx.read_buffer(|len| (len + 7) / 8)?; diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index ff99a81f1d..a3b26e32d8 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -6,14 +6,14 @@ use linkme::distributed_slice; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; -use crate::array::{ - check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; 
use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; mod compute; mod serde; @@ -145,6 +145,13 @@ impl Array for ChunkedArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + for chunk in self.chunks() { + walker.visit_child(&chunk)?; + } + Ok(()) + } } impl ArrayValidity for ChunkedArray { diff --git a/vortex-array/src/array/chunked/serde.rs b/vortex-array/src/array/chunked/serde.rs index 1dca8ad900..6461f05a5a 100644 --- a/vortex-array/src/array/chunked/serde.rs +++ b/vortex-array/src/array/chunked/serde.rs @@ -1,8 +1,9 @@ +use flexbuffers::Builder; use vortex_error::VortexResult; use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; +use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for ChunkedArray { fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { @@ -12,9 +13,27 @@ impl ArraySerde for ChunkedArray { } Ok(()) } + + fn metadata(&self) -> VortexResult>> { + // TODO(ngates) #163 - the chunk lengths should probably themselves be an array? 
+ let mut builder = Builder::default(); + let mut vec = builder.start_vector(); + for end in self.chunk_ends() { + vec.push(*end); + } + vec.end_vector(); + Ok(Some(builder.take_buffer())) + } } impl EncodingSerde for ChunkedEncoding { + fn len(&self, view: &ArrayView) -> usize { + (0..view.nchildren()) + .map(|c| view.child(c, view.dtype()).unwrap()) + .map(|v| v.len()) + .sum() + } + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let chunk_len = ctx.read_usize()?; let mut chunks = Vec::::with_capacity(chunk_len); diff --git a/vortex-array/src/array/composite/array.rs b/vortex-array/src/array/composite/array.rs index d97a50dc37..93e4413eaf 100644 --- a/vortex-array/src/array/composite/array.rs +++ b/vortex-array/src/array/composite/array.rs @@ -7,14 +7,15 @@ use vortex_error::VortexResult; use vortex_schema::{CompositeID, DType}; use crate::array::composite::{find_extension, CompositeExtensionRef, TypedCompositeArray}; -use crate::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS}; +use crate::array::{Array, ArrayRef}; use crate::compress::EncodingCompression; use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::serde::{ArraySerde, BytesSerde, EncodingSerde}; use crate::stats::{Stats, StatsCompute, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; pub trait CompositeMetadata: 'static + Debug + Display + Send + Sync + Sized + Clone + BytesSerde @@ -120,6 +121,10 @@ impl Array for CompositeArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.underlying()) + } } impl StatsCompute for CompositeArray {} diff --git a/vortex-array/src/array/composite/serde.rs b/vortex-array/src/array/composite/serde.rs index 74e6125df2..c69f2ee65b 100644 --- 
a/vortex-array/src/array/composite/serde.rs +++ b/vortex-array/src/array/composite/serde.rs @@ -1,11 +1,12 @@ +use flatbuffers::FlatBufferBuilder; use std::sync::Arc; -use vortex_error::VortexResult; -use vortex_schema::DType; - use crate::array::composite::{CompositeArray, CompositeEncoding}; use crate::array::{Array, ArrayRef}; use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; +use vortex_error::VortexResult; +use vortex_flatbuffers::WriteFlatBuffer; +use vortex_schema::DType; impl ArraySerde for CompositeArray { fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { @@ -14,6 +15,13 @@ impl ArraySerde for CompositeArray { ctx.dtype(underlying.dtype())?; ctx.write(self.underlying()) } + + fn metadata(&self) -> VortexResult>> { + let mut fbb = FlatBufferBuilder::new(); + let dtype = self.underlying().dtype().write_flatbuffer(&mut fbb); + fbb.finish_minimal(dtype); + Ok(Some(fbb.finished_data().to_vec())) + } } impl EncodingSerde for CompositeEncoding { diff --git a/vortex-array/src/array/constant/mod.rs b/vortex-array/src/array/constant/mod.rs index 0a3d0facb7..faa90b0d50 100644 --- a/vortex-array/src/array/constant/mod.rs +++ b/vortex-array/src/array/constant/mod.rs @@ -2,18 +2,17 @@ use std::sync::{Arc, RwLock}; use linkme::distributed_slice; -use vortex_error::VortexResult; -use vortex_schema::DType; - -use crate::array::{ - check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::scalar::Scalar; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stat, Stats, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; +use vortex_error::VortexResult; +use vortex_schema::DType; mod compute; mod serde; @@ -96,6 
+95,10 @@ impl Array for ConstantArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { + Ok(()) + } } impl ArrayValidity for ConstantArray { diff --git a/vortex-array/src/array/constant/serde.rs b/vortex-array/src/array/constant/serde.rs index 1316c1e708..e2e70e2f3f 100644 --- a/vortex-array/src/array/constant/serde.rs +++ b/vortex-array/src/array/constant/serde.rs @@ -9,6 +9,15 @@ impl ArraySerde for ConstantArray { ctx.write_usize(self.len())?; ctx.scalar(self.scalar()) } + + fn metadata(&self) -> VortexResult>> { + // FIXME(ngates): use flatbuffer / serde. + let mut vec = Vec::new(); + let mut ctx = WriteCtx::new(&mut vec); + ctx.write_usize(self.len())?; + ctx.scalar(self.scalar())?; + Ok(Some(vec)) + } } impl EncodingSerde for ConstantEncoding { diff --git a/vortex-array/src/array/mod.rs b/vortex-array/src/array/mod.rs index b20e06e3da..ffdc0370fe 100644 --- a/vortex-array/src/array/mod.rs +++ b/vortex-array/src/array/mod.rs @@ -1,10 +1,7 @@ use std::any::Any; use std::fmt::{Debug, Display, Formatter}; -use std::hash::{Hash, Hasher}; use std::sync::Arc; -use linkme::distributed_slice; - use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, Nullability}; @@ -18,19 +15,9 @@ use crate::array::sparse::{SparseArray, SparseEncoding}; use crate::array::struct_::{StructArray, StructEncoding}; use crate::array::varbin::{VarBinArray, VarBinEncoding}; use crate::array::varbinview::{VarBinViewArray, VarBinViewEncoding}; -use crate::compress::EncodingCompression; -use crate::compute::as_arrow::AsArrowArray; -use crate::compute::as_contiguous::AsContiguousFn; -use crate::compute::cast::CastFn; -use crate::compute::fill::FillForwardFn; -use crate::compute::flatten::FlattenFn; -use crate::compute::patch::PatchFn; -use crate::compute::scalar_at::ScalarAtFn; -use crate::compute::search_sorted::SearchSortedFn; -use crate::compute::take::TakeFn; use 
crate::compute::ArrayCompute; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::serde::{ArraySerde, EncodingSerde}; +use crate::serde::ArraySerde; use crate::stats::Stats; use crate::validity::{ArrayValidity, Validity}; @@ -55,7 +42,7 @@ pub type ArrayRef = Arc; /// /// This differs from Apache Arrow where logical and physical are combined in /// the data type, e.g. LargeString, RunEndEncoded. -pub trait Array: ArrayCompute + ArrayValidity + ArrayDisplay + Debug + Send + Sync { +pub trait Array: ArrayValidity + ArrayDisplay + Debug + Send + Sync { /// Converts itself to a reference of [`Any`], which enables downcasting to concrete types. fn as_any(&self) -> &dyn Any; fn into_any(self: Arc) -> Arc; @@ -68,8 +55,12 @@ pub trait Array: ArrayCompute + ArrayValidity + ArrayDisplay + Debug + Send + Sy fn is_empty(&self) -> bool; /// Get the dtype of the array fn dtype(&self) -> &DType; + /// Get statistics for the array + /// TODO(ngates): this is interesting. What type do we return from this? + /// Maybe we actually need to model stats more like compute? fn stats(&self) -> Stats; + /// Limit array to start..stop range fn slice(&self, start: usize, stop: usize) -> VortexResult; /// Encoding kind of the array @@ -77,9 +68,40 @@ pub trait Array: ArrayCompute + ArrayValidity + ArrayDisplay + Debug + Send + Sy /// Approximate size in bytes of the array. 
Only takes into account variable size portion of the array fn nbytes(&self) -> usize; + fn with_compute_mut( + &self, + _f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + vortex_bail!( + "with_compute_mut not implemented for {}", + self.encoding().id() + ) + } + fn serde(&self) -> Option<&dyn ArraySerde> { None } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()>; +} + +pub trait WithArrayCompute { + fn with_compute VortexResult>(&self, f: F) + -> VortexResult; +} + +impl WithArrayCompute for dyn Array + '_ { + fn with_compute VortexResult>( + &self, + f: F, + ) -> VortexResult { + let mut result: Option = None; + self.with_compute_mut(&mut |compute| { + result = Some(f(compute)?); + Ok(()) + })?; + Ok(result.unwrap()) + } } pub trait IntoArray { @@ -108,49 +130,21 @@ macro_rules! impl_array { fn into_array(self) -> ArrayRef { std::sync::Arc::new(self) } + + #[inline] + fn with_compute_mut( + &self, + f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + f(self) + } }; } +use crate::encoding::EncodingRef; +use crate::ArrayWalker; pub use impl_array; -impl ArrayCompute for ArrayRef { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - self.as_ref().as_arrow() - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - self.as_ref().as_contiguous() - } - - fn cast(&self) -> Option<&dyn CastFn> { - self.as_ref().cast() - } - - fn flatten(&self) -> Option<&dyn FlattenFn> { - self.as_ref().flatten() - } - - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { - self.as_ref().fill_forward() - } - - fn patch(&self) -> Option<&dyn PatchFn> { - self.as_ref().patch() - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - self.as_ref().scalar_at() - } - - fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { - self.as_ref().search_sorted() - } - - fn take(&self) -> Option<&dyn TakeFn> { - self.as_ref().take() - } -} - impl ArrayValidity for ArrayRef { fn nullability(&self) 
-> Nullability { self.as_ref().nullability() @@ -214,9 +208,21 @@ impl Array for ArrayRef { self.as_ref().nbytes() } + fn with_compute_mut( + &self, + f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + self.as_ref().with_compute_mut(f) + } + fn serde(&self) -> Option<&dyn ArraySerde> { self.as_ref().serde() } + + #[allow(unused_variables)] + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + self.as_ref().walk(walker) + } } impl ArrayDisplay for ArrayRef { @@ -225,44 +231,6 @@ impl ArrayDisplay for ArrayRef { } } -impl<'a, T: ArrayCompute> ArrayCompute for &'a T { - fn as_arrow(&self) -> Option<&dyn AsArrowArray> { - T::as_arrow(self) - } - - fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> { - T::as_contiguous(self) - } - - fn cast(&self) -> Option<&dyn CastFn> { - T::cast(self) - } - - fn flatten(&self) -> Option<&dyn FlattenFn> { - T::flatten(self) - } - - fn fill_forward(&self) -> Option<&dyn FillForwardFn> { - T::fill_forward(self) - } - - fn patch(&self) -> Option<&dyn PatchFn> { - T::patch(self) - } - - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { - T::scalar_at(self) - } - - fn search_sorted(&self) -> Option<&dyn SearchSortedFn> { - T::search_sorted(self) - } - - fn take(&self) -> Option<&dyn TakeFn> { - T::take(self) - } -} - impl<'a, T: ArrayValidity> ArrayValidity for &'a T { fn nullability(&self) -> Nullability { T::nullability(self) @@ -329,6 +297,17 @@ impl<'a, T: Array + Clone> Array for &'a T { fn serde(&self) -> Option<&dyn ArraySerde> { T::serde(self) } + + fn with_compute_mut( + &self, + f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + T::with_compute_mut(self, f) + } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + T::walk(self, walker) + } } impl<'a, T: ArrayDisplay> ArrayDisplay for &'a T { @@ -365,65 +344,6 @@ pub fn check_validity_buffer(validity: Option<&ArrayRef>, expected_len: usize) - Ok(()) } -#[derive(Clone, Copy, 
Debug, Eq, PartialEq, Hash)] -pub struct EncodingId(&'static str); - -impl EncodingId { - pub const fn new(id: &'static str) -> Self { - Self(id) - } - - #[inline] - pub fn name(&self) -> &'static str { - self.0 - } -} - -impl Display for EncodingId { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - Display::fmt(self.0, f) - } -} - -pub trait Encoding: Debug + Send + Sync + 'static { - fn id(&self) -> EncodingId; - - /// Whether this encoding provides a compressor. - fn compression(&self) -> Option<&dyn EncodingCompression> { - None - } - - /// Array serialization - fn serde(&self) -> Option<&dyn EncodingSerde> { - None - } -} - -impl Display for dyn Encoding { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.id()) - } -} - -pub type EncodingRef = &'static dyn Encoding; - -impl PartialEq for EncodingRef { - fn eq(&self, other: &Self) -> bool { - self.id() == other.id() - } -} - -impl Eq for EncodingRef {} - -impl Hash for EncodingRef { - fn hash(&self, state: &mut H) { - self.id().hash(state) - } -} - -#[distributed_slice] -pub static ENCODINGS: [EncodingRef] = [..]; - #[derive(Debug, Clone)] pub enum ArrayKind<'a> { Bool(&'a BoolArray), diff --git a/vortex-array/src/array/primitive/compute/patch.rs b/vortex-array/src/array/primitive/compute/patch.rs index b7a85a8759..9a0d92cb5b 100644 --- a/vortex-array/src/array/primitive/compute/patch.rs +++ b/vortex-array/src/array/primitive/compute/patch.rs @@ -15,7 +15,7 @@ impl PatchFn for PrimitiveArray { match patch.encoding().id() { SparseEncoding::ID => patch_with_sparse(self, patch.as_sparse()), // TODO(ngates): support a default implementation based on iter_arrow? 
- _ => Err(vortex_err!(NotImplemented: "patch", self.encoding().id().0)), + _ => Err(vortex_err!(NotImplemented: "patch", self.encoding().id().name())), } } } diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index a31a1bd7c9..cb2013a2bf 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -9,25 +9,28 @@ use allocator_api2::alloc::Allocator; use arrow_buffer::buffer::{Buffer, ScalarBuffer}; use linkme::distributed_slice; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, Nullability}; use crate::accessor::ArrayAccessor; use crate::array::IntoArray; -use crate::array::{ - check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::iterator::ArrayIter; use crate::ptype::{match_each_native_ptype, NativePType, PType}; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; mod compute; mod serde; mod stats; +mod view; + +use crate::compute::ArrayCompute; +pub use view::*; #[derive(Debug, Clone)] pub struct PrimitiveArray { @@ -200,6 +203,14 @@ impl Array for PrimitiveArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + if let Some(v) = self.validity() { + // FIXME(ngates): should validity implement Array? 
+ walker.visit_child(&v.to_array())?; + } + walker.visit_buffer(self.buffer()) + } } impl ArrayValidity for PrimitiveArray { diff --git a/vortex-array/src/array/primitive/serde.rs b/vortex-array/src/array/primitive/serde.rs index 4ae4ecb42b..038e3f285e 100644 --- a/vortex-array/src/array/primitive/serde.rs +++ b/vortex-array/src/array/primitive/serde.rs @@ -1,8 +1,10 @@ use vortex_error::VortexResult; -use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding}; +use crate::array::primitive::{PrimitiveArray, PrimitiveEncoding, PrimitiveView}; use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; +use crate::compute::ArrayCompute; +use crate::match_each_native_ptype; +use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; use crate::validity::ArrayValidity; impl ArraySerde for PrimitiveArray { @@ -11,9 +13,24 @@ impl ArraySerde for PrimitiveArray { ctx.write_validity(self.validity())?; ctx.write_buffer(self.len(), self.buffer()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for PrimitiveEncoding { + fn with_view_compute<'view>( + &self, + view: &'view ArrayView, + f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + let view = PrimitiveView::try_new(view)?; + match_each_native_ptype!(view.ptype(), |$T| { + f(&view.as_trait::<$T>()) + }) + } + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { let ptype = ctx.ptype()?; let validity = ctx.read_validity()?; diff --git a/vortex-array/src/array/primitive/view.rs b/vortex-array/src/array/primitive/view.rs new file mode 100644 index 0000000000..526ecd3e9b --- /dev/null +++ b/vortex-array/src/array/primitive/view.rs @@ -0,0 +1,114 @@ +use arrow_buffer::Buffer; +use num_traits::PrimInt; + +use crate::array::PrimitiveArray; +use crate::array::{Array, ArrayRef}; +use crate::compute::flatten::{flatten_primitive, FlattenFn, FlattenedArray}; +use crate::compute::take::TakeFn; +use 
crate::compute::ArrayCompute; +use crate::match_each_integer_ptype; +use crate::ptype::{NativePType, PType}; +use crate::serde::ArrayView; +use crate::validity::Validity; +use vortex_error::{vortex_err, VortexResult}; + +pub struct PrimitiveView<'a> { + ptype: PType, + buffer: &'a Buffer, + // TODO(ngates): look at a ValidityView? + validity: Option, +} + +impl<'a> PrimitiveView<'a> { + pub fn try_new(view: &'a ArrayView<'a>) -> VortexResult { + // TODO(ngates): validate the number of buffers / children. We could even extract them? + let ptype = PType::try_from(view.dtype())?; + let buffer = view + .buffers() + .first() + .ok_or_else(|| vortex_err!(InvalidSerde: "Missing primitive buffer"))?; + let validity = view + .child(0, &Validity::DTYPE) + // FIXME(ngates): avoid this clone. + .map(|v| Validity::Array(Array::to_array(&v))); + + Ok(Self { + ptype, + buffer, + validity, + }) + } + + pub fn ptype(&self) -> PType { + self.ptype + } + + pub fn as_trait(&self) -> &dyn PrimitiveTrait { + assert_eq!(self.ptype, T::PTYPE); + self + } +} + +impl<'a, T: NativePType> PrimitiveTrait for PrimitiveView<'a> { + fn ptype(&self) -> PType { + self.ptype + } + + fn validity(&self) -> Option { + self.validity.clone() + } + + fn typed_data(&self) -> &[T] { + self.buffer.typed_data::() + } + + fn to_array(&self) -> ArrayRef { + PrimitiveArray::new(self.ptype, self.buffer.clone(), self.validity.clone()).into_array() + } +} + +// The question is how can we implement ArrayCompute for PrimitiveArray + PrimitiveView? +// We can't use a trait since typed_data doesn't work? Or maybe we can but we just return Buffer? +pub trait PrimitiveTrait { + fn ptype(&self) -> PType; + fn validity(&self) -> Option; + fn typed_data(&self) -> &[T]; + fn to_array(&self) -> ArrayRef; +} + +// TODO(ngates): migrate all primitive compute over to PrimitiveTrait. 
+impl ArrayCompute for &dyn PrimitiveTrait { + fn flatten(&self) -> Option<&dyn FlattenFn> { + Some(self) + } + + fn take(&self) -> Option<&dyn TakeFn> { + Some(self) + } +} + +impl FlattenFn for &dyn PrimitiveTrait { + fn flatten(&self) -> VortexResult { + todo!() + } +} + +impl TakeFn for &dyn PrimitiveTrait { + fn take(&self, indices: &dyn Array) -> VortexResult { + let validity = self.validity().map(|v| v.take(indices)).transpose()?; + let indices = flatten_primitive(indices)?; + match_each_integer_ptype!(indices.ptype(), |$I| { + Ok(PrimitiveArray::from_nullable( + take_primitive(self.typed_data(), indices.typed_data::<$I>()), + validity, + ).into_array()) + }) + } +} + +fn take_primitive(array: &[T], indices: &[I]) -> Vec { + indices + .iter() + .map(|&idx| array[idx.to_usize().unwrap()]) + .collect() +} diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index 87b987bec6..aa25f1aa22 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -3,21 +3,21 @@ use std::sync::{Arc, RwLock}; use itertools::Itertools; use linkme::distributed_slice; -use vortex_error::{vortex_bail, VortexResult}; -use vortex_schema::DType; - -use crate::array::ENCODINGS; -use crate::array::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; use crate::compress::EncodingCompression; use crate::compute::cast::cast; use crate::compute::flatten::flatten_primitive; use crate::compute::search_sorted::{search_sorted, SearchSortedSide}; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::ptype::PType; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsCompute, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; +use 
vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::DType; mod compress; mod compute; @@ -140,6 +140,11 @@ impl Array for SparseArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.indices())?; + walker.visit_child(self.values()) + } } impl StatsCompute for SparseArray {} diff --git a/vortex-array/src/array/sparse/serde.rs b/vortex-array/src/array/sparse/serde.rs index 117cfdd1af..34930dc6ff 100644 --- a/vortex-array/src/array/sparse/serde.rs +++ b/vortex-array/src/array/sparse/serde.rs @@ -16,6 +16,16 @@ impl ArraySerde for SparseArray { ctx.write(self.indices())?; ctx.write(self.values()) } + + fn metadata(&self) -> VortexResult>> { + // FIXME(ngates): use flatbuffer / serde. + let mut vec = Vec::new(); + let mut ctx = WriteCtx::new(&mut vec); + ctx.write_usize(self.len())?; + // TODO(robert): Rewrite indices and don't store offset + ctx.write_usize(self.indices_offset())?; + Ok(Some(vec)) + } } impl EncodingSerde for SparseEncoding { diff --git a/vortex-array/src/array/struct_/compress.rs b/vortex-array/src/array/struct_/compress.rs index 7350ef8c45..7b37270a4e 100644 --- a/vortex-array/src/array/struct_/compress.rs +++ b/vortex-array/src/array/struct_/compress.rs @@ -36,6 +36,6 @@ impl EncodingCompression for StructEncoding { }) .try_collect()?; - Ok(StructArray::new(struct_array.names().clone(), fields).into_array()) + Ok(StructArray::new(struct_array.names().clone(), fields, array.len()).into_array()) } } diff --git a/vortex-array/src/array/struct_/compute.rs b/vortex-array/src/array/struct_/compute.rs index 2338289ac7..0ae2d89b2d 100644 --- a/vortex-array/src/array/struct_/compute.rs +++ b/vortex-array/src/array/struct_/compute.rs @@ -87,6 +87,7 @@ impl AsContiguousFn for StructArray { .iter() .map(|field_arrays| as_contiguous(field_arrays)) .try_collect()?, + self.len, ) .into_array()) } @@ -100,6 +101,7 @@ impl FlattenFn for StructArray 
{ .iter() .map(|field| flatten(field.as_ref()).map(FlattenedArray::into_array)) .try_collect()?, + self.len, ))) } } @@ -125,6 +127,7 @@ impl TakeFn for StructArray { .iter() .map(|field| take(field, indices)) .try_collect()?, + indices.len(), ) .into_array()) } diff --git a/vortex-array/src/array/struct_/mod.rs b/vortex-array/src/array/struct_/mod.rs index 4fe222b54c..22406ba11c 100644 --- a/vortex-array/src/array/struct_/mod.rs +++ b/vortex-array/src/array/struct_/mod.rs @@ -3,17 +3,18 @@ use std::sync::{Arc, RwLock}; use itertools::Itertools; use linkme::distributed_slice; -use vortex_error::VortexResult; -use vortex_schema::{DType, FieldNames}; - use crate::compress::EncodingCompression; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsCompute, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; +use vortex_error::VortexResult; +use vortex_schema::{DType, FieldNames}; -use super::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS}; +use super::{check_slice_bounds, Array, ArrayRef}; mod compress; mod compute; @@ -23,19 +24,21 @@ mod serde; pub struct StructArray { fields: Vec, dtype: DType, + len: usize, stats: Arc>, } impl StructArray { - pub fn new(names: FieldNames, fields: Vec) -> Self { + pub fn new(names: FieldNames, fields: Vec, len: usize) -> Self { assert!( - fields.iter().map(|v| v.len()).all_equal(), + fields.iter().all(|v| v.len() == len), "Fields didn't have the same length" ); let dtype = DType::Struct(names, fields.iter().map(|a| a.dtype().clone()).collect()); Self { fields, dtype, + len, stats: Arc::new(RwLock::new(StatsSet::new())), } } @@ -66,7 +69,7 @@ impl Array for StructArray { impl_array!(); fn len(&self) -> usize { - self.fields.first().map_or(0, |a| 
a.len()) + self.len } #[inline] @@ -95,6 +98,7 @@ impl Array for StructArray { Ok(Self { fields, dtype: self.dtype.clone(), + len: stop - start, stats: Arc::new(RwLock::new(StatsSet::new())), } .into_array()) @@ -112,6 +116,13 @@ impl Array for StructArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + for field in self.fields() { + walker.visit_child(field)?; + } + Ok(()) + } } impl ArrayValidity for StructArray { diff --git a/vortex-array/src/array/struct_/serde.rs b/vortex-array/src/array/struct_/serde.rs index 5a83c6247a..6912f9603f 100644 --- a/vortex-array/src/array/struct_/serde.rs +++ b/vortex-array/src/array/struct_/serde.rs @@ -1,22 +1,52 @@ +use itertools::Itertools; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; use crate::array::struct_::{StructArray, StructEncoding}; use crate::array::{Array, ArrayRef}; -use crate::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; +use crate::serde::{ArraySerde, ArrayView, EncodingSerde, ReadCtx, WriteCtx}; impl ArraySerde for StructArray { fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { + ctx.write_usize(self.len())?; ctx.write_usize(self.fields().len())?; for f in self.fields() { ctx.write(f.as_ref())?; } Ok(()) } + + fn metadata(&self) -> VortexResult>> { + let length = self.len() as u64; + Ok(Some(length.to_le_bytes().to_vec())) + } } impl EncodingSerde for StructEncoding { + fn to_array(&self, view: &ArrayView) -> ArrayRef { + let DType::Struct(names, fields) = view.dtype() else { + panic!("Incorrect DType {}", view.dtype()) + }; + assert_eq!(fields.len(), view.nchildren()); + StructArray::new( + names.clone(), + fields + .iter() + .enumerate() + .map(|(i, field)| view.child(i, field).unwrap().into_array()) + .collect_vec(), + self.len(view), + ) + .into_array() + } + + fn len(&self, view: &ArrayView) -> usize { + let length = 
u64::from_le_bytes(view.metadata().unwrap().try_into().unwrap()); + length as usize + } + fn read(&self, ctx: &mut ReadCtx) -> VortexResult { + let len = ctx.read_usize()?; let num_fields = ctx.read_usize()?; let mut fields = Vec::::with_capacity(num_fields); // TODO(robert): use read_vectored @@ -26,7 +56,7 @@ impl EncodingSerde for StructEncoding { let DType::Struct(names, _) = ctx.schema() else { vortex_bail!(MismatchedTypes: "any struct", ctx.schema()); }; - Ok(StructArray::new(names.clone(), fields).into_array()) + Ok(StructArray::new(names.clone(), fields, len).into_array()) } } @@ -52,6 +82,7 @@ mod test { vec![7u8, 37, 71, 97].into_array(), PrimitiveArray::from_iter(vec![Some(0), None, Some(2), Some(42)]).into_array(), ], + 4, ); let read_arr = roundtrip_array(&arr).unwrap(); diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index db92e6cbec..ca68ffd198 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -8,19 +8,19 @@ use vortex_schema::{DType, IntWidth, Nullability, Signedness}; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::primitive::PrimitiveArray; -use crate::array::{ - check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; use crate::compress::EncodingCompression; use crate::compute::flatten::flatten_primitive; use crate::compute::scalar_at::scalar_at; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::iterator::ArrayIter; use crate::ptype::NativePType; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; mod accessor; mod builder; @@ -244,6 +244,11 @@ impl Array for VarBinArray { fn serde(&self) -> 
Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.offsets())?; + walker.visit_child(self.bytes()) + } } impl ArrayValidity for VarBinArray { diff --git a/vortex-array/src/array/varbin/serde.rs b/vortex-array/src/array/varbin/serde.rs index 22acd8a74f..7a9d278b4b 100644 --- a/vortex-array/src/array/varbin/serde.rs +++ b/vortex-array/src/array/varbin/serde.rs @@ -12,6 +12,10 @@ impl ArraySerde for VarBinArray { ctx.write(self.offsets())?; ctx.write(self.bytes()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for VarBinEncoding { diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index db8d2b86b2..4872fad606 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -6,15 +6,15 @@ use linkme::distributed_slice; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, IntWidth, Nullability, Signedness}; -use crate::array::{ - check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef, ENCODINGS, -}; +use crate::array::{check_slice_bounds, Array, ArrayRef}; use crate::compute::flatten::flatten_primitive; +use crate::compute::ArrayCompute; +use crate::encoding::{Encoding, EncodingId, EncodingRef, ENCODINGS}; use crate::formatter::{ArrayDisplay, ArrayFormatter}; -use crate::impl_array; use crate::serde::{ArraySerde, EncodingSerde}; use crate::stats::{Stats, StatsSet}; use crate::validity::{ArrayValidity, Validity}; +use crate::{impl_array, ArrayWalker}; mod compute; mod serde; @@ -233,6 +233,14 @@ impl Array for VarBinViewArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.views())?; + for data in self.data() { + walker.visit_child(data)?; + } + Ok(()) + } } impl ArrayValidity for VarBinViewArray { diff --git 
a/vortex-array/src/array/varbinview/serde.rs b/vortex-array/src/array/varbinview/serde.rs index feb16812b7..3d0eae040b 100644 --- a/vortex-array/src/array/varbinview/serde.rs +++ b/vortex-array/src/array/varbinview/serde.rs @@ -15,6 +15,10 @@ impl ArraySerde for VarBinViewArray { } Ok(()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for VarBinViewEncoding { diff --git a/vortex-array/src/arrow/recordbatch.rs b/vortex-array/src/arrow/recordbatch.rs index 3cc307edf0..326220ec19 100644 --- a/vortex-array/src/arrow/recordbatch.rs +++ b/vortex-array/src/arrow/recordbatch.rs @@ -19,6 +19,7 @@ impl IntoArray for &RecordBatch { .zip(self.schema().fields()) .map(|(array, field)| ArrayRef::from_arrow(array.clone(), field.is_nullable())) .collect(), + self.num_rows(), ) .into_array() } diff --git a/vortex-array/src/compress.rs b/vortex-array/src/compress.rs index 1f1b4d4629..e2a864c5f3 100644 --- a/vortex-array/src/compress.rs +++ b/vortex-array/src/compress.rs @@ -12,9 +12,10 @@ use crate::array::constant::ConstantArray; use crate::array::sparse::SparseEncoding; use crate::array::struct_::{StructArray, StructEncoding}; use crate::array::varbin::VarBinEncoding; -use crate::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingRef, ENCODINGS}; +use crate::array::{Array, ArrayKind, ArrayRef}; use crate::compute; use crate::compute::scalar_at::scalar_at; +use crate::encoding::{Encoding, EncodingRef, ENCODINGS}; use crate::formatter::display_tree; use crate::sampling::stratified_slices; use crate::stats::Stat; @@ -241,7 +242,10 @@ impl CompressCtx { .iter() .map(|field| self.compress_array(field)) .collect(); - Ok(StructArray::new(strct.names().clone(), compressed_fields?).into_array()) + Ok( + StructArray::new(strct.names().clone(), compressed_fields?, strct.len()) + .into_array(), + ) } _ => { // Otherwise, we run sampled compression over pluggable encodings diff --git a/vortex-array/src/compute/as_arrow.rs 
b/vortex-array/src/compute/as_arrow.rs index df5bbe22b0..85ed27a747 100644 --- a/vortex-array/src/compute/as_arrow.rs +++ b/vortex-array/src/compute/as_arrow.rs @@ -4,24 +4,25 @@ use itertools::Itertools; use vortex_error::{vortex_err, VortexResult}; use crate::array::downcast::DowncastArrayBuiltin; -use crate::array::Array; +use crate::array::{Array, WithArrayCompute}; use crate::compute::flatten::flatten; -use crate::compute::ArrayCompute; pub trait AsArrowArray { fn as_arrow(&self) -> VortexResult; } pub fn as_arrow(array: &dyn Array) -> VortexResult { - // If as_arrow is implemented, then invoke that. - if let Some(a) = array.as_arrow() { - return a.as_arrow(); - } + array.with_compute(|c| { + // If as_arrow is implemented, then invoke that. + if let Some(a) = c.as_arrow() { + return a.as_arrow(); + } - // Otherwise, flatten and try again. - let array = flatten(array)?.into_array(); - array.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "as_arrow", array.encoding().id().name())) + // Otherwise, flatten and try again. 
+ let array = flatten(array)?.into_array(); + c.as_arrow().map(|a| a.as_arrow()).unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "as_arrow", array.encoding().id().name())) + }) }) } diff --git a/vortex-array/src/compute/as_contiguous.rs b/vortex-array/src/compute/as_contiguous.rs index b1ae4e1787..924e501978 100644 --- a/vortex-array/src/compute/as_contiguous.rs +++ b/vortex-array/src/compute/as_contiguous.rs @@ -2,7 +2,7 @@ use itertools::Itertools; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::ArrayRef; +use crate::array::{Array, ArrayRef, WithArrayCompute}; pub trait AsContiguousFn { fn as_contiguous(&self, arrays: &[ArrayRef]) -> VortexResult; @@ -19,13 +19,14 @@ pub fn as_contiguous(arrays: &[ArrayRef]) -> VortexResult { } let first = arrays.first().unwrap(); - first - .as_contiguous() - .map(|f| f.as_contiguous(arrays)) - .unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "as_contiguous", - first.encoding().id().name() - )) - }) + first.with_compute(|c| { + c.as_contiguous() + .map(|f| f.as_contiguous(arrays)) + .unwrap_or_else(|| { + Err(vortex_err!( + NotImplemented: "as_contiguous", + first.encoding().id().name() + )) + }) + }) } diff --git a/vortex-array/src/compute/cast.rs b/vortex-array/src/compute/cast.rs index 0c35a20ad2..f254307f7f 100644 --- a/vortex-array/src/compute/cast.rs +++ b/vortex-array/src/compute/cast.rs @@ -1,7 +1,7 @@ use vortex_error::{vortex_err, VortexResult}; use vortex_schema::DType; -use crate::array::{Array, ArrayRef}; +use crate::array::{Array, ArrayRef, WithArrayCompute}; pub trait CastFn { fn cast(&self, dtype: &DType) -> VortexResult; @@ -13,8 +13,9 @@ pub fn cast(array: &dyn Array, dtype: &DType) -> VortexResult { } // TODO(ngates): check for null_count if dtype is non-nullable - array - .cast() - .map(|f| f.cast(dtype)) - .unwrap_or_else(|| Err(vortex_err!(NotImplemented: "cast", array.encoding().id().name()))) + array.with_compute(|c| { + c.cast().map(|f| 
f.cast(dtype)).unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "cast", array.encoding().id().name())) + }) + }) } diff --git a/vortex-array/src/compute/fill.rs b/vortex-array/src/compute/fill.rs index 86c43a3e72..aa2a8a9083 100644 --- a/vortex-array/src/compute/fill.rs +++ b/vortex-array/src/compute/fill.rs @@ -1,6 +1,6 @@ use vortex_error::{vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef}; +use crate::array::{Array, ArrayRef, WithArrayCompute}; pub trait FillForwardFn { fn fill_forward(&self) -> VortexResult; @@ -11,13 +11,14 @@ pub fn fill_forward(array: &dyn Array) -> VortexResult { return Ok(array.to_array()); } - array - .fill_forward() - .map(|t| t.fill_forward()) - .unwrap_or_else(|| { - Err(vortex_err!( - NotImplemented: "fill_forward", - array.encoding().id().name() - )) - }) + array.with_compute(|c| { + c.fill_forward() + .map(|t| t.fill_forward()) + .unwrap_or_else(|| { + Err(vortex_err!( + NotImplemented: "fill_forward", + array.encoding().id().name() + )) + }) + }) } diff --git a/vortex-array/src/compute/flatten.rs b/vortex-array/src/compute/flatten.rs index 672e0ffbb0..446eedb684 100644 --- a/vortex-array/src/compute/flatten.rs +++ b/vortex-array/src/compute/flatten.rs @@ -7,7 +7,7 @@ use crate::array::primitive::PrimitiveArray; use crate::array::struct_::StructArray; use crate::array::varbin::VarBinArray; use crate::array::varbinview::VarBinViewArray; -use crate::array::{Array, ArrayRef}; +use crate::array::{Array, ArrayRef, WithArrayCompute}; pub trait FlattenFn { fn flatten(&self) -> VortexResult; @@ -41,8 +41,10 @@ impl FlattenedArray { /// Flatten an array into one of the flat encodings. /// This does not guarantee that the array is recursively flattened. 
pub fn flatten(array: &dyn Array) -> VortexResult { - array.flatten().map(|f| f.flatten()).unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "flatten", array.encoding().id().name())) + array.with_compute(|c| { + c.flatten().map(|f| f.flatten()).unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "flatten", array.encoding().id().name())) + }) }) } diff --git a/vortex-array/src/compute/patch.rs b/vortex-array/src/compute/patch.rs index 04e89e4e2a..8a98e5ea77 100644 --- a/vortex-array/src/compute/patch.rs +++ b/vortex-array/src/compute/patch.rs @@ -1,6 +1,6 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef}; +use crate::array::{Array, ArrayRef, WithArrayCompute}; pub trait PatchFn { fn patch(&self, patch: &dyn Array) -> VortexResult; @@ -20,8 +20,9 @@ pub fn patch(array: &dyn Array, patch: &dyn Array) -> VortexResult { vortex_bail!(MismatchedTypes: array.dtype(), patch.dtype()); } - array - .patch() - .map(|t| t.patch(patch)) - .unwrap_or_else(|| Err(vortex_err!(NotImplemented: "take", array.encoding().id().name()))) + array.with_compute(|c| { + c.patch().map(|t| t.patch(patch)).unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) + }) + }) } diff --git a/vortex-array/src/compute/scalar_at.rs b/vortex-array/src/compute/scalar_at.rs index 9556679a83..ee4855b50f 100644 --- a/vortex-array/src/compute/scalar_at.rs +++ b/vortex-array/src/compute/scalar_at.rs @@ -1,6 +1,6 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::array::Array; +use crate::array::{Array, WithArrayCompute}; use crate::scalar::Scalar; pub trait ScalarAtFn { @@ -12,10 +12,11 @@ pub fn scalar_at(array: &dyn Array, index: usize) -> VortexResult { vortex_bail!(OutOfBounds: index, 0, array.len()); } - array - .scalar_at() - .map(|t| t.scalar_at(index)) - .unwrap_or_else(|| { - Err(vortex_err!(NotImplemented: "scalar_at", array.encoding().id().name())) - }) + array.with_compute(|c| { + 
c.scalar_at() + .map(|t| t.scalar_at(index)) + .unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "scalar_at", array.encoding().id().name())) + }) + }) } diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 0be65c65e4..90f21b8023 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering::{Equal, Greater, Less}; use vortex_error::{vortex_err, VortexResult}; -use crate::array::Array; +use crate::array::{Array, WithArrayCompute}; use crate::compute::scalar_at::scalar_at; use crate::scalar::Scalar; @@ -23,18 +23,20 @@ pub fn search_sorted>( side: SearchSortedSide, ) -> VortexResult { let scalar = target.into().cast(array.dtype())?; - if let Some(search_sorted) = array.search_sorted() { - return search_sorted.search_sorted(&scalar, side); - } + array.with_compute(|c| { + if let Some(search_sorted) = c.search_sorted() { + return search_sorted.search_sorted(&scalar, side); + } - if array.scalar_at().is_some() { - return Ok(SearchSorted::search_sorted(&array, &scalar, side)); - } + if c.scalar_at().is_some() { + return Ok(SearchSorted::search_sorted(&array, &scalar, side)); + } - Err(vortex_err!( - NotImplemented: "search_sorted", - array.encoding().id().name() - )) + Err(vortex_err!( + NotImplemented: "search_sorted", + array.encoding().id().name() + )) + }) } pub trait IndexOrd { diff --git a/vortex-array/src/compute/take.rs b/vortex-array/src/compute/take.rs index 193b92a374..35f18af44c 100644 --- a/vortex-array/src/compute/take.rs +++ b/vortex-array/src/compute/take.rs @@ -2,7 +2,7 @@ use log::info; use vortex_error::{vortex_err, VortexResult}; -use crate::array::{Array, ArrayRef}; +use crate::array::{Array, ArrayRef, WithArrayCompute}; use crate::compute::flatten::flatten; pub trait TakeFn { @@ -10,15 +10,17 @@ pub trait TakeFn { } pub fn take(array: &dyn Array, indices: &dyn Array) -> VortexResult { - if let Some(take) = 
array.take() { - return take.take(indices); - } + array.with_compute(|c| { + if let Some(take) = c.take() { + return take.take(indices); + } - // Otherwise, flatten and try again. - info!("TakeFn not implemented for {}, flattening", array); - flatten(array)? - .into_array() - .take() - .map(|t| t.take(indices)) - .unwrap_or_else(|| Err(vortex_err!(NotImplemented: "take", array.encoding().id().name()))) + // Otherwise, flatten and try again. + info!("TakeFn not implemented for {}, flattening", array); + flatten(array)?.into_array().with_compute(|c| { + c.take().map(|t| t.take(indices)).unwrap_or_else(|| { + Err(vortex_err!(NotImplemented: "take", array.encoding().id().name())) + }) + }) + }) } diff --git a/vortex-array/src/encode.rs b/vortex-array/src/encode.rs index 666c0f1eb2..a82dc1d5a8 100644 --- a/vortex-array/src/encode.rs +++ b/vortex-array/src/encode.rs @@ -170,6 +170,7 @@ impl FromArrowArray<&ArrowStructArray> for ArrayRef { .zip(value.fields()) .map(|(c, field)| ArrayRef::from_arrow(c.clone(), field.is_nullable())) .collect(), + value.len(), ) .into_array() } diff --git a/vortex-array/src/encoding.rs b/vortex-array/src/encoding.rs new file mode 100644 index 0000000000..3943b3d706 --- /dev/null +++ b/vortex-array/src/encoding.rs @@ -0,0 +1,68 @@ +use crate::compress::EncodingCompression; +use crate::serde::EncodingSerde; +use linkme::distributed_slice; +use std::fmt::{Debug, Display, Formatter}; +use std::hash::{Hash, Hasher}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub struct EncodingId(&'static str); + +impl EncodingId { + pub const fn new(id: &'static str) -> Self { + Self(id) + } + + #[inline] + pub fn name(&self) -> &'static str { + self.0 + } +} + +impl Display for EncodingId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Display::fmt(self.0, f) + } +} + +pub trait Encoding: Debug + Send + Sync + 'static { + fn id(&self) -> EncodingId; + + /// Whether this encoding provides a compressor. 
+ fn compression(&self) -> Option<&dyn EncodingCompression> { + None + } + + /// Array serialization + fn serde(&self) -> Option<&dyn EncodingSerde> { + None + } +} + +impl Display for dyn Encoding { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.id()) + } +} + +pub type EncodingRef = &'static dyn Encoding; + +impl PartialEq for EncodingRef { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for EncodingRef {} + +impl Hash for EncodingRef { + fn hash(&self, state: &mut H) { + self.id().hash(state) + } +} + +#[distributed_slice] +pub static ENCODINGS: [EncodingRef] = [..]; + +pub fn find_encoding(id: &str) -> Option { + ENCODINGS.iter().find(|&x| x.id().name() == id).cloned() +} diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 453b08058c..dc428993e1 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -1,3 +1,5 @@ +extern crate core; + pub mod array; pub mod arrow; pub mod scalar; @@ -7,6 +9,7 @@ pub mod compress; pub mod compute; pub mod datetime; pub mod encode; +pub mod encoding; pub mod formatter; pub mod iterator; pub mod ptype; @@ -14,3 +17,19 @@ mod sampling; pub mod serde; pub mod stats; pub mod validity; + +mod walk; + +pub use walk::*; + +pub mod flatbuffers { + pub use generated::vortex::*; + + #[allow(unused_imports)] + #[allow(dead_code)] + #[allow(non_camel_case_types)] + #[allow(clippy::all)] + mod generated { + include!(concat!(env!("OUT_DIR"), "/flatbuffers/array.rs")); + } +} diff --git a/vortex-array/src/serde/context.rs b/vortex-array/src/serde/context.rs new file mode 100644 index 0000000000..ec5bcfb33e --- /dev/null +++ b/vortex-array/src/serde/context.rs @@ -0,0 +1,37 @@ +use crate::encoding::{EncodingId, EncodingRef, ENCODINGS}; +use itertools::Itertools; +use std::sync::Arc; + +#[derive(Debug)] +pub struct SerdeContext { + encodings: Arc<[EncodingRef]>, +} + +impl SerdeContext { + pub fn new(encodings: Arc<[EncodingRef]>) -> Self { + 
Self { encodings } + } + + pub fn encodings(&self) -> &[EncodingRef] { + self.encodings.as_ref() + } + + pub fn find_encoding(&self, encoding_id: u16) -> Option { + self.encodings.get(encoding_id as usize).cloned() + } + + pub fn encoding_idx(&self, encoding_id: EncodingId) -> Option { + self.encodings + .iter() + .position(|e| e.id() == encoding_id) + .map(|i| i as u16) + } +} + +impl Default for SerdeContext { + fn default() -> Self { + Self { + encodings: ENCODINGS.iter().cloned().collect_vec().into(), + } + } +} diff --git a/vortex-array/src/serde/data.rs b/vortex-array/src/serde/data.rs new file mode 100644 index 0000000000..cbc59340a0 --- /dev/null +++ b/vortex-array/src/serde/data.rs @@ -0,0 +1,129 @@ +use crate::array::Array; +use crate::encoding::EncodingId; +use crate::walk::ArrayWalker; +use arrow_buffer::Buffer; +use vortex_error::{vortex_err, VortexResult}; + +pub struct ArrayData { + columns: Vec, +} + +impl ArrayData { + pub fn new(columns: Vec) -> Self { + Self { columns } + } + + pub fn columns(&self) -> &[ColumnData] { + &self.columns + } +} + +#[derive(Debug)] +pub struct ColumnData { + encoding: EncodingId, + metadata: Option, + children: Vec, + buffers: Vec, +} + +impl ColumnData { + pub fn try_from_array(array: &dyn Array) -> VortexResult { + let mut data = ColumnData { + encoding: array.encoding().id(), + metadata: array + .serde() + .ok_or_else(|| { + vortex_err!(InvalidSerde: "Array {} does not support serde", array.encoding()) + })? + .metadata()? 
+ .map(Buffer::from_vec), + children: Vec::new(), + buffers: Vec::new(), + }; + array.walk(&mut data)?; + Ok(data) + } + + pub fn new( + encoding: EncodingId, + metadata: Option, + children: Vec, + buffers: Vec, + ) -> Self { + Self { + encoding, + metadata, + children, + buffers, + } + } + + pub fn encoding(&self) -> EncodingId { + self.encoding + } + + pub fn metadata(&self) -> Option<&Buffer> { + self.metadata.as_ref() + } + + pub fn children(&self) -> &[ColumnData] { + &self.children + } + + pub fn buffers(&self) -> &[Buffer] { + &self.buffers + } + + pub fn depth_first_traversal(&self) -> ColumnDataIterator { + ColumnDataIterator { stack: vec![self] } + } + + /// Return the buffer offsets and the total length of all buffers, assuming the given alignment. + /// This includes all child buffers. + pub fn all_buffer_offsets(&self, alignment: usize) -> Vec { + let mut offsets = Vec::with_capacity(self.buffers.len() + 1); + let mut offset = 0; + + for col_data in self.depth_first_traversal() { + for buffer in col_data.buffers() { + offsets.push(offset as u64); + + let buffer_size = buffer.len(); + let aligned_size = (buffer_size + (alignment - 1)) & !(alignment - 1); + offset += aligned_size; + } + } + offsets.push(offset as u64); + + offsets + } +} + +impl ArrayWalker for ColumnData { + fn visit_child(&mut self, array: &dyn Array) -> VortexResult<()> { + self.children.push(ColumnData::try_from_array(array)?); + Ok(()) + } + + fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> { + self.buffers.push(buffer.clone()); + Ok(()) + } +} + +/// A depth-first iterator over a ColumnData. 
+pub struct ColumnDataIterator<'a> { + stack: Vec<&'a ColumnData>, +} + +impl<'a> Iterator for ColumnDataIterator<'a> { + type Item = &'a ColumnData; + + fn next(&mut self) -> Option { + let next = self.stack.pop()?; + for child in &next.children { + self.stack.push(child); + } + Some(next) + } +} diff --git a/vortex-array/src/serde/mod.rs b/vortex-array/src/serde/mod.rs index 0d7543e86d..ac3920a768 100644 --- a/vortex-array/src/serde/mod.rs +++ b/vortex-array/src/serde/mod.rs @@ -1,25 +1,69 @@ +use arrow_buffer::BooleanBuffer; use std::io; -use std::io::{ErrorKind, Read, Write}; +use std::io::{Cursor, ErrorKind, Read, Write}; use arrow_buffer::buffer::{Buffer, MutableBuffer}; +use flatbuffers::root; +use itertools::Itertools; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::{DType, FbDeserialize, FbSerialize, IntWidth, Nullability, Signedness}; - -use crate::array::composite::find_extension_id; -use crate::array::{Array, ArrayRef, EncodingId, ENCODINGS}; +use crate::array::composite::COMPOSITE_EXTENSIONS; +use crate::array::{Array, ArrayRef}; +use crate::encoding::{find_encoding, EncodingId, ENCODINGS}; use crate::ptype::PType; use crate::scalar::{Scalar, ScalarReader, ScalarWriter}; use crate::serde::ptype::PTypeTag; use crate::validity::Validity; +use vortex_error::{vortex_err, VortexResult}; +use vortex_schema::DTypeSerdeContext; +use vortex_schema::{DType, IntWidth, Nullability, Signedness}; +pub mod context; +pub mod data; mod ptype; +pub mod view; + +use crate::array::bool::BoolArray; +use crate::compute::ArrayCompute; +pub use view::*; +use vortex_flatbuffers::{FlatBufferToBytes, ReadFlatBuffer}; pub trait ArraySerde { fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()>; + + fn metadata(&self) -> VortexResult>>; } pub trait EncodingSerde { + fn validate(&self, _view: &ArrayView) -> VortexResult<()> { + Ok(()) + // todo!("Validate not implemented for {}", _view.encoding().id()); + } + + fn to_array(&self, view: &ArrayView) -> ArrayRef { 
+ BoolArray::new( + BooleanBuffer::new(view.buffers().first().unwrap().clone(), 0, view.len()), + view.child(0, &Validity::DTYPE) + .map(|c| Validity::Array(c.into_array())), + ) + .into_array() + } + + // TODO(ngates): remove this ideally? It can error... Maybe store lengths in array views? + fn len(&self, _view: &ArrayView) -> usize { + todo!( + "EncodingSerde.len not implemented for {}", + _view.encoding().id() + ); + } + + fn with_view_compute<'view>( + &self, + _view: &'view ArrayView, + _f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + Err(vortex_err!(ComputeError: "Compute not implemented")) + } + fn read(&self, ctx: &mut ReadCtx) -> VortexResult; } @@ -32,6 +76,22 @@ where fn deserialize(data: &[u8]) -> VortexResult; } +impl BytesSerde for usize { + fn serialize(&self) -> Vec { + let mut vec = Vec::new(); + // IOError only happens on EOF. + leb128::write::unsigned(&mut vec, *self as u64).unwrap(); + vec + } + + fn deserialize(data: &[u8]) -> VortexResult { + let mut cursor = Cursor::new(data); + leb128::read::unsigned(&mut cursor) + .map(|v| v as usize) + .map_err(|e| vortex_err!(InvalidSerde: "Failed to parse leb128 {}", e)) + } +} + pub struct ReadCtx<'a> { schema: &'a DType, encodings: Vec, @@ -82,7 +142,11 @@ impl<'a> ReadCtx<'a> { #[inline] pub fn dtype(&mut self) -> VortexResult { let dtype_bytes = self.read_slice()?; - DType::deserialize(&dtype_bytes, find_extension_id) + let ctx = DTypeSerdeContext::new(COMPOSITE_EXTENSIONS.iter().map(|e| e.id()).collect_vec()); + DType::read_flatbuffer( + &ctx, + &(root::(&dtype_bytes)?), + ) } pub fn ptype(&mut self) -> VortexResult { @@ -170,11 +234,8 @@ impl<'a> ReadCtx<'a> { pub fn read(&mut self) -> VortexResult { let encoding_id = self.read_usize()?; - if let Some(serde) = ENCODINGS - .iter() - .filter(|e| e.id().name() == self.encodings[encoding_id].name()) - .flat_map(|e| e.serde()) - .next() + if let Some(serde) = + 
find_encoding(self.encodings[encoding_id].name()).and_then(|e| e.serde()) { serde.read(self) } else { @@ -198,8 +259,8 @@ impl<'a> WriteCtx<'a> { } pub fn dtype(&mut self, dtype: &DType) -> VortexResult<()> { - let (bytes, head) = dtype.serialize(); - self.write_slice(&bytes[head..]) + let (bytes, offset) = dtype.flatbuffer_to_bytes(); + self.write_slice(&bytes[offset..]) } pub fn ptype(&mut self, ptype: PType) -> VortexResult<()> { diff --git a/vortex-array/src/serde/view.rs b/vortex-array/src/serde/view.rs new file mode 100644 index 0000000000..eb4c1fab49 --- /dev/null +++ b/vortex-array/src/serde/view.rs @@ -0,0 +1,221 @@ +use crate::array::{Array, ArrayRef}; +use crate::compute::ArrayCompute; +use crate::encoding::EncodingRef; +use crate::flatbuffers::array as fb; +use crate::formatter::{ArrayDisplay, ArrayFormatter}; +use crate::serde::context::SerdeContext; +use crate::serde::EncodingSerde; +use crate::stats::Stats; +use crate::validity::{ArrayValidity, Validity}; +use crate::ArrayWalker; +use arrow_buffer::Buffer; +use std::any::Any; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; +use vortex_schema::DType; + +#[derive(Clone)] +pub struct ArrayView<'a> { + encoding: EncodingRef, + dtype: &'a DType, + array: fb::Array<'a>, + buffers: &'a [Buffer], + ctx: &'a SerdeContext, +} + +impl<'a> Debug for ArrayView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ArrayView") + .field("encoding", &self.encoding) + .field("dtype", &self.dtype) + // .field("array", &self.array) + .field("buffers", &self.buffers) + .field("ctx", &self.ctx) + .finish() + } +} + +impl<'a> ArrayView<'a> { + pub fn try_new( + ctx: &'a SerdeContext, + dtype: &'a DType, + array: fb::Array<'a>, + buffers: &'a [Buffer], + ) -> VortexResult { + let encoding = ctx + .find_encoding(array.encoding()) + .ok_or_else(|| vortex_err!(InvalidSerde: "Encoding ID out of bounds"))?; + let _vtable = 
encoding.serde().ok_or_else(|| { + // TODO(ngates): we could fall-back to heap-allocating? + vortex_err!(InvalidSerde: "Encoding {} does not support serde", encoding) + })?; + + if buffers.len() != Self::cumulative_nbuffers(array) { + vortex_bail!(InvalidSerde: + "Incorrect number of buffers {}, expected {}", + buffers.len(), + Self::cumulative_nbuffers(array) + ) + } + + Ok(Self { + encoding, + dtype, + array, + buffers, + ctx, + }) + } + + pub fn encoding(&self) -> EncodingRef { + self.encoding + } + + pub fn vtable(&self) -> &dyn EncodingSerde { + self.encoding.serde().unwrap() + } + + pub fn dtype(&self) -> &DType { + self.dtype + } + + pub fn metadata(&self) -> Option<&'a [u8]> { + self.array.metadata().map(|m| m.bytes()) + } + + pub fn nchildren(&self) -> usize { + self.array.children().map(|c| c.len()).unwrap_or_default() + } + + pub fn child(&self, idx: usize, dtype: &'a vortex_schema::DType) -> Option> { + let child = self.array_child(idx)?; + + // Figure out how many buffers to skip... + // We store them depth-first. + let buffer_offset = self + .array + .children()? + .iter() + .take(idx) + .map(|child| Self::cumulative_nbuffers(child)) + .sum(); + let buffer_count = Self::cumulative_nbuffers(child); + + Some( + Self::try_new( + self.ctx, + dtype, + child, + &self.buffers[buffer_offset..][0..buffer_count], + ) + .unwrap(), + ) + } + + fn array_child(&self, idx: usize) -> Option> { + let children = self.array.children()?; + if idx < children.len() { + Some(children.get(idx)) + } else { + None + } + } + + /// The number of buffers used by the current Array. + pub fn nbuffers(&self) -> usize { + self.array.nbuffers() as usize + } + + /// The number of buffers used by the current Array and all its children. 
+ fn cumulative_nbuffers(array: fb::Array) -> usize { + let mut nbuffers = array.nbuffers() as usize; + for child in array.children().unwrap_or_default() { + nbuffers += Self::cumulative_nbuffers(child); + } + nbuffers + } + + pub fn buffers(&self) -> &'a [Buffer] { + // This is only true for the immediate current node? + &self.buffers[0..self.nbuffers()] + } +} + +impl<'a> Array for ArrayView<'a> { + fn as_any(&self) -> &dyn Any { + panic!("Not implemented for ArrayView") + } + + fn into_any(self: Arc) -> Arc { + panic!("Not implemented for ArrayView") + } + + fn to_array(&self) -> ArrayRef { + self.vtable().to_array(self) + } + + fn into_array(self) -> ArrayRef { + // Not much point adding VTable.into_array for ArrayView since everything is by-reference. + self.vtable().to_array(&self) + } + + fn len(&self) -> usize { + self.vtable().len(self) + } + + fn is_empty(&self) -> bool { + todo!() + // self.vtable.is_empty(self).unwrap() + } + + fn dtype(&self) -> &DType { + self.dtype + } + + fn stats(&self) -> Stats { + // TODO(ngates): implement a dynamic trait for stats? 
+ todo!() + } + + fn slice(&self, _start: usize, _stop: usize) -> VortexResult { + todo!() + } + + fn encoding(&self) -> EncodingRef { + self.encoding + } + + fn nbytes(&self) -> usize { + self.buffers.iter().map(|b| b.len()).sum() + } + + fn with_compute_mut( + &self, + f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, + ) -> VortexResult<()> { + self.encoding() + .serde() + .expect("TODO(ngates): heap allocate ArrayView and invoke compute") + .with_view_compute(self, f) + } + + fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { + todo!() + } +} + +impl<'a> ArrayValidity for ArrayView<'a> { + fn validity(&self) -> Option { + todo!() + } +} + +impl<'a> ArrayDisplay for ArrayView<'a> { + fn fmt(&self, fmt: &'_ mut ArrayFormatter) -> std::fmt::Result { + fmt.property("encoding", self.encoding)?; + fmt.property("dtype", self.dtype)?; + fmt.property("metadata", format!("{:?}", self.array.metadata()))?; + fmt.property("nchildren", self.nchildren()) + } +} diff --git a/vortex-array/src/walk.rs b/vortex-array/src/walk.rs new file mode 100644 index 0000000000..ea4571e8f3 --- /dev/null +++ b/vortex-array/src/walk.rs @@ -0,0 +1,9 @@ +use crate::array::Array; +use arrow_buffer::Buffer; +use vortex_error::VortexResult; + +pub trait ArrayWalker { + fn visit_child(&mut self, array: &dyn Array) -> VortexResult<()>; + + fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()>; +} diff --git a/vortex-datetime/src/datetime.rs b/vortex-datetime/src/datetime.rs index 4db00031f9..e4603111f1 100644 --- a/vortex-datetime/src/datetime.rs +++ b/vortex-datetime/src/datetime.rs @@ -1,12 +1,14 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use 
vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsCompute, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; @@ -116,6 +118,12 @@ impl Array for DateTimeArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.days())?; + walker.visit_child(self.seconds())?; + walker.visit_child(self.subsecond()) + } } impl StatsCompute for DateTimeArray {} diff --git a/vortex-datetime/src/lib.rs b/vortex-datetime/src/lib.rs index bd4625e172..0b928df34d 100644 --- a/vortex-datetime/src/lib.rs +++ b/vortex-datetime/src/lib.rs @@ -1,7 +1,7 @@ use linkme::distributed_slice; pub use datetime::*; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; mod compress; mod compute; diff --git a/vortex-datetime/src/serde.rs b/vortex-datetime/src/serde.rs index 70f237f355..95b41fb306 100644 --- a/vortex-datetime/src/serde.rs +++ b/vortex-datetime/src/serde.rs @@ -14,6 +14,11 @@ impl ArraySerde for DateTimeArray { ctx.write(self.subsecond())?; ctx.write_validity(self.validity()) } + + fn metadata(&self) -> VortexResult>> { + // FIXME(ngates): I think we need child dtypes? 
+ Ok(None) + } } impl EncodingSerde for DateTimeEncoding { diff --git a/vortex-dict/src/dict.rs b/vortex-dict/src/dict.rs index dda75edb5f..12e5155186 100644 --- a/vortex-dict/src/dict.rs +++ b/vortex-dict/src/dict.rs @@ -1,12 +1,14 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{check_slice_bounds, Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{check_slice_bounds, Array, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, Signedness}; @@ -80,6 +82,11 @@ impl Array for DictArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.values())?; + walker.visit_child(self.codes()) + } } impl ArrayDisplay for DictArray { diff --git a/vortex-dict/src/lib.rs b/vortex-dict/src/lib.rs index 7342c1d293..843f5fdffb 100644 --- a/vortex-dict/src/lib.rs +++ b/vortex-dict/src/lib.rs @@ -1,5 +1,5 @@ use linkme::distributed_slice; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; pub use compress::*; pub use dict::*; diff --git a/vortex-dict/src/serde.rs b/vortex-dict/src/serde.rs index cc92cae129..bcb7280a4c 100644 --- a/vortex-dict/src/serde.rs +++ b/vortex-dict/src/serde.rs @@ -11,6 +11,10 @@ impl ArraySerde for DictArray { ctx.dtype(self.codes().dtype())?; ctx.write(self.codes()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for DictEncoding { diff --git a/vortex-error/Cargo.toml b/vortex-error/Cargo.toml index ab140f94cb..0b617fc7d6 100644 --- 
a/vortex-error/Cargo.toml +++ b/vortex-error/Cargo.toml @@ -17,6 +17,7 @@ path = "src/lib.rs" [dependencies] arrow-schema = { workspace = true } +flatbuffers = { workspace = true } parquet = { workspace = true, optional = true } thiserror = { workspace = true } diff --git a/vortex-error/src/lib.rs b/vortex-error/src/lib.rs index 26184bfc59..d75ae5546c 100644 --- a/vortex-error/src/lib.rs +++ b/vortex-error/src/lib.rs @@ -2,7 +2,7 @@ use std::backtrace::Backtrace; use std::borrow::Cow; -use std::fmt::{Display, Formatter}; +use std::fmt::{Debug, Display, Formatter}; use std::ops::Deref; use std::{env, fmt, io}; @@ -42,7 +42,7 @@ impl Display for ErrString { } } -#[derive(Debug, thiserror::Error)] +#[derive(thiserror::Error)] pub enum VortexError { #[error("index {0} out of bounds from {1} to {2}\nBacktrace:\n{3}")] OutOfBounds(usize, usize, usize, Backtrace), @@ -50,6 +50,8 @@ pub enum VortexError { ComputeError(ErrString, Backtrace), #[error("{0}\nBacktrace:\n{1}")] InvalidArgument(ErrString, Backtrace), + #[error("{0}\nBacktrace:\n{1}")] + InvalidSerde(ErrString, Backtrace), #[error("function {0} not implemented for {1}\nBacktrace:\n{2}")] NotImplemented(ErrString, ErrString, Backtrace), #[error("expected type: {0} but instead got {1}\nBacktrace:\n{2}")] @@ -61,6 +63,12 @@ pub enum VortexError { arrow_schema::ArrowError, ), #[error(transparent)] + FlatBuffersError( + #[from] + #[backtrace] + flatbuffers::InvalidFlatbuffer, + ), + #[error(transparent)] IOError( #[from] #[backtrace] @@ -77,6 +85,12 @@ pub enum VortexError { pub type VortexResult = Result; +impl Debug for VortexError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(self, f) + } +} + #[macro_export] macro_rules! vortex_err { (OutOfBounds: $idx:expr, $start:expr, $stop:expr) => {{ @@ -109,7 +123,7 @@ macro_rules! vortex_err { $crate::VortexError::$variant(format!($fmt, $($arg),*).into(), Backtrace::capture()) ) }}; - ($variant:ident: $err:expr $(,)?) 
=> {{} + ($variant:ident: $err:expr $(,)?) => { $crate::__private::must_use( $crate::VortexError::$variant($err) ) diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index 26a1be9f7b..20800f4918 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -279,7 +279,7 @@ fn count_exceptions(bit_width: usize, bit_width_freq: &[usize]) -> usize { mod test { use std::sync::Arc; - use vortex::array::{Encoding, EncodingRef}; + use vortex::encoding::{Encoding, EncodingRef}; use super::*; diff --git a/vortex-fastlanes/src/bitpacking/compute.rs b/vortex-fastlanes/src/bitpacking/compute.rs index ce1be79044..9e6f098ca9 100644 --- a/vortex-fastlanes/src/bitpacking/compute.rs +++ b/vortex-fastlanes/src/bitpacking/compute.rs @@ -65,9 +65,10 @@ mod test { use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; - use vortex::array::{Array, EncodingRef}; + use vortex::array::Array; use vortex::compress::{CompressConfig, CompressCtx}; use vortex::compute::take::take; + use vortex::encoding::EncodingRef; use crate::BitPackedEncoding; diff --git a/vortex-fastlanes/src/bitpacking/mod.rs b/vortex-fastlanes/src/bitpacking/mod.rs index f173495c5d..a6d2223fb0 100644 --- a/vortex-fastlanes/src/bitpacking/mod.rs +++ b/vortex-fastlanes/src/bitpacking/mod.rs @@ -1,14 +1,16 @@ use std::cmp::min; use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef}; use vortex::compress::EncodingCompression; use vortex::compute::flatten::flatten_primitive; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; use 
vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, IntWidth, Nullability, Signedness}; @@ -142,6 +144,10 @@ impl Array for BitPackedArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.encoded()) + } } impl ArrayDisplay for BitPackedArray { diff --git a/vortex-fastlanes/src/bitpacking/serde.rs b/vortex-fastlanes/src/bitpacking/serde.rs index b1fe72812f..c3a7a914ec 100644 --- a/vortex-fastlanes/src/bitpacking/serde.rs +++ b/vortex-fastlanes/src/bitpacking/serde.rs @@ -13,6 +13,14 @@ impl ArraySerde for BitPackedArray { ctx.write_usize(self.bit_width())?; ctx.write_usize(self.len()) } + + fn metadata(&self) -> VortexResult>> { + let mut vec = Vec::new(); + let mut ctx = WriteCtx::new(&mut vec); + ctx.write_usize(self.bit_width())?; + ctx.write_usize(self.len())?; + Ok(Some(vec)) + } } impl EncodingSerde for BitPackedEncoding { diff --git a/vortex-fastlanes/src/delta/compress.rs b/vortex-fastlanes/src/delta/compress.rs index a88070f551..c6c470124a 100644 --- a/vortex-fastlanes/src/delta/compress.rs +++ b/vortex-fastlanes/src/delta/compress.rs @@ -192,7 +192,7 @@ where mod test { use std::sync::Arc; - use vortex::array::{Encoding, EncodingRef}; + use vortex::encoding::{Encoding, EncodingRef}; use super::*; diff --git a/vortex-fastlanes/src/delta/mod.rs b/vortex-fastlanes/src/delta/mod.rs index cb771dff24..b64b6fc8f4 100644 --- a/vortex-fastlanes/src/delta/mod.rs +++ b/vortex-fastlanes/src/delta/mod.rs @@ -1,12 +1,14 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; use 
vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; -use vortex::{impl_array, match_each_integer_ptype}; +use vortex::{impl_array, match_each_integer_ptype, ArrayWalker}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; @@ -131,6 +133,11 @@ impl Array for DeltaArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.bases())?; + walker.visit_child(self.deltas()) + } } impl<'arr> AsRef<(dyn Array + 'arr)> for DeltaArray { diff --git a/vortex-fastlanes/src/delta/serde.rs b/vortex-fastlanes/src/delta/serde.rs index 2e48930c64..464677faca 100644 --- a/vortex-fastlanes/src/delta/serde.rs +++ b/vortex-fastlanes/src/delta/serde.rs @@ -12,6 +12,10 @@ impl ArraySerde for DeltaArray { ctx.write(self.deltas())?; ctx.write_validity(self.validity()) } + + fn metadata(&self) -> VortexResult>> { + todo!() + } } impl EncodingSerde for DeltaEncoding { diff --git a/vortex-fastlanes/src/for/compress.rs b/vortex-fastlanes/src/for/compress.rs index 74659bc98e..d3171dcb2f 100644 --- a/vortex-fastlanes/src/for/compress.rs +++ b/vortex-fastlanes/src/for/compress.rs @@ -150,8 +150,8 @@ fn trailing_zeros(array: &dyn Array) -> u8 { mod test { use std::sync::Arc; - use vortex::array::{Encoding, EncodingRef}; use vortex::compute::scalar_at::ScalarAtFn; + use vortex::encoding::{Encoding, EncodingRef}; use crate::BitPackedEncoding; diff --git a/vortex-fastlanes/src/for/mod.rs b/vortex-fastlanes/src/for/mod.rs index bc12c8ef9b..09cfba7509 100644 --- a/vortex-fastlanes/src/for/mod.rs +++ b/vortex-fastlanes/src/for/mod.rs @@ -1,13 +1,16 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayRef}; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; + use 
vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use vortex::scalar::Scalar; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::DType; @@ -99,6 +102,10 @@ impl Array for FoRArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.encoded()) + } } impl ArrayDisplay for FoRArray { diff --git a/vortex-fastlanes/src/for/serde.rs b/vortex-fastlanes/src/for/serde.rs index a68fa68fc1..23aacd10dd 100644 --- a/vortex-fastlanes/src/for/serde.rs +++ b/vortex-fastlanes/src/for/serde.rs @@ -10,6 +10,14 @@ impl ArraySerde for FoRArray { ctx.write_usize(self.shift() as usize)?; ctx.write(self.encoded()) } + + fn metadata(&self) -> VortexResult>> { + let mut vec = Vec::new(); + let mut ctx = WriteCtx::new(&mut vec); + ctx.scalar(self.reference())?; + ctx.write_usize(self.shift() as usize)?; + Ok(Some(vec)) + } } impl EncodingSerde for FoREncoding { diff --git a/vortex-fastlanes/src/lib.rs b/vortex-fastlanes/src/lib.rs index 717d3b5e21..8d56633ade 100644 --- a/vortex-fastlanes/src/lib.rs +++ b/vortex-fastlanes/src/lib.rs @@ -6,7 +6,7 @@ use linkme::distributed_slice; pub use bitpacking::*; pub use delta::*; pub use r#for::*; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; mod bitpacking; mod delta; diff --git a/vortex-flatbuffers/Cargo.toml b/vortex-flatbuffers/Cargo.toml new file mode 100644 index 0000000000..f77c4f287a --- /dev/null +++ b/vortex-flatbuffers/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "vortex-flatbuffers" +version.workspace = true +homepage.workspace = true +repository.workspace = true 
+authors.workspace = true +license.workspace = true +keywords.workspace = true +include.workspace = true +edition.workspace = true +rust-version.workspace = true + +[dependencies] +flatbuffers = { workspace = true } +vortex-error = { path = "../vortex-error" } + +[lints] +workspace = true diff --git a/vortex-flatbuffers/src/lib.rs b/vortex-flatbuffers/src/lib.rs new file mode 100644 index 0000000000..ed74a7ecaf --- /dev/null +++ b/vortex-flatbuffers/src/lib.rs @@ -0,0 +1,95 @@ +use flatbuffers::{root, FlatBufferBuilder, Follow, Verifiable, WIPOffset}; +use std::io; +use std::io::{Read, Write}; +use vortex_error::{vortex_err, VortexResult}; + +pub trait ReadFlatBuffer: Sized { + type Source<'a>; + type Error; + + fn read_flatbuffer(ctx: &Ctx, fb: &Self::Source<'_>) -> Result; +} + +pub trait WriteFlatBuffer { + type Target<'a>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset>; +} + +pub trait FlatBufferToBytes { + fn flatbuffer_to_bytes(&self) -> (Vec, usize); +} + +pub trait FlatBufferRoot {} + +impl FlatBufferToBytes for F { + fn flatbuffer_to_bytes(&self) -> (Vec, usize) { + let mut fbb = FlatBufferBuilder::new(); + let root_offset = self.write_flatbuffer(&mut fbb); + fbb.finish_minimal(root_offset); + fbb.collapse() + } +} + +pub trait FlatBufferReader { + /// Returns Ok(None) if the reader has reached EOF. + fn read_message<'a, F>(&mut self, buffer: &'a mut Vec) -> VortexResult> + where + F: 'a + Follow<'a, Inner = F> + Verifiable; +} + +impl FlatBufferReader for R { + fn read_message<'a, F>(&mut self, buffer: &'a mut Vec) -> VortexResult> + where + F: 'a + Follow<'a, Inner = F> + Verifiable, + { + let mut msg_size: [u8; 4] = [0; 4]; + if let Err(e) = self.read_exact(&mut msg_size) { + return match e.kind() { + io::ErrorKind::UnexpectedEof => Ok(None), + _ => Err(vortex_err!(IOError: e)), + }; + } + let msg_size = u32::from_le_bytes(msg_size) as u64; + if msg_size == 0 { + // FIXME(ngates): I think this is wrong. 
+ return Ok(None); + } + self.take(msg_size).read_to_end(buffer)?; + Ok(Some(root::(buffer)?)) + } +} + +pub trait FlatBufferWriter { + // Write the given FlatBuffer message, appending padding until the total bytes written + // are a multiple of `alignment`. + fn write_message( + &mut self, + msg: &F, + alignment: usize, + ) -> io::Result<()>; +} + +impl FlatBufferWriter for W { + fn write_message( + &mut self, + msg: &F, + alignment: usize, + ) -> io::Result<()> { + let mut fbb = FlatBufferBuilder::new(); + let root = msg.write_flatbuffer(&mut fbb); + fbb.finish_minimal(root); + let fb_data = fbb.finished_data(); + let fb_size = fb_data.len(); + + let aligned_size = (fb_size + (alignment - 1)) & !(alignment - 1); + let padding_bytes = aligned_size - fb_size; + + self.write_all(&(aligned_size as u32).to_le_bytes())?; + self.write_all(fb_data)?; + self.write_all(&vec![0; padding_bytes]) + } +} diff --git a/vortex-ipc/Cargo.toml b/vortex-ipc/Cargo.toml new file mode 100644 index 0000000000..f480a3377d --- /dev/null +++ b/vortex-ipc/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "vortex-ipc" +version = { workspace = true } +description = "Vortex IPC" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[dependencies] +arrow-buffer = { workspace = true } +flatbuffers = { workspace = true } +itertools = { workspace = true } +lending-iterator = "0.1.7" +nougat = "0.2.4" +streaming-iterator = "0.1.9" +vortex-array = { path = "../vortex-array" } +vortex-error = { path = "../vortex-error" } +vortex-flatbuffers = { path = "../vortex-flatbuffers" } +vortex-schema = { path = "../vortex-schema" } + +[build-dependencies] +flatc = { workspace = true } +walkdir = { workspace = true } + +[lints] +# workspace = false diff --git a/vortex-ipc/README.md 
b/vortex-ipc/README.md new file mode 100644 index 0000000000..fea2beb5ab --- /dev/null +++ b/vortex-ipc/README.md @@ -0,0 +1,8 @@ +# Vortex IPC Format + +Messages: + +* Context - provides configuration context, e.g. which encodings are referenced in the stream. +* Array - indicates the start of an array. Contains the schema. +* Chunk - indicates the start of an array chunk. Contains the offsets for each column message. +* ChunkColumn - contains the encoding metadata for a single column of a chunk, including offsets for each buffer. diff --git a/vortex-ipc/build.rs b/vortex-ipc/build.rs new file mode 120000 index 0000000000..7cb528993c --- /dev/null +++ b/vortex-ipc/build.rs @@ -0,0 +1 @@ +../flatbuffers.build.rs \ No newline at end of file diff --git a/vortex-ipc/flatbuffers/message.fbs b/vortex-ipc/flatbuffers/message.fbs new file mode 100644 index 0000000000..dbc66584ed --- /dev/null +++ b/vortex-ipc/flatbuffers/message.fbs @@ -0,0 +1,55 @@ +include "vortex-array/flatbuffers/array.fbs"; +include "vortex-schema/flatbuffers/dtype.fbs"; + +namespace vortex.ipc; + +enum Version: uint8 { + V0 = 0, +} + +table Context { + encodings: [Encoding]; +} + +table Encoding { + id: string; +} + +table Schema { + dtype: vortex.dtype.DType; +} + +table Chunk { + // Indicates the forward offsets for each ChunkColumn message. 
+ column_offsets: [uint64]; +} + +enum Compression: uint8 { + None = 0, +} + +struct Buffer { + offset: uint64; + length: uint64; + compression: Compression; +} + +table ChunkColumn { + array: vortex.array.Array; + buffers: [Buffer]; + buffer_size: uint64; +} + +union MessageHeader { + Context, + Schema, + Chunk, + ChunkColumn, +} + +table Message { + version: Version = V0; + header: MessageHeader; +} + +root_type Message; diff --git a/vortex-ipc/src/chunked.rs b/vortex-ipc/src/chunked.rs new file mode 100644 index 0000000000..a088b47d20 --- /dev/null +++ b/vortex-ipc/src/chunked.rs @@ -0,0 +1,15 @@ +use lending_iterator::prelude::*; +use vortex::array::{Array, ArrayRef}; +use vortex_error::VortexResult; +use vortex_schema::DType; + +/// Stream chunks of a Vortex array. +#[allow(dead_code)] +pub trait ArrayChunkReader: Iterator> { + fn dtype(&self) -> &DType; +} + +#[allow(dead_code)] +pub trait ArrayViewChunkReader: LendingIteratorDyn)> { + fn dtype(&self) -> &DType; +} diff --git a/vortex-ipc/src/iter.rs b/vortex-ipc/src/iter.rs new file mode 100644 index 0000000000..1acd800e68 --- /dev/null +++ b/vortex-ipc/src/iter.rs @@ -0,0 +1,11 @@ +use nougat::gat; + +#[gat] +pub trait FallibleLendingIterator { + type Error; + type Item<'next> + where + Self: 'next; + + fn next(&mut self) -> Result>, Self::Error>; +} diff --git a/vortex-ipc/src/lib.rs b/vortex-ipc/src/lib.rs new file mode 100644 index 0000000000..a1c3902ce4 --- /dev/null +++ b/vortex-ipc/src/lib.rs @@ -0,0 +1,77 @@ +extern crate core; + +use vortex_error::{vortex_err, VortexError}; + +pub const ALIGNMENT: usize = 64; + +pub mod flatbuffers { + pub use generated::vortex::*; + + #[allow(unused_imports)] + #[allow(dead_code)] + #[allow(non_camel_case_types)] + #[allow(clippy::all)] + mod generated { + include!(concat!(env!("OUT_DIR"), "/flatbuffers/message.rs")); + } + + mod deps { + pub mod array { + pub use vortex::flatbuffers::array; + } + pub mod dtype { + pub use vortex_schema::flatbuffers as dtype; + } + 
} +} + +mod chunked; +pub mod iter; +mod messages; +pub mod reader; +pub mod writer; + +pub(crate) const fn missing(field: &'static str) -> impl FnOnce() -> VortexError { + move || vortex_err!(InvalidSerde: "missing field: {}", field) +} + +#[cfg(test)] +mod tests { + use std::io::{Cursor, Write}; + + use vortex::array::downcast::DowncastArrayBuiltin; + use vortex::array::primitive::PrimitiveArray; + use vortex::compute::take::take; + use vortex::serde::context::SerdeContext; + + use crate::iter::FallibleLendingIterator; + use crate::reader::StreamReader; + use crate::writer::StreamWriter; + + #[test] + fn test_write_flatbuffer() { + let array = PrimitiveArray::from_iter(vec![Some(1i32), None, None, Some(4), Some(5)]); + + let mut cursor = Cursor::new(Vec::new()); + let ctx = SerdeContext::default(); + let mut writer = StreamWriter::try_new_unbuffered(&mut cursor, ctx).unwrap(); + writer.write(&array).unwrap(); + cursor.flush().unwrap(); + cursor.set_position(0); + + let mut ipc_reader = StreamReader::try_new_unbuffered(cursor).unwrap(); + + // Read some number of arrays off the stream. + while let Some(array_reader) = ipc_reader.next().unwrap() { + let mut array_reader = array_reader; + println!("DType: {:?}", array_reader.dtype()); + // Read some number of chunks from the stream. 
+ while let Some(chunk) = array_reader.next().unwrap() { + println!("VIEW: {:?}", &chunk); + let taken = take(&chunk, &PrimitiveArray::from(vec![0, 3, 0, 1])).unwrap(); + let taken = taken.as_primitive().typed_data::(); + println!("Taken: {:?}", &taken); + } + } + } +} diff --git a/vortex-ipc/src/messages.rs b/vortex-ipc/src/messages.rs new file mode 100644 index 0000000000..1911922937 --- /dev/null +++ b/vortex-ipc/src/messages.rs @@ -0,0 +1,210 @@ +use crate::flatbuffers::ipc as fb; +use crate::flatbuffers::ipc::Compression; +use crate::{missing, ALIGNMENT}; +use flatbuffers::{FlatBufferBuilder, WIPOffset}; +use itertools::Itertools; +use vortex::encoding::find_encoding; +use vortex::flatbuffers::array as fba; +use vortex::serde::context::SerdeContext; +use vortex::serde::data::ColumnData; +use vortex_error::{vortex_err, VortexError}; +use vortex_flatbuffers::{FlatBufferRoot, WriteFlatBuffer}; +use vortex_schema::DType; + +pub(crate) enum IPCMessage<'a> { + Context(IPCContext<'a>), + Schema(IPCSchema<'a>), + Chunk(IPCChunk<'a>), + ChunkColumn(IPCChunkColumn<'a>), +} + +pub(crate) struct IPCContext<'a>(pub &'a SerdeContext); +pub(crate) struct IPCSchema<'a>(pub &'a DType); +pub(crate) struct IPCChunk<'a>(pub &'a [u64]); +pub(crate) struct IPCChunkColumn<'a>(pub &'a SerdeContext, pub &'a ColumnData); +pub(crate) struct IPCArray<'a>(pub &'a SerdeContext, pub &'a ColumnData); + +impl FlatBufferRoot for IPCMessage<'_> {} +impl WriteFlatBuffer for IPCMessage<'_> { + type Target<'a> = fb::Message<'a>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let header = match self { + Self::Context(f) => f.write_flatbuffer(fbb).as_union_value(), + Self::Schema(f) => f.write_flatbuffer(fbb).as_union_value(), + Self::Chunk(f) => f.write_flatbuffer(fbb).as_union_value(), + Self::ChunkColumn(f) => f.write_flatbuffer(fbb).as_union_value(), + }; + + let mut msg = fb::MessageBuilder::new(fbb); + msg.add_version(Default::default()); + 
msg.add_header_type(match self { + Self::Context(_) => fb::MessageHeader::Context, + Self::Schema(_) => fb::MessageHeader::Schema, + Self::Chunk(_) => fb::MessageHeader::Chunk, + Self::ChunkColumn(_) => fb::MessageHeader::ChunkColumn, + }); + msg.add_header(header); + msg.finish() + } +} + +impl<'a> WriteFlatBuffer for IPCContext<'a> { + type Target<'t> = fb::Context<'t>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let fb_encodings = self + .0 + .encodings() + .iter() + .map(|e| e.id().name()) + .map(|name| { + let encoding_id = fbb.create_string(name); + fb::Encoding::create( + fbb, + &fb::EncodingArgs { + id: Some(encoding_id), + }, + ) + }) + .collect_vec(); + let fb_encodings = fbb.create_vector(fb_encodings.as_slice()); + + fb::Context::create( + fbb, + &fb::ContextArgs { + encodings: Some(fb_encodings), + }, + ) + } +} + +impl<'a> TryFrom> for SerdeContext { + type Error = VortexError; + + fn try_from(value: fb::Context<'a>) -> Result { + let fb_encodings = value.encodings().ok_or_else(missing("encodings"))?; + let mut encodings = Vec::with_capacity(fb_encodings.len()); + for fb_encoding in fb_encodings { + let encoding_id = fb_encoding.id().ok_or_else(missing("encoding.id"))?; + encodings.push( + find_encoding(encoding_id) + .ok_or_else(|| vortex_err!("Stream uses unknown encoding {}", encoding_id))?, + ); + } + Ok(Self::new(encodings.into())) + } +} + +impl<'a> WriteFlatBuffer for IPCSchema<'a> { + type Target<'t> = fb::Schema<'t>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let dtype = Some(self.0.write_flatbuffer(fbb)); + fb::Schema::create(fbb, &fb::SchemaArgs { dtype }) + } +} + +impl<'a> WriteFlatBuffer for IPCChunk<'a> { + type Target<'t> = fb::Chunk<'t>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let offsets = fbb.create_vector_from_iter(self.0.iter().copied()); + fb::Chunk::create( + fbb, + 
&fb::ChunkArgs { + column_offsets: Some(offsets), + }, + ) + } +} + +impl<'a> WriteFlatBuffer for IPCChunkColumn<'a> { + type Target<'t> = fb::ChunkColumn<'t>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let col_data = self.1; + let array = Some(IPCArray(self.0, col_data).write_flatbuffer(fbb)); + + // Walk the ColumnData depth-first to compute the buffer offsets. + let mut buffers = Vec::with_capacity(col_data.buffers().len()); + let mut offset = 0; + for col_data in col_data.depth_first_traversal() { + for buffer in col_data.buffers() { + buffers.push(fb::Buffer::new( + offset as u64, + buffer.len() as u64, + Compression::None, + )); + let aligned_size = (buffer.len() + (ALIGNMENT - 1)) & !(ALIGNMENT - 1); + offset += aligned_size; + } + } + let buffers = Some(fbb.create_vector(&buffers)); + + fb::ChunkColumn::create( + fbb, + &fb::ChunkColumnArgs { + array, + buffers, + buffer_size: offset as u64, + }, + ) + } +} + +impl<'a> WriteFlatBuffer for IPCArray<'a> { + type Target<'t> = fba::Array<'t>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let column_data = self.1; + + let encoding = self + .0 + .encoding_idx(column_data.encoding()) + // TODO(ngates): return result from this writer? 
+ .unwrap_or_else(|| panic!("Encoding not found: {:?}", column_data.encoding())); + + let metadata = column_data + .metadata() + .map(|m| fbb.create_vector(m.as_slice())); + + let children = column_data + .children() + .iter() + .map(|child| IPCArray(self.0, child).write_flatbuffer(fbb)) + .collect_vec(); + let children = Some(fbb.create_vector(&children)); + + let nbuffers = column_data.buffers().len() as u16; // TODO(ngates): checked cast + + fba::Array::create( + fbb, + &fba::ArrayArgs { + version: Default::default(), + encoding, + metadata, + children, + nbuffers, + }, + ) + } +} diff --git a/vortex-ipc/src/reader.rs b/vortex-ipc/src/reader.rs new file mode 100644 index 0000000000..b8fb315e82 --- /dev/null +++ b/vortex-ipc/src/reader.rs @@ -0,0 +1,191 @@ +use crate::flatbuffers::ipc::Message; +use crate::iter::{FallibleLendingIterator, FallibleLendingIteratorāļžItem}; +use arrow_buffer::Buffer; +use flatbuffers::root; +use nougat::gat; +use std::io; +use std::io::{BufReader, Read}; +use vortex::array::composite::COMPOSITE_EXTENSIONS; +use vortex::serde::context::SerdeContext; +use vortex::serde::ArrayView; +use vortex_error::{vortex_err, VortexError, VortexResult}; +use vortex_flatbuffers::{FlatBufferReader, ReadFlatBuffer}; +use vortex_schema::{DType, DTypeSerdeContext}; + +#[allow(dead_code)] +pub struct StreamReader { + read: R, + + pub(crate) ctx: SerdeContext, + // Optionally take a projection? + + // Use replace to swap the scratch buffer. + // std::mem::replace + // We could use a cell to avoid the need for mutable borrow. + scratch: Vec, +} + +impl StreamReader> { + pub fn try_new(read: R) -> VortexResult { + Self::try_new_unbuffered(BufReader::new(read)) + } +} + +impl StreamReader { + pub fn try_new_unbuffered(mut read: R) -> VortexResult { + let mut msg_vec = Vec::new(); + let fb_msg = read + .read_message::(&mut msg_vec)? 
+ .ok_or_else(|| vortex_err!(InvalidSerde: "Unexpected EOF reading IPC format"))?; + let fb_ctx = fb_msg.header_as_context().ok_or_else( + || vortex_err!(InvalidSerde: "Expected IPC Context as first message in stream"), + )?; + let ctx: SerdeContext = fb_ctx.try_into()?; + + Ok(Self { + read, + ctx, + scratch: Vec::with_capacity(1024), + }) + } +} + +/// We implement a lending iterator here so that each StreamArrayChunkReader can be lent as +/// mutable to the caller. This is necessary because we need a mutable handle to the reader. +#[gat] +impl FallibleLendingIterator for StreamReader { + type Error = VortexError; + type Item<'next> = StreamArrayChunkReader<'next, R> where Self: 'next; + + fn next(&mut self) -> Result>, Self::Error> { + let mut fb_vec = Vec::new(); + let msg = self.read.read_message::(&mut fb_vec)?; + if msg.is_none() { + // End of the stream + return Ok(None); + } + let msg = msg.unwrap(); + + // FIXME(ngates): parse the schema? + let schema = msg + .header_as_schema() + .ok_or_else(|| vortex_err!(InvalidSerde: "Expected IPC Schema message"))?; + + // TODO(ngates): construct this from the SerdeContext. 
+ let dtype_ctx = + DTypeSerdeContext::new(COMPOSITE_EXTENSIONS.iter().map(|e| e.id()).collect()); + let dtype = DType::read_flatbuffer( + &dtype_ctx, + &schema + .dtype() + .ok_or_else(|| vortex_err!(InvalidSerde: "Schema missing DType"))?, + ) + .map_err(|e| vortex_err!(InvalidSerde: "Failed to parse DType: {}", e))?; + + Ok(Some(StreamArrayChunkReader { + read: &mut self.read, + ctx: &self.ctx, + dtype, + fb_buffer: Vec::new(), + buffers: Vec::new(), + })) + } +} + +#[allow(dead_code)] +pub struct StreamArrayChunkReader<'a, R: Read> { + read: &'a mut R, + ctx: &'a SerdeContext, + dtype: DType, + fb_buffer: Vec, + buffers: Vec, +} + +impl<'a, R: Read> StreamArrayChunkReader<'a, R> { + pub fn dtype(&self) -> &DType { + &self.dtype + } +} + +#[gat] +impl<'a, R: Read> FallibleLendingIterator for StreamArrayChunkReader<'a, R> { + type Error = VortexError; + type Item<'next> = ArrayView<'next> where Self: 'next; + + fn next(&mut self) -> Result>, Self::Error> { + let mut fb_vec: Vec = Vec::new(); + let msg = self.read.read_message::(&mut fb_vec)?; + if msg.is_none() { + // End of the stream + return Ok(None); + } + let msg = msg.unwrap(); + + let chunk = msg + .header_as_chunk() + .ok_or_else(|| vortex_err!(InvalidSerde: "Expected IPC Chunk message")) + .unwrap(); + + let col_offsets = chunk + .column_offsets() + .ok_or_else( + || vortex_err!(InvalidSerde: "Expected column offsets in IPC Chunk message"), + ) + .unwrap(); + assert_eq!(col_offsets.len(), 1); + + // TODO(ngates): read each column + read_into(self.read, &mut self.fb_buffer).unwrap(); + let col_msg = root::(&self.fb_buffer) + .unwrap() + .header_as_chunk_column() + .ok_or_else(|| vortex_err!(InvalidSerde: "Expected IPC Chunk Column message")) + .unwrap(); + + let col_array = col_msg + .array() + .ok_or_else(|| vortex_err!(InvalidSerde: "Chunk column missing Array")) + .unwrap(); + + // Read all the column's buffers + self.buffers.clear(); + let mut offset = 0; + for buffer in 
col_msg.buffers().unwrap_or_default().iter() { + let to_kill = buffer.offset() - offset; + io::copy(&mut self.read.take(to_kill), &mut io::sink()).unwrap(); + + let mut bytes = vec![0u8; buffer.length() as usize]; + self.read.read_exact(&mut bytes).unwrap(); + self.buffers.push(Buffer::from_vec(bytes)); + + offset = buffer.offset() + buffer.length(); + } + + // Consume any remaining padding after the final buffer. + let to_kill = col_msg.buffer_size() - offset; + io::copy(&mut self.read.take(to_kill), &mut io::sink()).unwrap(); + + let view = ArrayView::try_new(self.ctx, &self.dtype, col_array, &self.buffers)?; + + // Validate the array once here so we can ignore metadata parsing errors from now on. + // TODO(ngates): should we convert to heap-allocated array if this is missing? + view.vtable().validate(&view)?; + + Ok(Some(view)) + } +} + +/// FIXME(ngates): this exists to detach the lifetimes of the object as read by read_flatbuffer. +/// We should be able to fix that. +pub fn read_into(read: &mut R, buffer: &mut Vec) -> VortexResult<()> { + buffer.clear(); + + let mut buffer_len: [u8; 4] = [0; 4]; + // FIXME(ngates): return optional for EOF? 
+ read.read_exact(&mut buffer_len)?; + + let buffer_len = u32::from_le_bytes(buffer_len) as usize; + read.take(buffer_len as u64).read_to_end(buffer)?; + + Ok(()) +} diff --git a/vortex-ipc/src/writer.rs b/vortex-ipc/src/writer.rs new file mode 100644 index 0000000000..e19d04350d --- /dev/null +++ b/vortex-ipc/src/writer.rs @@ -0,0 +1,106 @@ +use flatbuffers::root_unchecked; +use itertools::Itertools; +use std::io::{BufWriter, Write}; + +use vortex::array::Array; +use vortex::serde::context::SerdeContext; +use vortex::serde::data::{ArrayData, ColumnData}; + +use crate::ALIGNMENT; +use vortex_error::VortexResult; +use vortex_flatbuffers::FlatBufferWriter; + +use crate::flatbuffers::ipc as fb; +use crate::messages::{IPCChunk, IPCChunkColumn, IPCContext, IPCMessage, IPCSchema}; + +#[allow(dead_code)] +pub struct StreamWriter { + write: W, + ctx: SerdeContext, +} + +impl StreamWriter> { + pub fn try_new(write: W, ctx: SerdeContext) -> VortexResult { + Self::try_new_unbuffered(BufWriter::new(write), ctx) + } +} + +impl StreamWriter { + pub fn try_new_unbuffered(mut write: W, ctx: SerdeContext) -> VortexResult { + // Write the IPC context to the stream + write.write_message(&IPCMessage::Context(IPCContext(&ctx)), ALIGNMENT)?; + Ok(Self { write, ctx }) + } + + pub fn write(&mut self, array: &dyn Array) -> VortexResult<()> { + // First, write a schema message indicating the start of an array. + self.write + .write_message(&IPCMessage::Schema(IPCSchema(array.dtype())), ALIGNMENT)?; + + // Then we write the array in chunks. + // TODO(ngates): should we do any chunking ourselves? + // TODO(ngates): If it's a chunked array, use those chunks. Else write the whole thing. + + // For now, we write a single chunk. + self.write_chunk(array) + } + + fn write_chunk(&mut self, array: &dyn Array) -> VortexResult<()> { + // A chunk contains the forward byte offsets to each of the columns in the chunk. 
+ let col_data = ColumnData::try_from_array(array)?; + + // TODO(ngates): somehow get the flattened columns as ArrayData. + let data = ArrayData::new(vec![col_data]); + + // In order to generate chunk metadata, we need to know the forward offsets for each + // column. To compute this, we need to know how big the metadata messages are for each + // column, as well as how long their buffers are. + let mut offset = 0; + let mut chunk_column_msgs = Vec::with_capacity(data.columns().len()); + let mut chunk_column_offsets = Vec::with_capacity(data.columns().len()); + for column_data in data.columns() { + chunk_column_offsets.push(offset); + + // Serialize the ChunkColumn message and add its offset. + let mut vec = Vec::new(); + vec.write_message( + &IPCMessage::ChunkColumn(IPCChunkColumn(&self.ctx, column_data)), + ALIGNMENT, + )?; + + // Parse our message to extract the total size used by all buffers of the column. + let chunk_col = unsafe { root_unchecked::(&vec[4..]) } + .header_as_chunk_column() + .unwrap(); + offset += chunk_col.buffer_size(); + + chunk_column_msgs.push(vec); + } + + // Now we can construct a Chunk message with the offsets to each column. + self.write.write_message( + &IPCMessage::Chunk(IPCChunk(&chunk_column_offsets)), + ALIGNMENT, + )?; + + // Then write each chunk column chunk message, followed by its buffers. 
+ for (msg, column_data) in chunk_column_msgs.iter().zip(data.columns()) { + self.write.write_all(msg)?; + + let buffer_offsets = column_data.all_buffer_offsets(ALIGNMENT); + let mut current_offset = 0; + for (buffer, &buffer_end) in column_data + .depth_first_traversal() + .flat_map(|data| data.buffers().iter()) + .zip_eq(buffer_offsets.iter().skip(1)) + { + self.write.write_all(buffer.as_slice())?; + current_offset += buffer.len(); + let padding = (buffer_end as usize) - current_offset; + self.write.write_all(&vec![0; padding])?; + } + } + + Ok(()) + } +} diff --git a/vortex-ree/src/compress.rs b/vortex-ree/src/compress.rs index d1d5270257..b89865d89b 100644 --- a/vortex-ree/src/compress.rs +++ b/vortex-ree/src/compress.rs @@ -5,8 +5,9 @@ use num_traits::AsPrimitive; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::{Array, ArrayRef, Encoding}; +use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::encoding::Encoding; use vortex::match_each_integer_ptype; use vortex::ptype::{match_each_native_ptype, NativePType}; use vortex::stats::Stat; diff --git a/vortex-ree/src/lib.rs b/vortex-ree/src/lib.rs index 1999271823..0061ef7a9d 100644 --- a/vortex-ree/src/lib.rs +++ b/vortex-ree/src/lib.rs @@ -1,5 +1,5 @@ use linkme::distributed_slice; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; pub use ree::*; diff --git a/vortex-ree/src/ree.rs b/vortex-ree/src/ree.rs index 5f79be4328..a72762974e 100644 --- a/vortex-ree/src/ree.rs +++ b/vortex-ree/src/ree.rs @@ -1,20 +1,19 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{ - check_slice_bounds, Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef, -}; +use crate::compress::ree_encode; +use vortex::array::{check_slice_bounds, Array, ArrayKind, ArrayRef}; use vortex::compress::EncodingCompression; use 
vortex::compute::search_sorted::SearchSortedSide; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stat, Stats, StatsCompute, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; -use vortex::{compute, impl_array}; +use vortex::{compute, impl_array, ArrayWalker}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_schema::DType; -use crate::compress::ree_encode; - #[derive(Debug, Clone)] pub struct REEArray { ends: ArrayRef, @@ -156,6 +155,11 @@ impl Array for REEArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.values())?; + walker.visit_child(self.ends()) + } } impl StatsCompute for REEArray {} diff --git a/vortex-ree/src/serde.rs b/vortex-ree/src/serde.rs index 92f0e310fc..d8166b6b5d 100644 --- a/vortex-ree/src/serde.rs +++ b/vortex-ree/src/serde.rs @@ -14,6 +14,10 @@ impl ArraySerde for REEArray { ctx.write(self.ends())?; ctx.write(self.values()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for REEEncoding { diff --git a/vortex-roaring/src/boolean/mod.rs b/vortex-roaring/src/boolean/mod.rs index d1da56bef5..2460c8ebaa 100644 --- a/vortex-roaring/src/boolean/mod.rs +++ b/vortex-roaring/src/boolean/mod.rs @@ -3,15 +3,15 @@ use std::sync::{Arc, RwLock}; use croaring::{Bitmap, Native}; use compress::roaring_encode; -use vortex::array::{ - check_slice_bounds, Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef, -}; +use vortex::array::{check_slice_bounds, Array, ArrayKind, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::impl_array; use 
vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_err, VortexResult}; use vortex_schema::DType; use vortex_schema::Nullability::NonNullable; @@ -99,6 +99,14 @@ impl Array for RoaringBoolArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { + // TODO(ngates): should we store a buffer in memory? Or delay serialization? + // Or serialize into metadata? The only reason we support buffers is so we can write to + // the wire without copying into FlatBuffers. But if we need to allocate to serialize + // the bitmap anyway, then may as well shove it into metadata. + todo!() + } } impl ArrayDisplay for RoaringBoolArray { diff --git a/vortex-roaring/src/boolean/serde.rs b/vortex-roaring/src/boolean/serde.rs index 5efb9e2483..073dcc58ff 100644 --- a/vortex-roaring/src/boolean/serde.rs +++ b/vortex-roaring/src/boolean/serde.rs @@ -16,6 +16,10 @@ impl ArraySerde for RoaringBoolArray { self.bitmap().serialize_into::(&mut data); ctx.write_slice(data.as_slice()) } + + fn metadata(&self) -> VortexResult>> { + todo!() + } } impl EncodingSerde for RoaringBoolEncoding { diff --git a/vortex-roaring/src/integer/mod.rs b/vortex-roaring/src/integer/mod.rs index 438ee88e39..5eca3a5058 100644 --- a/vortex-roaring/src/integer/mod.rs +++ b/vortex-roaring/src/integer/mod.rs @@ -3,16 +3,16 @@ use std::sync::{Arc, RwLock}; use croaring::{Bitmap, Native}; use compress::roaring_encode; -use vortex::array::{ - check_slice_bounds, Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef, -}; +use vortex::array::{check_slice_bounds, Array, ArrayKind, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use 
vortex::impl_array; use vortex::ptype::PType; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_schema::DType; @@ -101,6 +101,10 @@ impl Array for RoaringIntArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { + todo!() + } } impl ArrayDisplay for RoaringIntArray { diff --git a/vortex-roaring/src/integer/serde.rs b/vortex-roaring/src/integer/serde.rs index 4100bf5995..6a70534418 100644 --- a/vortex-roaring/src/integer/serde.rs +++ b/vortex-roaring/src/integer/serde.rs @@ -16,6 +16,10 @@ impl ArraySerde for RoaringIntArray { self.bitmap().serialize_into::(&mut data); ctx.write_slice(data.as_slice()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for RoaringIntEncoding { diff --git a/vortex-roaring/src/lib.rs b/vortex-roaring/src/lib.rs index 0e844e00c8..92ad6e83cd 100644 --- a/vortex-roaring/src/lib.rs +++ b/vortex-roaring/src/lib.rs @@ -2,7 +2,7 @@ use linkme::distributed_slice; pub use boolean::*; pub use integer::*; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; mod boolean; mod downcast; diff --git a/vortex-schema/Cargo.toml b/vortex-schema/Cargo.toml index 6d8574d9fd..ee29f5f525 100644 --- a/vortex-schema/Cargo.toml +++ b/vortex-schema/Cargo.toml @@ -16,11 +16,13 @@ name = "vortex_schema" path = "src/lib.rs" [dependencies] -vortex-error = { path = "../vortex-error" } arrow-schema = { workspace = true } flatbuffers = { workspace = true } itertools = { workspace = true } +linkme = { workspace = true } thiserror = { workspace = true } +vortex-error = { path = "../vortex-error" } +vortex-flatbuffers = { path = "../vortex-flatbuffers" } [build-dependencies] flatc = { workspace = true } diff --git 
a/vortex-schema/build.rs b/vortex-schema/build.rs deleted file mode 100644 index cee1d21652..0000000000 --- a/vortex-schema/build.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::env; -use std::ffi::OsStr; -use std::path::{Path, PathBuf}; -use std::process::Command; - -use flatc::flatc; -use walkdir::WalkDir; - -fn main() { - let flatbuffers_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()) - .canonicalize() - .expect("Failed to canonicalize CARGO_MANIFEST_DIR") - .join("flatbuffers"); - let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()) - .canonicalize() - .expect("Failed to canonicalize OUT_DIR"); - - let fbs_files = WalkDir::new(flatbuffers_dir) - .into_iter() - .filter_map(|e| e.ok()) - .filter(|e| e.path().extension() == Some(OsStr::new("fbs"))) - .map(|e| { - rerun_if_changed(e.path()); - e.path().to_path_buf() - }) - .collect::>(); - - if !Command::new(flatc()) - .args(["--filename-suffix", ""]) - .arg("--rust") - .arg("-o") - .arg(out_dir.join("flatbuffers")) - .args(fbs_files) - .status() - .unwrap() - .success() - { - panic!("Failed to run flatc"); - } -} - -fn rerun_if_changed(path: &Path) { - println!( - "cargo:rerun-if-changed={}", - path.canonicalize() - .unwrap_or_else(|_| panic!("failed to canonicalize {}", path.to_str().unwrap())) - .to_str() - .unwrap() - ); -} diff --git a/vortex-schema/build.rs b/vortex-schema/build.rs new file mode 120000 index 0000000000..7cb528993c --- /dev/null +++ b/vortex-schema/build.rs @@ -0,0 +1 @@ +../flatbuffers.build.rs \ No newline at end of file diff --git a/vortex-schema/flatbuffers/schema.fbs b/vortex-schema/flatbuffers/dtype.fbs similarity index 97% rename from vortex-schema/flatbuffers/schema.fbs rename to vortex-schema/flatbuffers/dtype.fbs index 0a885e1371..ab7e9825e4 100644 --- a/vortex-schema/flatbuffers/schema.fbs +++ b/vortex-schema/flatbuffers/dtype.fbs @@ -1,3 +1,5 @@ +namespace vortex.dtype; + enum Nullability: byte { NonNullable, Nullable, @@ -49,7 +51,6 @@ table Float { table Utf8 { 
nullability: Nullability; - } table Binary { diff --git a/vortex-schema/src/deserialize.rs b/vortex-schema/src/deserialize.rs new file mode 100644 index 0000000000..79097e8db7 --- /dev/null +++ b/vortex-schema/src/deserialize.rs @@ -0,0 +1,149 @@ +use crate::{flatbuffers as fb, FloatWidth, IntWidth, Nullability, Signedness}; +use crate::{CompositeID, DType}; +use std::sync::Arc; +use vortex_error::{vortex_err, VortexError, VortexResult}; +use vortex_flatbuffers::ReadFlatBuffer; + +#[allow(dead_code)] +pub struct DTypeSerdeContext { + composite_ids: Vec, +} + +impl DTypeSerdeContext { + pub fn new(composite_ids: Vec) -> Self { + Self { composite_ids } + } + + pub fn find_composite_id(&self, id: &str) -> Option { + self.composite_ids.iter().find(|c| c.0 == id).copied() + } +} + +impl ReadFlatBuffer for DType { + type Source<'a> = fb::DType<'a>; + type Error = VortexError; + + fn read_flatbuffer( + ctx: &DTypeSerdeContext, + fb: &Self::Source<'_>, + ) -> Result { + match fb.type_type() { + fb::Type::Null => Ok(DType::Null), + fb::Type::Bool => Ok(DType::Bool( + fb.type__as_bool().unwrap().nullability().try_into()?, + )), + fb::Type::Int => { + let fb_int = fb.type__as_int().unwrap(); + Ok(DType::Int( + fb_int.width().try_into()?, + fb_int.signedness().try_into()?, + fb_int.nullability().try_into()?, + )) + } + fb::Type::Float => { + let fb_float = fb.type__as_float().unwrap(); + Ok(DType::Float( + fb_float.width().try_into()?, + fb_float.nullability().try_into()?, + )) + } + fb::Type::Decimal => { + let fb_decimal = fb.type__as_decimal().unwrap(); + Ok(DType::Decimal( + fb_decimal.precision(), + fb_decimal.scale(), + fb_decimal.nullability().try_into()?, + )) + } + fb::Type::Binary => Ok(DType::Binary( + fb.type__as_binary().unwrap().nullability().try_into()?, + )), + fb::Type::Utf8 => Ok(DType::Utf8( + fb.type__as_utf_8().unwrap().nullability().try_into()?, + )), + fb::Type::List => { + let fb_list = fb.type__as_list().unwrap(); + let element_dtype = 
DType::read_flatbuffer(ctx, &fb_list.element_type().unwrap())?; + Ok(DType::List( + Box::new(element_dtype), + fb_list.nullability().try_into()?, + )) + } + fb::Type::Struct_ => { + let fb_struct = fb.type__as_struct_().unwrap(); + let names = fb_struct + .names() + .unwrap() + .iter() + .map(|n| Arc::new(n.to_string())) + .collect::>(); + let fields: Vec = fb_struct + .fields() + .unwrap() + .iter() + .map(|f| DType::read_flatbuffer(ctx, &f)) + .collect::>>()?; + Ok(DType::Struct(names, fields)) + } + fb::Type::Composite => { + let fb_composite = fb.type__as_composite().unwrap(); + let id = ctx + .find_composite_id(fb_composite.id().unwrap()) + .ok_or_else(|| vortex_err!("Couldn't find composite id"))?; + Ok(DType::Composite(id, fb_composite.nullability().try_into()?)) + } + _ => Err(vortex_err!("Unknown DType variant")), + } + } +} + +impl TryFrom for Nullability { + type Error = VortexError; + + fn try_from(value: fb::Nullability) -> VortexResult { + match value { + fb::Nullability::NonNullable => Ok(Nullability::NonNullable), + fb::Nullability::Nullable => Ok(Nullability::Nullable), + _ => Err(vortex_err!("Unknown nullability value")), + } + } +} + +impl TryFrom for IntWidth { + type Error = VortexError; + + fn try_from(value: fb::IntWidth) -> VortexResult { + match value { + fb::IntWidth::_8 => Ok(IntWidth::_8), + fb::IntWidth::_16 => Ok(IntWidth::_16), + fb::IntWidth::_32 => Ok(IntWidth::_32), + fb::IntWidth::_64 => Ok(IntWidth::_64), + _ => Err(vortex_err!("Unknown IntWidth value")), + } + } +} + +impl TryFrom for Signedness { + type Error = VortexError; + + fn try_from(value: fb::Signedness) -> VortexResult { + match value { + fb::Signedness::Unsigned => Ok(Signedness::Unsigned), + fb::Signedness::Signed => Ok(Signedness::Signed), + _ => Err(vortex_err!("Unknown Signedness value")), + } + } +} + +impl TryFrom for FloatWidth { + type Error = VortexError; + + fn try_from(value: fb::FloatWidth) -> VortexResult { + match value { + fb::FloatWidth::_16 => 
Ok(FloatWidth::_16), + fb::FloatWidth::_32 => Ok(FloatWidth::_32), + fb::FloatWidth::_64 => Ok(FloatWidth::_64), + _ => Err(vortex_err!("Unknown IntWidth value")), + } + } +} diff --git a/vortex-schema/src/lib.rs b/vortex-schema/src/lib.rs index 6aa154d244..3fdaf2a021 100644 --- a/vortex-schema/src/lib.rs +++ b/vortex-schema/src/lib.rs @@ -1,11 +1,12 @@ use std::fmt::{Display, Formatter}; pub use dtype::*; -pub use serde::FbDeserialize; -pub use serde::FbSerialize; +mod deserialize; mod dtype; -mod serde; +mod serialize; + +pub use deserialize::*; #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)] pub struct CompositeID(pub &'static str); @@ -16,11 +17,13 @@ impl Display for CompositeID { } } -#[allow(unused_imports)] -#[allow(dead_code)] -#[allow(clippy::needless_lifetimes)] -#[allow(clippy::extra_unused_lifetimes)] -#[allow(non_camel_case_types)] -mod generated { - include!(concat!(env!("OUT_DIR"), "/flatbuffers/schema.rs")); +pub mod flatbuffers { + #[allow(unused_imports)] + #[allow(dead_code)] + #[allow(clippy::all)] + #[allow(non_camel_case_types)] + mod generated { + include!(concat!(env!("OUT_DIR"), "/flatbuffers/dtype.rs")); + } + pub use generated::vortex::dtype::*; } diff --git a/vortex-schema/src/serde.rs b/vortex-schema/src/serde.rs deleted file mode 100644 index 58f07b6e0a..0000000000 --- a/vortex-schema/src/serde.rs +++ /dev/null @@ -1,405 +0,0 @@ -use std::sync::Arc; - -use flatbuffers::{FlatBufferBuilder, WIPOffset}; -use vortex_error::{vortex_err, VortexError, VortexResult}; - -use crate::generated::{ - root_as_dtype, Bool, BoolArgs, Composite, CompositeArgs, Int, IntArgs, List, ListArgs, Null, - NullArgs, Struct_, Struct_Args, Type, -}; -use crate::generated::{Binary, BinaryArgs, Signedness as FbSignedness}; -use crate::generated::{DType as FbDType, DTypeArgs}; -use crate::generated::{Decimal, DecimalArgs, FloatWidth as FbFloatWidth}; -use crate::generated::{Float, FloatArgs, IntWidth as FbIntWidth}; -use crate::generated::{Nullability 
as FbNullability, Utf8, Utf8Args}; -use crate::{CompositeID, DType, FloatWidth, IntWidth, Nullability, Signedness}; - -pub trait FbSerialize<'a> { - type OffsetType; - - // Convert self to flatbuffer representation, returns written bytes and index of valid data - // If you want to serialize multiple objects you should prefer serialize_to_builder to reuse the allocated memory - fn serialize(&self) -> (Vec, usize) { - let mut fbb = FlatBufferBuilder::new(); - let wip_dtype = self.serialize_to_builder(&mut fbb); - fbb.finish_minimal(wip_dtype); - fbb.collapse() - } - - fn serialize_to_builder(&self, fbb: &mut FlatBufferBuilder<'a>) -> WIPOffset; -} - -pub trait FbDeserialize<'a>: Sized { - type OffsetType; - - fn deserialize(bytes: &[u8], find_id: fn(&str) -> Option) -> VortexResult; - - fn convert_from_fb( - fb_type: Self::OffsetType, - find_id: fn(&str) -> Option, - ) -> VortexResult; -} - -impl<'a> FbSerialize<'a> for DType { - type OffsetType = FbDType<'a>; - - fn serialize_to_builder(&self, fbb: &mut FlatBufferBuilder<'a>) -> WIPOffset { - let (dtype_union, dtype_union_variant) = match self { - DType::Null => (Null::create(fbb, &NullArgs {}).as_union_value(), Type::Null), - DType::Bool(n) => ( - Bool::create( - fbb, - &BoolArgs { - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Bool, - ), - DType::Int(w, s, n) => ( - Int::create( - fbb, - &IntArgs { - width: w.into(), - signedness: s.into(), - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Int, - ), - DType::Decimal(p, s, n) => ( - Decimal::create( - fbb, - &DecimalArgs { - precision: *p, - scale: *s, - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Decimal, - ), - DType::Float(w, n) => ( - Float::create( - fbb, - &FloatArgs { - width: w.into(), - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Float, - ), - DType::Utf8(n) => ( - Utf8::create( - fbb, - &Utf8Args { - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Utf8, - ), - DType::Binary(n) => 
( - Binary::create( - fbb, - &BinaryArgs { - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Binary, - ), - DType::Struct(ns, fs) => { - let name_offsets = ns - .iter() - .map(|n| fbb.create_string(n.as_ref())) - .collect::>(); - fbb.start_vector::>(ns.len()); - for name in name_offsets.iter().rev() { - fbb.push(name); - } - let names_vector = fbb.end_vector(ns.len()); - - let dtype_offsets = fs - .iter() - .map(|f| f.serialize_to_builder(fbb)) - .collect::>(); - fbb.start_vector::>(fs.len()); - for doff in dtype_offsets.iter().rev() { - fbb.push(doff); - } - let fields_vector = fbb.end_vector(fs.len()); - - ( - Struct_::create( - fbb, - &Struct_Args { - names: Some(names_vector), - fields: Some(fields_vector), - }, - ) - .as_union_value(), - Type::Struct_, - ) - } - DType::List(e, n) => { - let fb_dtype = e.as_ref().serialize_to_builder(fbb); - ( - List::create( - fbb, - &ListArgs { - element_type: Some(fb_dtype), - nullability: n.into(), - }, - ) - .as_union_value(), - Type::List, - ) - } - DType::Composite(id, n) => { - let id = fbb.create_string(id.0); - ( - Composite::create( - fbb, - &CompositeArgs { - id: Some(id), - nullability: n.into(), - }, - ) - .as_union_value(), - Type::Composite, - ) - } - }; - - FbDType::create( - fbb, - &DTypeArgs { - type_type: dtype_union_variant, - type_: Some(dtype_union), - }, - ) - } -} - -impl<'a> FbDeserialize<'a> for DType { - type OffsetType = FbDType<'a>; - - fn deserialize(bytes: &[u8], find_id: fn(&str) -> Option) -> VortexResult { - root_as_dtype(bytes) - .map_err(|e| vortex_err!("Unable to read bytes as DType: {}", e)) - .and_then(|d| Self::convert_from_fb(d, find_id)) - } - - fn convert_from_fb( - fb_type: Self::OffsetType, - find_id: fn(&str) -> Option, - ) -> VortexResult { - match fb_type.type_type() { - Type::Null => Ok(DType::Null), - Type::Bool => Ok(DType::Bool( - fb_type.type__as_bool().unwrap().nullability().try_into()?, - )), - Type::Int => { - let fb_int = fb_type.type__as_int().unwrap(); - 
Ok(DType::Int( - fb_int.width().try_into()?, - fb_int.signedness().try_into()?, - fb_int.nullability().try_into()?, - )) - } - Type::Float => { - let fb_float = fb_type.type__as_float().unwrap(); - Ok(DType::Float( - fb_float.width().try_into()?, - fb_float.nullability().try_into()?, - )) - } - Type::Decimal => { - let fb_decimal = fb_type.type__as_decimal().unwrap(); - Ok(DType::Decimal( - fb_decimal.precision(), - fb_decimal.scale(), - fb_decimal.nullability().try_into()?, - )) - } - Type::Binary => Ok(DType::Binary( - fb_type - .type__as_binary() - .unwrap() - .nullability() - .try_into()?, - )), - Type::Utf8 => Ok(DType::Utf8( - fb_type.type__as_utf_8().unwrap().nullability().try_into()?, - )), - Type::List => { - let fb_list = fb_type.type__as_list().unwrap(); - let element_dtype = - DType::convert_from_fb(fb_list.element_type().unwrap(), find_id)?; - Ok(DType::List( - Box::new(element_dtype), - fb_list.nullability().try_into()?, - )) - } - Type::Struct_ => { - let fb_struct = fb_type.type__as_struct_().unwrap(); - let names = fb_struct - .names() - .unwrap() - .iter() - .map(|n| Arc::new(n.to_string())) - .collect::>(); - let fields: Vec = fb_struct - .fields() - .unwrap() - .iter() - .map(|f| DType::convert_from_fb(f, find_id)) - .collect::>>()?; - Ok(DType::Struct(names, fields)) - } - Type::Composite => { - let fb_composite = fb_type.type__as_composite().unwrap(); - let id = find_id(fb_composite.id().unwrap()) - .ok_or_else(|| vortex_err!("Couldn't find composite id"))?; - Ok(DType::Composite(id, fb_composite.nullability().try_into()?)) - } - _ => Err(vortex_err!("Unknown DType variant")), - } - } -} - -impl From<&Nullability> for FbNullability { - fn from(value: &Nullability) -> Self { - match value { - Nullability::NonNullable => FbNullability::NonNullable, - Nullability::Nullable => FbNullability::Nullable, - } - } -} - -impl TryFrom for Nullability { - type Error = VortexError; - - fn try_from(value: FbNullability) -> VortexResult { - match value { - 
FbNullability::NonNullable => Ok(Nullability::NonNullable), - FbNullability::Nullable => Ok(Nullability::Nullable), - _ => Err(vortex_err!("Unknown nullability value")), - } - } -} - -impl From<&IntWidth> for FbIntWidth { - fn from(value: &IntWidth) -> Self { - match value { - IntWidth::_8 => FbIntWidth::_8, - IntWidth::_16 => FbIntWidth::_16, - IntWidth::_32 => FbIntWidth::_32, - IntWidth::_64 => FbIntWidth::_64, - } - } -} - -impl TryFrom for IntWidth { - type Error = VortexError; - - fn try_from(value: FbIntWidth) -> VortexResult { - match value { - FbIntWidth::_8 => Ok(IntWidth::_8), - FbIntWidth::_16 => Ok(IntWidth::_16), - FbIntWidth::_32 => Ok(IntWidth::_32), - FbIntWidth::_64 => Ok(IntWidth::_64), - _ => Err(vortex_err!("Unknown IntWidth value")), - } - } -} - -impl From<&Signedness> for FbSignedness { - fn from(value: &Signedness) -> Self { - match value { - Signedness::Unsigned => FbSignedness::Unsigned, - Signedness::Signed => FbSignedness::Signed, - } - } -} - -impl TryFrom for Signedness { - type Error = VortexError; - - fn try_from(value: FbSignedness) -> VortexResult { - match value { - FbSignedness::Unsigned => Ok(Signedness::Unsigned), - FbSignedness::Signed => Ok(Signedness::Signed), - _ => Err(vortex_err!("Unknown Signedness value")), - } - } -} - -impl From<&FloatWidth> for FbFloatWidth { - fn from(value: &FloatWidth) -> Self { - match value { - FloatWidth::_16 => FbFloatWidth::_16, - FloatWidth::_32 => FbFloatWidth::_32, - FloatWidth::_64 => FbFloatWidth::_64, - } - } -} - -impl TryFrom for FloatWidth { - type Error = VortexError; - - fn try_from(value: FbFloatWidth) -> VortexResult { - match value { - FbFloatWidth::_16 => Ok(FloatWidth::_16), - FbFloatWidth::_32 => Ok(FloatWidth::_32), - FbFloatWidth::_64 => Ok(FloatWidth::_64), - _ => Err(vortex_err!("Unknown IntWidth value")), - } - } -} - -#[cfg(test)] -mod test { - use std::sync::Arc; - - use crate::{DType, FbDeserialize, FbSerialize, FloatWidth, IntWidth, Nullability, Signedness}; - - fn 
roundtrip_dtype(dtype: DType) { - let (bytes, head) = dtype.serialize(); - let deserialized = - DType::deserialize(&bytes[head..], |_| panic!("no composite ids")).unwrap(); - assert_eq!(dtype, deserialized); - } - - #[test] - fn roundtrip() { - roundtrip_dtype(DType::Null); - roundtrip_dtype(DType::Bool(Nullability::NonNullable)); - roundtrip_dtype(DType::Int( - IntWidth::_64, - Signedness::Unsigned, - Nullability::NonNullable, - )); - roundtrip_dtype(DType::Decimal(18, 9, Nullability::NonNullable)); - roundtrip_dtype(DType::Float(FloatWidth::_64, Nullability::NonNullable)); - roundtrip_dtype(DType::Binary(Nullability::NonNullable)); - roundtrip_dtype(DType::Utf8(Nullability::NonNullable)); - roundtrip_dtype(DType::List( - Box::new(DType::Float(FloatWidth::_32, Nullability::Nullable)), - Nullability::NonNullable, - )); - roundtrip_dtype(DType::Struct( - vec![ - Arc::new("strings".to_string()), - Arc::new("ints".to_string()), - ], - vec![ - DType::Utf8(Nullability::NonNullable), - DType::Int(IntWidth::_16, Signedness::Unsigned, Nullability::Nullable), - ], - )) - } -} diff --git a/vortex-schema/src/serialize.rs b/vortex-schema/src/serialize.rs new file mode 100644 index 0000000000..1338b7efe1 --- /dev/null +++ b/vortex-schema/src/serialize.rs @@ -0,0 +1,223 @@ +use flatbuffers::{FlatBufferBuilder, WIPOffset}; +use itertools::Itertools; +use vortex_flatbuffers::{FlatBufferRoot, WriteFlatBuffer}; + +use crate::flatbuffers as fb; +use crate::{DType, FloatWidth, IntWidth, Nullability, Signedness}; + +impl FlatBufferRoot for DType {} +impl WriteFlatBuffer for DType { + type Target<'a> = fb::DType<'a>; + + fn write_flatbuffer<'fb>( + &self, + fbb: &mut FlatBufferBuilder<'fb>, + ) -> WIPOffset> { + let dtype_union = match self { + DType::Null => fb::Null::create(fbb, &fb::NullArgs {}).as_union_value(), + DType::Bool(n) => fb::Bool::create( + fbb, + &fb::BoolArgs { + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Int(width, signednedss, n) => fb::Int::create( 
+ fbb, + &fb::IntArgs { + width: width.into(), + signedness: signednedss.into(), + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Decimal(p, s, n) => fb::Decimal::create( + fbb, + &fb::DecimalArgs { + precision: *p, + scale: *s, + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Float(width, n) => fb::Float::create( + fbb, + &fb::FloatArgs { + width: width.into(), + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Utf8(n) => fb::Utf8::create( + fbb, + &fb::Utf8Args { + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Binary(n) => fb::Binary::create( + fbb, + &fb::BinaryArgs { + nullability: n.into(), + }, + ) + .as_union_value(), + DType::Struct(names, dtypes) => { + let names = names + .iter() + .map(|n| fbb.create_string(n.as_str())) + .collect_vec(); + let names = Some(fbb.create_vector(&names)); + + let dtypes = dtypes + .iter() + .map(|dtype| dtype.write_flatbuffer(fbb)) + .collect_vec(); + let fields = Some(fbb.create_vector(&dtypes)); + + fb::Struct_::create(fbb, &fb::Struct_Args { names, fields }).as_union_value() + } + DType::List(e, n) => { + let element_type = Some(e.as_ref().write_flatbuffer(fbb)); + fb::List::create( + fbb, + &fb::ListArgs { + element_type, + nullability: n.into(), + }, + ) + .as_union_value() + } + DType::Composite(id, n) => { + let id = Some(fbb.create_string(id.0)); + fb::Composite::create( + fbb, + &fb::CompositeArgs { + id, + nullability: n.into(), + }, + ) + .as_union_value() + } + }; + + let dtype_type = match self { + DType::Null => fb::Type::Null, + DType::Bool(_) => fb::Type::Bool, + DType::Int(_, _, _) => fb::Type::Int, + DType::Decimal(_, _, _) => fb::Type::Decimal, + DType::Float(_, _) => fb::Type::Float, + DType::Utf8(_) => fb::Type::Utf8, + DType::Binary(_) => fb::Type::Binary, + DType::Struct(_, _) => fb::Type::Struct_, + DType::List(_, _) => fb::Type::List, + DType::Composite(_, _) => fb::Type::Composite, + }; + + fb::DType::create( + fbb, + &fb::DTypeArgs { + 
type_type: dtype_type, + type_: Some(dtype_union), + }, + ) + } +} + +impl From for fb::Nullability { + fn from(value: Nullability) -> Self { + match value { + Nullability::NonNullable => fb::Nullability::NonNullable, + Nullability::Nullable => fb::Nullability::Nullable, + } + } +} + +impl From<&Nullability> for fb::Nullability { + fn from(value: &Nullability) -> Self { + match value { + Nullability::NonNullable => fb::Nullability::NonNullable, + Nullability::Nullable => fb::Nullability::Nullable, + } + } +} + +impl From<&IntWidth> for fb::IntWidth { + fn from(value: &IntWidth) -> Self { + match value { + IntWidth::_8 => fb::IntWidth::_8, + IntWidth::_16 => fb::IntWidth::_16, + IntWidth::_32 => fb::IntWidth::_32, + IntWidth::_64 => fb::IntWidth::_64, + } + } +} + +impl From<&Signedness> for fb::Signedness { + fn from(value: &Signedness) -> Self { + match value { + Signedness::Unsigned => fb::Signedness::Unsigned, + Signedness::Signed => fb::Signedness::Signed, + } + } +} + +impl From<&FloatWidth> for fb::FloatWidth { + fn from(value: &FloatWidth) -> Self { + match value { + FloatWidth::_16 => fb::FloatWidth::_16, + FloatWidth::_32 => fb::FloatWidth::_32, + FloatWidth::_64 => fb::FloatWidth::_64, + } + } +} + +#[cfg(test)] +mod test { + use crate::flatbuffers as fb; + use flatbuffers::{root, FlatBufferBuilder}; + use std::sync::Arc; + use vortex_flatbuffers::{ReadFlatBuffer, WriteFlatBuffer}; + + use crate::{DType, DTypeSerdeContext, FloatWidth, IntWidth, Nullability, Signedness}; + + fn roundtrip_dtype(dtype: DType) { + let mut fbb = FlatBufferBuilder::new(); + let root_offset = dtype.write_flatbuffer(&mut fbb); + fbb.finish_minimal(root_offset); + + let bytes = fbb.finished_data(); + let deserialized = DType::read_flatbuffer( + &DTypeSerdeContext::new(vec![]), + &root::(bytes).unwrap(), + ) + .unwrap(); + assert_eq!(dtype, deserialized); + } + + #[test] + fn roundtrip() { + roundtrip_dtype(DType::Null); + roundtrip_dtype(DType::Bool(Nullability::NonNullable)); + 
roundtrip_dtype(DType::Int( + IntWidth::_64, + Signedness::Unsigned, + Nullability::NonNullable, + )); + roundtrip_dtype(DType::Decimal(18, 9, Nullability::NonNullable)); + roundtrip_dtype(DType::Float(FloatWidth::_64, Nullability::NonNullable)); + roundtrip_dtype(DType::Binary(Nullability::NonNullable)); + roundtrip_dtype(DType::Utf8(Nullability::NonNullable)); + roundtrip_dtype(DType::List( + Box::new(DType::Float(FloatWidth::_32, Nullability::Nullable)), + Nullability::NonNullable, + )); + roundtrip_dtype(DType::Struct( + vec![Arc::new("strings".into()), Arc::new("ints".into())], + vec![ + DType::Utf8(Nullability::NonNullable), + DType::Int(IntWidth::_16, Signedness::Unsigned, Nullability::Nullable), + ], + )) + } +} diff --git a/vortex-zigzag/src/lib.rs b/vortex-zigzag/src/lib.rs index 904c0baff0..0f4170dba6 100644 --- a/vortex-zigzag/src/lib.rs +++ b/vortex-zigzag/src/lib.rs @@ -1,5 +1,5 @@ use linkme::distributed_slice; -use vortex::array::{EncodingRef, ENCODINGS}; +use vortex::encoding::{EncodingRef, ENCODINGS}; pub use zigzag::*; diff --git a/vortex-zigzag/src/serde.rs b/vortex-zigzag/src/serde.rs index e4d340b4a1..ea43957ca1 100644 --- a/vortex-zigzag/src/serde.rs +++ b/vortex-zigzag/src/serde.rs @@ -9,6 +9,10 @@ impl ArraySerde for ZigZagArray { fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { ctx.write(self.encoded()) } + + fn metadata(&self) -> VortexResult>> { + Ok(None) + } } impl EncodingSerde for ZigZagEncoding { diff --git a/vortex-zigzag/src/zigzag.rs b/vortex-zigzag/src/zigzag.rs index cec402df80..bd11e785c7 100644 --- a/vortex-zigzag/src/zigzag.rs +++ b/vortex-zigzag/src/zigzag.rs @@ -1,12 +1,14 @@ use std::sync::{Arc, RwLock}; -use vortex::array::{Array, ArrayKind, ArrayRef, Encoding, EncodingId, EncodingRef}; +use vortex::array::{Array, ArrayKind, ArrayRef}; use vortex::compress::EncodingCompression; +use vortex::compute::ArrayCompute; +use vortex::encoding::{Encoding, EncodingId, EncodingRef}; use vortex::formatter::{ArrayDisplay, 
ArrayFormatter}; -use vortex::impl_array; use vortex::serde::{ArraySerde, EncodingSerde}; use vortex::stats::{Stats, StatsSet}; use vortex::validity::{ArrayValidity, Validity}; +use vortex::{impl_array, ArrayWalker}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_schema::{DType, Signedness}; @@ -90,6 +92,10 @@ impl Array for ZigZagArray { fn serde(&self) -> Option<&dyn ArraySerde> { Some(self) } + + fn walk(&self, walker: &mut dyn ArrayWalker) -> VortexResult<()> { + walker.visit_child(self.encoded()) + } } impl ArrayDisplay for ZigZagArray {