From 908888b5c65889493fe6ca32618aca9e5f5eaff9 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 12:41:37 -0400 Subject: [PATCH 1/9] successfully importing csv2parquet crate' --- Cargo.lock | 531 +++++++++++++++++++++------ optd-perftest/Cargo.toml | 4 +- optd-perftest/src/datafusion_dbms.rs | 8 +- optd-perftest/src/job.rs | 6 + optd-perftest/src/tpch.rs | 6 + 5 files changed, 435 insertions(+), 120 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f92018a..27e73843 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,19 +186,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fab9e93ba8ce88a37d5a30dce4b9913b75413dc1ac56cb5d72e5a840543f829" dependencies = [ "ahash", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 47.0.0", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-cast 47.0.0", + "arrow-csv 47.0.0", + "arrow-data 47.0.0", + "arrow-ipc 47.0.0", + "arrow-json 47.0.0", + "arrow-ord 47.0.0", + "arrow-row 47.0.0", + "arrow-schema 47.0.0", + "arrow-select 47.0.0", + "arrow-string 47.0.0", +] + +[[package]] +name = "arrow" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219d05930b81663fd3b32e3bde8ce5bff3c4d23052a99f11a8fa50a3b47b2658" +dependencies = [ + "arrow-arith 51.0.0", + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-cast 51.0.0", + "arrow-csv 51.0.0", + "arrow-data 51.0.0", + "arrow-ipc 51.0.0", + "arrow-json 51.0.0", + "arrow-ord 51.0.0", + "arrow-row 51.0.0", + "arrow-schema 51.0.0", + "arrow-select 51.0.0", + "arrow-string 51.0.0", ] [[package]] @@ -207,10 +228,25 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956" dependencies = [ - "arrow-array", - 
"arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "chrono", + "half", + "num 0.4.1", +] + +[[package]] +name = "arrow-arith" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0272150200c07a86a390be651abdd320a2d12e84535f0837566ca87ecd8f95e0" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "chrono", "half", "num 0.4.1", @@ -223,9 +259,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", "chrono", "chrono-tz", "half", @@ -233,6 +269,22 @@ dependencies = [ "num 0.4.1", ] +[[package]] +name = "arrow-array" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8010572cf8c745e242d1b632bd97bd6d4f40fefed5ed1290a8f433abaa686fea" +dependencies = [ + "ahash", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", + "chrono", + "half", + "hashbrown 0.14.3", + "num 0.4.1", +] + [[package]] name = "arrow-buffer" version = "47.0.0" @@ -244,17 +296,28 @@ dependencies = [ "num 0.4.1", ] +[[package]] +name = "arrow-buffer" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d0a2432f0cba5692bf4cb757469c66791394bac9ec7ce63c1afe74744c37b27" +dependencies = [ + "bytes", + "half", + "num 0.4.1", +] + [[package]] name = "arrow-cast" version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 47.0.0", + 
"arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "arrow-select 47.0.0", "chrono", "comfy-table", "half", @@ -262,17 +325,56 @@ dependencies = [ "num 0.4.1", ] +[[package]] +name = "arrow-cast" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9abc10cd7995e83505cc290df9384d6e5412b207b79ce6bdff89a10505ed2cba" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", + "arrow-select 51.0.0", + "atoi", + "base64 0.22.1", + "chrono", + "half", + "lexical-core", + "num 0.4.1", + "ryu", +] + [[package]] name = "arrow-csv" version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43ef855dc6b126dc197f43e061d4de46b9d4c033aa51c2587657f7508242cef1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-cast 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-csv" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95cbcba196b862270bf2a5edb75927380a7f3a163622c61d40cbba416a6305f2" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-cast 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "chrono", "csv", "csv-core", @@ -287,8 +389,20 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 47.0.0", + "arrow-schema 47.0.0", + "half", + "num 0.4.1", +] + +[[package]] +name = "arrow-data" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2742ac1f6650696ab08c88f6dd3f0eb68ce10f8c253958a18c943a68cd04aec5" +dependencies = [ + "arrow-buffer 51.0.0", + "arrow-schema 51.0.0", "half", "num 0.4.1", ] @@ -299,11 +413,25 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-cast 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "flatbuffers", +] + +[[package]] +name = "arrow-ipc" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a42ea853130f7e78b9b9d178cb4cd01dee0f78e64d96c2949dc0a915d6d9e19d" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-cast 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "flatbuffers", ] @@ -313,11 +441,31 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f03d7e3b04dd688ccec354fe449aed56b831679f03e44ee2c1cfc4045067b69c" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-cast 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "chrono", + "half", + "indexmap 2.1.0", + "lexical-core", + "num 0.4.1", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-json" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaafb5714d4e59feae964714d724f880511500e3569cc2a94d02456b403a2a49" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-cast 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "chrono", "half", "indexmap 2.1.0", @@ -333,11 +481,26 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "arrow-select 47.0.0", + "half", + "num 0.4.1", +] + +[[package]] +name = "arrow-ord" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3e6b61e3dc468f503181dccc2fc705bdcc5f2f146755fa5b56d0a6c5943f412" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", + "arrow-select 51.0.0", "half", "num 0.4.1", ] @@ -349,10 +512,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "half", + "hashbrown 0.14.3", +] + +[[package]] +name = "arrow-row" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848ee52bb92eb459b811fb471175ea3afcf620157674c8794f539838920f9228" +dependencies = [ + "ahash", + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "half", "hashbrown 0.14.3", ] @@ -363,6 +541,15 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" +[[package]] +name = "arrow-schema" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" +dependencies = [ + "serde", +] + [[package]] name = "arrow-select" version = "47.0.0" @@ -370,10 +557,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "num 0.4.1", +] + +[[package]] +name = "arrow-select" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "849524fa70e0e3c5ab58394c770cb8f514d0122d20de08475f7b472ed8075830" +dependencies = [ + "ahash", + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", "num 0.4.1", ] @@ -383,16 +584,38 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4cebbb282d6b9244895f4a9a912e55e57bce112554c7fa91fcec5459cb421ab" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-data 47.0.0", + "arrow-schema 47.0.0", + "arrow-select 47.0.0", "num 0.4.1", "regex", "regex-syntax 0.7.5", ] +[[package]] +name = "arrow-string" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9373cb5a021aee58863498c37eb484998ef13377f69989c6c5ccfbd258236cdb" +dependencies = [ + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-data 51.0.0", + "arrow-schema 51.0.0", + "arrow-select 51.0.0", + "memchr", + "num 0.4.1", + "regex", + "regex-syntax 0.8.2", +] + +[[package]] +name = "arrow-tools" +version = "0.18.0" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#bc06d5038e8febf5972e8646425b7da65af776fc" + [[package]] name = "assert_approx_eq" version = "1.1.0" @@ -454,6 +677,15 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", 
+] + [[package]] name = "atty" version = "0.2.14" @@ -791,6 +1023,12 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1012,12 +1250,12 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.2" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", - "clap_derive 4.5.0", + "clap_derive 4.5.4", ] [[package]] @@ -1038,7 +1276,7 @@ version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro-error", "proc-macro2 1.0.78", "quote 1.0.35", @@ -1047,11 +1285,11 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.0" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2 1.0.78", "quote 1.0.35", "syn 2.0.48", @@ -1284,6 +1522,19 @@ dependencies = [ "memchr", ] +[[package]] +name = "csv2parquet" +version = "0.18.0" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#bc06d5038e8febf5972e8646425b7da65af776fc" +dependencies = [ + "arrow 51.0.0", + "arrow-schema 51.0.0", + "arrow-tools", + "clap 4.5.4", + "parquet 51.0.0", + "serde_json", +] + [[package]] name = 
"ctor" version = "0.2.6" @@ -1356,9 +1607,9 @@ checksum = "7014432223f4d721cb9786cd88bb89e7464e0ba984d4a7f49db7787f5f268674" dependencies = [ "ahash", "apache-avro", - "arrow", - "arrow-array", - "arrow-schema", + "arrow 47.0.0", + "arrow-array 47.0.0", + "arrow-schema 47.0.0", "async-compression", "async-trait", "bytes", @@ -1384,7 +1635,7 @@ dependencies = [ "num_cpus", "object_store", "parking_lot", - "parquet", + "parquet 47.0.0", "percent-encoding", "pin-project-lite", "rand 0.8.5", @@ -1406,15 +1657,15 @@ checksum = "cb3903ed8f102892f17b48efa437f3542159241d41c564f0d1e78efdc5e663aa" dependencies = [ "ahash", "apache-avro", - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow 47.0.0", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-schema 47.0.0", "chrono", "half", "num_cpus", "object_store", - "parquet", + "parquet 47.0.0", "sqlparser", ] @@ -1424,7 +1675,7 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "780b73b2407050e53f51a9781868593f694102c59e622de9a8aafc0343c4f237" dependencies = [ - "arrow", + "arrow 47.0.0", "chrono", "dashmap", "datafusion-common", @@ -1446,8 +1697,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24c382676338d8caba6c027ba0da47260f65ffedab38fda78f6d8043f607557c" dependencies = [ "ahash", - "arrow", - "arrow-array", + "arrow 47.0.0", + "arrow-array 47.0.0", "datafusion-common", "sqlparser", "strum", @@ -1458,7 +1709,7 @@ dependencies = [ name = "datafusion-optd-cli" version = "32.0.0" dependencies = [ - "arrow", + "arrow 47.0.0", "assert_cmd", "async-trait", "aws-config", @@ -1489,7 +1740,7 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f2904a432f795484fd45e29ded4537152adb60f636c05691db34fcd94c92c96" dependencies = [ - "arrow", + "arrow 47.0.0", "async-trait", "chrono", "datafusion-common", @@ -1508,11 +1759,11 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "57b4968e9a998dc0476c4db7a82f280e2026b25f464e4aa0c3bb9807ee63ddfd" dependencies = [ "ahash", - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", - "base64", + "arrow 47.0.0", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-schema 47.0.0", + "base64 0.21.5", "blake2", "blake3", "chrono", @@ -1542,10 +1793,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efd0d1fe54e37a47a2d58a1232c22786f2c28ad35805fdcd08f0253a8b0aaa90" dependencies = [ "ahash", - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", + "arrow 47.0.0", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-schema 47.0.0", "async-trait", "chrono", "datafusion-common", @@ -1572,8 +1823,8 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b568d44c87ead99604d704f942e257c8a236ee1bbf890ee3e034ad659dcb2c21" dependencies = [ - "arrow", - "arrow-schema", + "arrow 47.0.0", + "arrow-schema 47.0.0", "datafusion-common", "datafusion-expr", "log", @@ -2046,6 +2297,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -2484,6 +2741,15 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +dependencies = [ + "twox-hash", +] + [[package]] name = "lzma-sys" version = "0.1.20" @@ -2507,9 +2773,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "mimalloc" @@ -2743,7 +3009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" dependencies = [ "async-trait", - "base64", + "base64 0.21.5", "bytes", "chrono", "futures", @@ -2804,7 +3070,7 @@ name = "optd-core" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema", + "arrow-schema 47.0.0", "chrono", "itertools 0.11.0", "num-derive", @@ -2821,7 +3087,7 @@ name = "optd-datafusion-bridge" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema", + "arrow-schema 47.0.0", "async-recursion", "async-trait", "datafusion", @@ -2839,7 +3105,7 @@ name = "optd-datafusion-repr" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema", + "arrow-schema 47.0.0", "assert_approx_eq", "async-trait", "bincode", @@ -2885,7 +3151,8 @@ dependencies = [ "anyhow", "assert_cmd", "async-trait", - "clap 4.5.2", + "clap 4.5.4", + "csv2parquet", "datafusion", "datafusion-optd-cli", "env_logger 0.11.2", @@ -2897,7 +3164,7 @@ dependencies = [ "optd-datafusion-bridge", "optd-datafusion-repr", "optd-gungnir", - "parquet", + "parquet 47.0.0", "prettytable-rs", "rayon", "regex", @@ -3007,14 +3274,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", + "arrow-array 47.0.0", + "arrow-buffer 47.0.0", + "arrow-cast 47.0.0", + "arrow-data 47.0.0", + "arrow-ipc 47.0.0", + "arrow-schema 47.0.0", + "arrow-select 47.0.0", + "base64 0.21.5", "brotli", "bytes", "chrono", @@ -3034,6 +3301,38 @@ dependencies = [ "zstd 
0.12.4", ] +[[package]] +name = "parquet" +version = "51.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "096795d4f47f65fd3ee1ec5a98b77ab26d602f2cc785b0e4be5443add17ecc32" +dependencies = [ + "ahash", + "arrow-array 51.0.0", + "arrow-buffer 51.0.0", + "arrow-cast 51.0.0", + "arrow-data 51.0.0", + "arrow-ipc 51.0.0", + "arrow-schema 51.0.0", + "arrow-select 51.0.0", + "base64 0.22.1", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.14.3", + "lz4_flex", + "num 0.4.1", + "num-bigint 0.4.4", + "paste", + "seq-macro", + "snap", + "thrift", + "twox-hash", + "zstd 0.13.0", +] + [[package]] name = "parse-zoneinfo" version = "0.3.0" @@ -3147,7 +3446,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" dependencies = [ - "base64", + "base64 0.21.5", "byteorder", "bytes", "fallible-iterator", @@ -3551,7 +3850,7 @@ version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ - "base64", + "base64 0.21.5", "bytes", "encoding_rs", "futures-core", @@ -3718,7 +4017,7 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "base64", + "base64 0.21.5", ] [[package]] @@ -3857,9 +4156,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.108" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "itoa", "ryu", @@ -3884,7 +4183,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ee80b0e361bbf88fd2f6e242ccd19cfda072cb0faa6ae694ecee08199938569a" dependencies = [ - "base64", + "base64 0.21.5", "chrono", "hex", "indexmap 1.9.3", @@ -3990,7 +4289,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2 1.0.78", "quote 1.0.35", "syn 1.0.109", @@ -4122,7 +4421,7 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2 1.0.78", "quote 1.0.35", "rustversion", diff --git a/optd-perftest/Cargo.toml b/optd-perftest/Cargo.toml index fe08820e..0e8427cb 100644 --- a/optd-perftest/Cargo.toml +++ b/optd-perftest/Cargo.toml @@ -31,7 +31,7 @@ tokio = { version = "1.24", features = [ shlex = "1.3" tokio-postgres = "0.7" regex = "1.10" -clap = { version = "4.5", features = [ +clap = { version = "4.5.4", features = [ "derive", ] } log = "0.4" @@ -47,6 +47,8 @@ itertools = "0.12.1" test-case = "3.3" rayon = "1.10" parquet = "47.0.0" +csv2parquet = { git = "https://github.com/wangpatrick57/arrow-tools.git", branch = "main" } + [dev_dependencies] assert_cmd = "2.0" diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index 2a3ec9a5..ccae4d8d 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -362,7 +362,7 @@ impl DatafusionDBMS { &mut self, tpch_kit_config: &TpchKitConfig, ) -> anyhow::Result<()> { - // Generate the tables. + // Generate the tables and convert them to Parquet. let tpch_kit = TpchKit::build(&self.workspace_dpath)?; tpch_kit.gen_tables(tpch_kit_config)?; @@ -486,14 +486,14 @@ impl DatafusionDBMS { ) -> anyhow::Result<()> { let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; - // Download the tables. 
+ // Download the tables and convert them to Parquet. let job_kit = JobKit::build(&self.workspace_dpath)?; job_kit.download_tables(job_kit_config)?; // Create the tables. Self::create_job_tables(&ctx, &job_kit).await?; - // Load each table using register_csv() + // Load each table using register_csv(). let tbl_fpath_iter = job_kit.get_tbl_fpath_vec("csv").unwrap(); for tbl_fpath in tbl_fpath_iter { let tbl_name = tbl_fpath.file_stem().unwrap().to_str().unwrap(); @@ -527,6 +527,7 @@ impl DatafusionDBMS { // Generate the tables let tpch_kit = TpchKit::build(&self.workspace_dpath)?; tpch_kit.gen_tables(tpch_kit_config)?; + tpch_kit.make_parquet_files(tpch_kit_config)?; // To get the schema of each table. let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_TPCH).await?; @@ -552,6 +553,7 @@ impl DatafusionDBMS { // Generate the tables. let job_kit = JobKit::build(&self.workspace_dpath)?; job_kit.download_tables(job_kit_config)?; + job_kit.make_parquet_files(job_kit_config)?; // To get the schema of each table. 
let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index b6eb4f14..6c7b8641 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -126,6 +126,12 @@ impl JobKit { Ok(()) } + pub fn make_parquet_files(&self, job_kit_config: &JobKitConfig) -> io::Result<()> { + println!("{}", csv2parquet::test::hi()); + panic!(); + Ok(()) + } + /// Convert a tbl_fpath into the table name pub fn get_tbl_name_from_tbl_fpath>(tbl_fpath: P) -> String { tbl_fpath diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index 6c8e53d4..d31fee18 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -146,6 +146,12 @@ impl TpchKit { Ok(()) } + pub fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig) -> io::Result<()> { + println!("{}", csv2parquet::test::hi()); + panic!(); + Ok(()) + } + /// Generates the .sql files for all queries of TPC-H, with one .sql file per query pub fn gen_queries(&self, tpch_kit_config: &TpchKitConfig) -> io::Result<()> { let this_genned_queries_dpath = self.get_this_genned_queries_dpath(tpch_kit_config); From e1ef9c5134e24b2d195f57dd870d8e7604175001 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 13:17:34 -0400 Subject: [PATCH 2/9] now calling convert --- Cargo.lock | 4 ++-- optd-perftest/src/job.rs | 2 -- optd-perftest/src/tpch.rs | 13 +++++++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 27e73843..c5588588 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,7 +614,7 @@ dependencies = [ [[package]] name = "arrow-tools" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#bc06d5038e8febf5972e8646425b7da65af776fc" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#857393e949471d8e6d9e16281a3295cf16f2e155" [[package]] name = "assert_approx_eq" @@ -1525,7 +1525,7 @@ dependencies = [ 
[[package]] name = "csv2parquet" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#bc06d5038e8febf5972e8646425b7da65af776fc" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#857393e949471d8e6d9e16281a3295cf16f2e155" dependencies = [ "arrow 51.0.0", "arrow-schema 51.0.0", diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index 6c7b8641..e4e64e38 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -127,8 +127,6 @@ impl JobKit { } pub fn make_parquet_files(&self, job_kit_config: &JobKitConfig) -> io::Result<()> { - println!("{}", csv2parquet::test::hi()); - panic!(); Ok(()) } diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index d31fee18..cda70798 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -1,4 +1,5 @@ /// A wrapper around tpch-kit +use csv2parquet::Opts; use serde::{Deserialize, Serialize}; use crate::shell; @@ -147,8 +148,16 @@ impl TpchKit { } pub fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig) -> io::Result<()> { - println!("{}", csv2parquet::test::hi()); - panic!(); + let csv_tbl_fpaths = self.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap(); + + for csv_tbl_fpath in csv_tbl_fpaths { + let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); + parquet_tbl_fpath.set_extension("parquet"); + println!("csv_tbl_fpath={:?}, parquet_tbl_fpath={:?}", csv_tbl_fpath, parquet_tbl_fpath); + let opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath); + csv2parquet::convert(opts).unwrap(); + } + Ok(()) } From dd2b756ff08e0c7c676d5d284f609a19777313f0 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 15:03:10 -0400 Subject: [PATCH 3/9] now writing to parquet --- Cargo.lock | 4 ++-- optd-perftest/src/tpch.rs | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5588588..081ab564 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,7 +614,7 @@ 
dependencies = [ [[package]] name = "arrow-tools" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#857393e949471d8e6d9e16281a3295cf16f2e155" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#428e2375bdf0fb341d21b69aff81c15d9cb97d2d" [[package]] name = "assert_approx_eq" @@ -1525,7 +1525,7 @@ dependencies = [ [[package]] name = "csv2parquet" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#857393e949471d8e6d9e16281a3295cf16f2e155" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#428e2375bdf0fb341d21b69aff81c15d9cb97d2d" dependencies = [ "arrow 51.0.0", "arrow-schema 51.0.0", diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index cda70798..9f0cf9c8 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -148,15 +148,16 @@ impl TpchKit { } pub fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig) -> io::Result<()> { - let csv_tbl_fpaths = self.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap(); + // let csv_tbl_fpaths = self.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap(); - for csv_tbl_fpath in csv_tbl_fpaths { - let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); - parquet_tbl_fpath.set_extension("parquet"); - println!("csv_tbl_fpath={:?}, parquet_tbl_fpath={:?}", csv_tbl_fpath, parquet_tbl_fpath); - let opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath); - csv2parquet::convert(opts).unwrap(); - } + // for csv_tbl_fpath in csv_tbl_fpaths { + // let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); + // parquet_tbl_fpath.set_extension("parquet"); + // println!("csv_tbl_fpath={:?}, parquet_tbl_fpath={:?}", csv_tbl_fpath, parquet_tbl_fpath); + // let mut opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath); + // opts.delimiter = '|'; + // csv2parquet::convert(opts).unwrap(); + // } Ok(()) } From 38e10e922c1b1d3561a95c40bfffe5a097ca9c06 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: 
Wed, 1 May 2024 17:44:08 -0400 Subject: [PATCH 4/9] now getting schema from datafusion to pass to csv2parquet --- Cargo.lock | 486 ++++++--------------------- optd-perftest/Cargo.toml | 1 + optd-perftest/src/datafusion_dbms.rs | 82 +++-- optd-perftest/src/job.rs | 23 +- optd-perftest/src/main.rs | 3 +- optd-perftest/src/tpch.rs | 33 +- 6 files changed, 217 insertions(+), 411 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 081ab564..ad909048 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,40 +186,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fab9e93ba8ce88a37d5a30dce4b9913b75413dc1ac56cb5d72e5a840543f829" dependencies = [ "ahash", - "arrow-arith 47.0.0", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-cast 47.0.0", - "arrow-csv 47.0.0", - "arrow-data 47.0.0", - "arrow-ipc 47.0.0", - "arrow-json 47.0.0", - "arrow-ord 47.0.0", - "arrow-row 47.0.0", - "arrow-schema 47.0.0", - "arrow-select 47.0.0", - "arrow-string 47.0.0", -] - -[[package]] -name = "arrow" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219d05930b81663fd3b32e3bde8ce5bff3c4d23052a99f11a8fa50a3b47b2658" -dependencies = [ - "arrow-arith 51.0.0", - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-cast 51.0.0", - "arrow-csv 51.0.0", - "arrow-data 51.0.0", - "arrow-ipc 51.0.0", - "arrow-json 51.0.0", - "arrow-ord 51.0.0", - "arrow-row 51.0.0", - "arrow-schema 51.0.0", - "arrow-select 51.0.0", - "arrow-string 51.0.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] @@ -228,25 +207,10 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - 
"arrow-data 47.0.0", - "arrow-schema 47.0.0", - "chrono", - "half", - "num 0.4.1", -] - -[[package]] -name = "arrow-arith" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0272150200c07a86a390be651abdd320a2d12e84535f0837566ca87ecd8f95e0" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "half", "num 0.4.1", @@ -259,9 +223,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" dependencies = [ "ahash", - "arrow-buffer 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "chrono-tz", "half", @@ -269,22 +233,6 @@ dependencies = [ "num 0.4.1", ] -[[package]] -name = "arrow-array" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8010572cf8c745e242d1b632bd97bd6d4f40fefed5ed1290a8f433abaa686fea" -dependencies = [ - "ahash", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", - "chrono", - "half", - "hashbrown 0.14.3", - "num 0.4.1", -] - [[package]] name = "arrow-buffer" version = "47.0.0" @@ -296,28 +244,17 @@ dependencies = [ "num 0.4.1", ] -[[package]] -name = "arrow-buffer" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d0a2432f0cba5692bf4cb757469c66791394bac9ec7ce63c1afe74744c37b27" -dependencies = [ - "bytes", - "half", - "num 0.4.1", -] - [[package]] name = "arrow-cast" version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "arrow-select 47.0.0", + "arrow-array", + 
"arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "chrono", "comfy-table", "half", @@ -325,56 +262,17 @@ dependencies = [ "num 0.4.1", ] -[[package]] -name = "arrow-cast" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9abc10cd7995e83505cc290df9384d6e5412b207b79ce6bdff89a10505ed2cba" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", - "arrow-select 51.0.0", - "atoi", - "base64 0.22.1", - "chrono", - "half", - "lexical-core", - "num 0.4.1", - "ryu", -] - [[package]] name = "arrow-csv" version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43ef855dc6b126dc197f43e061d4de46b9d4c033aa51c2587657f7508242cef1" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-cast 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "chrono", - "csv", - "csv-core", - "lazy_static", - "lexical-core", - "regex", -] - -[[package]] -name = "arrow-csv" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95cbcba196b862270bf2a5edb75927380a7f3a163622c61d40cbba416a6305f2" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-cast 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "chrono", "csv", "csv-core", @@ -389,20 +287,8 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" dependencies = [ - "arrow-buffer 47.0.0", - "arrow-schema 47.0.0", - "half", - "num 0.4.1", -] - -[[package]] -name = "arrow-data" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2742ac1f6650696ab08c88f6dd3f0eb68ce10f8c253958a18c943a68cd04aec5" -dependencies = [ - "arrow-buffer 51.0.0", - "arrow-schema 
51.0.0", + "arrow-buffer", + "arrow-schema", "half", "num 0.4.1", ] @@ -413,25 +299,11 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-cast 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "flatbuffers", -] - -[[package]] -name = "arrow-ipc" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a42ea853130f7e78b9b9d178cb4cd01dee0f78e64d96c2949dc0a915d6d9e19d" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-cast 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "flatbuffers", ] @@ -441,31 +313,11 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f03d7e3b04dd688ccec354fe449aed56b831679f03e44ee2c1cfc4045067b69c" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-cast 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "chrono", - "half", - "indexmap 2.1.0", - "lexical-core", - "num 0.4.1", - "serde", - "serde_json", -] - -[[package]] -name = "arrow-json" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaafb5714d4e59feae964714d724f880511500e3569cc2a94d02456b403a2a49" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-cast 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "chrono", "half", "indexmap 2.1.0", @@ -481,26 +333,11 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-data 
47.0.0", - "arrow-schema 47.0.0", - "arrow-select 47.0.0", - "half", - "num 0.4.1", -] - -[[package]] -name = "arrow-ord" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3e6b61e3dc468f503181dccc2fc705bdcc5f2f146755fa5b56d0a6c5943f412" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", - "arrow-select 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "half", "num 0.4.1", ] @@ -512,25 +349,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22" dependencies = [ "ahash", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "half", - "hashbrown 0.14.3", -] - -[[package]] -name = "arrow-row" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "848ee52bb92eb459b811fb471175ea3afcf620157674c8794f539838920f9228" -dependencies = [ - "ahash", - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", "hashbrown 0.14.3", ] @@ -540,12 +362,6 @@ name = "arrow-schema" version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" - -[[package]] -name = "arrow-schema" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" dependencies = [ "serde", ] @@ -557,24 +373,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108" dependencies = [ "ahash", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 
47.0.0", - "num 0.4.1", -] - -[[package]] -name = "arrow-select" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "849524fa70e0e3c5ab58394c770cb8f514d0122d20de08475f7b472ed8075830" -dependencies = [ - "ahash", - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num 0.4.1", ] @@ -584,37 +386,20 @@ version = "47.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4cebbb282d6b9244895f4a9a912e55e57bce112554c7fa91fcec5459cb421ab" dependencies = [ - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-data 47.0.0", - "arrow-schema 47.0.0", - "arrow-select 47.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "num 0.4.1", "regex", "regex-syntax 0.7.5", ] -[[package]] -name = "arrow-string" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9373cb5a021aee58863498c37eb484998ef13377f69989c6c5ccfbd258236cdb" -dependencies = [ - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-data 51.0.0", - "arrow-schema 51.0.0", - "arrow-select 51.0.0", - "memchr", - "num 0.4.1", - "regex", - "regex-syntax 0.8.2", -] - [[package]] name = "arrow-tools" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#428e2375bdf0fb341d21b69aff81c15d9cb97d2d" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#ceafc8e09e9acda077fa5be7c549ec024f005559" [[package]] name = "assert_approx_eq" @@ -677,15 +462,6 @@ dependencies = [ "syn 2.0.48", ] -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - [[package]] name = "atty" version = "0.2.14" @@ -1023,12 +799,6 @@ version = 
"0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - [[package]] name = "base64-simd" version = "0.8.0" @@ -1525,13 +1295,13 @@ dependencies = [ [[package]] name = "csv2parquet" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#428e2375bdf0fb341d21b69aff81c15d9cb97d2d" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#ceafc8e09e9acda077fa5be7c549ec024f005559" dependencies = [ - "arrow 51.0.0", - "arrow-schema 51.0.0", + "arrow", + "arrow-schema", "arrow-tools", "clap 4.5.4", - "parquet 51.0.0", + "parquet", "serde_json", ] @@ -1607,9 +1377,9 @@ checksum = "7014432223f4d721cb9786cd88bb89e7464e0ba984d4a7f49db7787f5f268674" dependencies = [ "ahash", "apache-avro", - "arrow 47.0.0", - "arrow-array 47.0.0", - "arrow-schema 47.0.0", + "arrow", + "arrow-array", + "arrow-schema", "async-compression", "async-trait", "bytes", @@ -1635,7 +1405,7 @@ dependencies = [ "num_cpus", "object_store", "parking_lot", - "parquet 47.0.0", + "parquet", "percent-encoding", "pin-project-lite", "rand 0.8.5", @@ -1657,15 +1427,15 @@ checksum = "cb3903ed8f102892f17b48efa437f3542159241d41c564f0d1e78efdc5e663aa" dependencies = [ "ahash", "apache-avro", - "arrow 47.0.0", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-schema 47.0.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", "chrono", "half", "num_cpus", "object_store", - "parquet 47.0.0", + "parquet", "sqlparser", ] @@ -1675,7 +1445,7 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "780b73b2407050e53f51a9781868593f694102c59e622de9a8aafc0343c4f237" dependencies = [ - "arrow 47.0.0", + "arrow", "chrono", 
"dashmap", "datafusion-common", @@ -1697,8 +1467,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24c382676338d8caba6c027ba0da47260f65ffedab38fda78f6d8043f607557c" dependencies = [ "ahash", - "arrow 47.0.0", - "arrow-array 47.0.0", + "arrow", + "arrow-array", "datafusion-common", "sqlparser", "strum", @@ -1709,7 +1479,7 @@ dependencies = [ name = "datafusion-optd-cli" version = "32.0.0" dependencies = [ - "arrow 47.0.0", + "arrow", "assert_cmd", "async-trait", "aws-config", @@ -1740,7 +1510,7 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f2904a432f795484fd45e29ded4537152adb60f636c05691db34fcd94c92c96" dependencies = [ - "arrow 47.0.0", + "arrow", "async-trait", "chrono", "datafusion-common", @@ -1759,11 +1529,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57b4968e9a998dc0476c4db7a82f280e2026b25f464e4aa0c3bb9807ee63ddfd" dependencies = [ "ahash", - "arrow 47.0.0", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-schema 47.0.0", - "base64 0.21.5", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "base64", "blake2", "blake3", "chrono", @@ -1793,10 +1563,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efd0d1fe54e37a47a2d58a1232c22786f2c28ad35805fdcd08f0253a8b0aaa90" dependencies = [ "ahash", - "arrow 47.0.0", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-schema 47.0.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", "async-trait", "chrono", "datafusion-common", @@ -1823,8 +1593,8 @@ version = "32.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b568d44c87ead99604d704f942e257c8a236ee1bbf890ee3e034ad659dcb2c21" dependencies = [ - "arrow 47.0.0", - "arrow-schema 47.0.0", + "arrow", + "arrow-schema", "datafusion-common", "datafusion-expr", "log", @@ -2741,15 +2511,6 @@ dependencies = [ "libc", ] -[[package]] -name = "lz4_flex" 
-version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" -dependencies = [ - "twox-hash", -] - [[package]] name = "lzma-sys" version = "0.1.20" @@ -3009,7 +2770,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" dependencies = [ "async-trait", - "base64 0.21.5", + "base64", "bytes", "chrono", "futures", @@ -3070,7 +2831,7 @@ name = "optd-core" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema 47.0.0", + "arrow-schema", "chrono", "itertools 0.11.0", "num-derive", @@ -3087,7 +2848,7 @@ name = "optd-datafusion-bridge" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema 47.0.0", + "arrow-schema", "async-recursion", "async-trait", "datafusion", @@ -3105,7 +2866,7 @@ name = "optd-datafusion-repr" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema 47.0.0", + "arrow-schema", "assert_approx_eq", "async-trait", "bincode", @@ -3149,6 +2910,7 @@ name = "optd-perftest" version = "0.1.0" dependencies = [ "anyhow", + "arrow-schema", "assert_cmd", "async-trait", "clap 4.5.4", @@ -3164,7 +2926,7 @@ dependencies = [ "optd-datafusion-bridge", "optd-datafusion-repr", "optd-gungnir", - "parquet 47.0.0", + "parquet", "prettytable-rs", "rayon", "regex", @@ -3274,14 +3036,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" dependencies = [ "ahash", - "arrow-array 47.0.0", - "arrow-buffer 47.0.0", - "arrow-cast 47.0.0", - "arrow-data 47.0.0", - "arrow-ipc 47.0.0", - "arrow-schema 47.0.0", - "arrow-select 47.0.0", - "base64 0.21.5", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", "brotli", "bytes", "chrono", @@ -3301,38 +3063,6 @@ dependencies = [ "zstd 0.12.4", ] -[[package]] -name = 
"parquet" -version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "096795d4f47f65fd3ee1ec5a98b77ab26d602f2cc785b0e4be5443add17ecc32" -dependencies = [ - "ahash", - "arrow-array 51.0.0", - "arrow-buffer 51.0.0", - "arrow-cast 51.0.0", - "arrow-data 51.0.0", - "arrow-ipc 51.0.0", - "arrow-schema 51.0.0", - "arrow-select 51.0.0", - "base64 0.22.1", - "brotli", - "bytes", - "chrono", - "flate2", - "half", - "hashbrown 0.14.3", - "lz4_flex", - "num 0.4.1", - "num-bigint 0.4.4", - "paste", - "seq-macro", - "snap", - "thrift", - "twox-hash", - "zstd 0.13.0", -] - [[package]] name = "parse-zoneinfo" version = "0.3.0" @@ -3446,7 +3176,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" dependencies = [ - "base64 0.21.5", + "base64", "byteorder", "bytes", "fallible-iterator", @@ -3850,7 +3580,7 @@ version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ - "base64 0.21.5", + "base64", "bytes", "encoding_rs", "futures-core", @@ -4017,7 +3747,7 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "base64 0.21.5", + "base64", ] [[package]] @@ -4183,7 +3913,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee80b0e361bbf88fd2f6e242ccd19cfda072cb0faa6ae694ecee08199938569a" dependencies = [ - "base64 0.21.5", + "base64", "chrono", "hex", "indexmap 1.9.3", diff --git a/optd-perftest/Cargo.toml b/optd-perftest/Cargo.toml index 0e8427cb..58137a34 100644 --- a/optd-perftest/Cargo.toml +++ b/optd-perftest/Cargo.toml @@ -48,6 +48,7 @@ test-case = "3.3" rayon = "1.10" parquet = "47.0.0" csv2parquet = { git = 
"https://github.com/wangpatrick57/arrow-tools.git", branch = "main" } +arrow-schema = { version = "47.0.0", features = ["serde"] } [dev_dependencies] diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index ccae4d8d..e904aa23 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -24,6 +24,7 @@ use datafusion::{ }; use datafusion_optd_cli::helper::unescape_input; +use itertools::Itertools; use lazy_static::lazy_static; use optd_datafusion_bridge::{DatafusionCatalog, OptdQueryPlanner}; use optd_datafusion_repr::{ @@ -322,7 +323,7 @@ impl DatafusionDBMS { match benchmark { Benchmark::Tpch(_) => { let tpch_kit = TpchKit::build(&self.workspace_dpath)?; - self.create_tpch_tables(&tpch_kit).await?; + Self::create_tpch_tables(self.get_ctx(), &tpch_kit).await?; } Benchmark::Job(_) | Benchmark::Joblight(_) => { let job_kit = JobKit::build(&self.workspace_dpath)?; @@ -332,7 +333,7 @@ impl DatafusionDBMS { Ok(()) } - async fn create_tpch_tables(&mut self, tpch_kit: &TpchKit) -> anyhow::Result<()> { + async fn create_tpch_tables(ctx: &SessionContext, tpch_kit: &TpchKit) -> anyhow::Result<()> { let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?; let ddls = ddls .split(';') @@ -340,7 +341,7 @@ impl DatafusionDBMS { .filter(|s| !s.is_empty()) .collect::>(); for ddl in ddls { - Self::execute(self.get_ctx(), ddl).await?; + Self::execute(ctx, ddl).await?; } Ok(()) } @@ -367,7 +368,7 @@ impl DatafusionDBMS { tpch_kit.gen_tables(tpch_kit_config)?; // Create the tables. - self.create_tpch_tables(&tpch_kit).await?; + Self::create_tpch_tables(self.get_ctx(), &tpch_kit).await?; // Load the data by creating an external table first and copying the data to real tables. 
let tbl_fpath_iter = tpch_kit.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap(); @@ -394,6 +395,10 @@ impl DatafusionDBMS { .await .unwrap() .schema(); + + // DEBUG(phw2) + println!("schema={}", serde_json::to_string_pretty(&schema).unwrap()); + let projection_list = (1..=schema.fields().len()) .map(|i| format!("column_{}", i)) .collect::>() @@ -476,7 +481,34 @@ impl DatafusionDBMS { println!("Total execution time {:?}...", now.elapsed()); - Ok(base_table_stats.into_inner()?) + let stats = base_table_stats.into_inner(); + let l = stats.unwrap(); + l.iter().for_each(|(table_name, stats)| { + println!("Table: {} (num_rows: {})", table_name, stats.row_cnt); + stats + .column_comb_stats + .iter() + .sorted_by_key(|x| x.0[0]) + .for_each(|x| { + let sum_freq: f64 = x.1.mcvs.frequencies().iter().map(|(_, v)| *v).sum(); + println!( + "Col: {} (n_distinct: {}) (n_frac: {}) (mcvs: {} {}) (tdigests: {:?} {:?} {:?} {:?} {:?})", + x.0[0], + x.1.ndistinct, + x.1.null_frac, + x.1.mcvs.frequencies().len(), + sum_freq, + x.1.distr.as_ref().map(|d| d.quantile(0.01)), + x.1.distr.as_ref().map(|d| d.quantile(0.25)), + x.1.distr.as_ref().map(|d| d.quantile(0.50)), + x.1.distr.as_ref().map(|d| d.quantile(0.75)), + x.1.distr.as_ref().map(|d| d.quantile(0.99)), + ); + }); + }); + // println!("{:#?}", stats); + + Ok(l) } // Load job data from a .csv file. @@ -524,25 +556,22 @@ impl DatafusionDBMS { &mut self, tpch_kit_config: &TpchKitConfig, ) -> anyhow::Result { - // Generate the tables - let tpch_kit = TpchKit::build(&self.workspace_dpath)?; - tpch_kit.gen_tables(tpch_kit_config)?; - tpch_kit.make_parquet_files(tpch_kit_config)?; - - // To get the schema of each table. + // Create tables in a temporary context to get the schema provider. 
let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_TPCH).await?; - let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?; - let ddls = ddls - .split(';') - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - .collect::>(); - for ddl in ddls { - Self::execute(&ctx, ddl).await?; - } + let tpch_kit = TpchKit::build(&self.workspace_dpath)?; + Self::create_tpch_tables(&ctx, &tpch_kit).await?; + let schema_provider = ctx + .catalog("datafusion") + .unwrap() + .schema("public") + .unwrap(); + // Generate the tables + tpch_kit.gen_tables(tpch_kit_config)?; + tpch_kit.make_parquet_files(tpch_kit_config, schema_provider).await?; // Compute base statistics on Parquet. let tbl_paths = tpch_kit.get_tbl_fpath_vec(tpch_kit_config, "parquet")?; + assert!(tbl_paths.len() == tpch_kit.get_tbl_fpath_vec(tpch_kit_config, "tbl")?.len()); Self::gen_base_stats(tbl_paths) } @@ -550,10 +579,20 @@ impl DatafusionDBMS { &mut self, job_kit_config: &JobKitConfig, ) -> anyhow::Result { + // Create tables in a temporary context to get the schema provider. + let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; + let job_kit = JobKit::build(&self.workspace_dpath)?; + Self::create_job_tables(&ctx, &job_kit).await?; + let schema_provider = ctx + .catalog("datafusion") + .unwrap() + .schema("public") + .unwrap(); + // Generate the tables. let job_kit = JobKit::build(&self.workspace_dpath)?; job_kit.download_tables(job_kit_config)?; - job_kit.make_parquet_files(job_kit_config)?; + job_kit.make_parquet_files(job_kit_config, schema_provider).await?; // To get the schema of each table. let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; @@ -569,6 +608,7 @@ impl DatafusionDBMS { // Compute base statistics on Parquet. 
let tbl_paths = job_kit.get_tbl_fpath_vec("parquet").unwrap(); + assert!(tbl_paths.len() == job_kit.get_tbl_fpath_vec("csv")?.len()); Self::gen_base_stats(tbl_paths) } } diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index e4e64e38..53c674e0 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -1,3 +1,5 @@ +use csv2parquet::Opts; +use datafusion::catalog::schema::SchemaProvider; /// A wrapper around job-kit use serde::{Deserialize, Serialize}; @@ -7,6 +9,7 @@ use std::fs; use std::fs::File; use std::io; use std::path::{Path, PathBuf}; +use std::sync::Arc; const JOB_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/job-kit.git"; const JOB_TABLES_URL: &str = "https://homepages.cwi.nl/~boncz/job/imdb.tgz"; @@ -126,7 +129,25 @@ impl JobKit { Ok(()) } - pub fn make_parquet_files(&self, job_kit_config: &JobKitConfig) -> io::Result<()> { + pub async fn make_parquet_files(&self, job_kit_config: &JobKitConfig, schema_provider: Arc) -> io::Result<()> { + let done_fpath = self.downloaded_tables_dpath.join("make_parquet_done"); + if !done_fpath.exists() { + log::debug!("[start] making parquet for {}", job_kit_config); + for csv_tbl_fpath in self.get_tbl_fpath_vec("csv").unwrap() { + let tbl_name = Self::get_tbl_name_from_tbl_fpath(&csv_tbl_fpath); + let schema = schema_provider.table(&tbl_name).await.unwrap().schema(); + let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); + parquet_tbl_fpath.set_extension("parquet"); + let mut opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath.clone()); + opts.delimiter = ','; + opts.schema = Some(schema.as_ref().clone()); + csv2parquet::convert(opts).unwrap(); + } + File::create(done_fpath)?; + log::debug!("[end] making parquet for {}", job_kit_config); + } else { + log::debug!("[skip] making parquet for {}", job_kit_config); + } Ok(()) } diff --git a/optd-perftest/src/main.rs b/optd-perftest/src/main.rs index ebf2184e..69772f3b 100644 --- a/optd-perftest/src/main.rs +++ b/optd-perftest/src/main.rs @@ 
-5,8 +5,9 @@ use optd_perftest::job::JobKitConfig; use optd_perftest::shell; use optd_perftest::tpch::{TpchKitConfig, TPCH_KIT_POSTGRES}; use optd_perftest::{cardtest, job, tpch}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use prettytable::{format, Table}; -use std::fs; +use std::fs::{self, File}; use std::path::Path; #[derive(Parser)] diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index 9f0cf9c8..3c07303a 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -1,5 +1,7 @@ /// A wrapper around tpch-kit use csv2parquet::Opts; +use datafusion::catalog::schema::SchemaProvider; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use serde::{Deserialize, Serialize}; use crate::shell; @@ -10,6 +12,7 @@ use std::fs; use std::fs::File; use std::io; use std::path::{Path, PathBuf}; +use std::sync::Arc; const TPCH_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/tpch-kit.git"; pub const TPCH_KIT_POSTGRES: &str = "POSTGRESQL"; @@ -147,17 +150,27 @@ impl TpchKit { Ok(()) } - pub fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig) -> io::Result<()> { - // let csv_tbl_fpaths = self.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap(); + pub async fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig, schema_provider: Arc) -> io::Result<()> { + let this_genned_tables_dpath = self.get_this_genned_tables_dpath(tpch_kit_config); + let done_fpath = this_genned_tables_dpath.join("make_parquet_done"); - // for csv_tbl_fpath in csv_tbl_fpaths { - // let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); - // parquet_tbl_fpath.set_extension("parquet"); - // println!("csv_tbl_fpath={:?}, parquet_tbl_fpath={:?}", csv_tbl_fpath, parquet_tbl_fpath); - // let mut opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath); - // opts.delimiter = '|'; - // csv2parquet::convert(opts).unwrap(); - // } + if !done_fpath.exists() { + log::debug!("[start] making parquet for {}", tpch_kit_config); + for 
csv_tbl_fpath in self.get_tbl_fpath_vec(tpch_kit_config, "tbl").unwrap() { + let tbl_name = Self::get_tbl_name_from_tbl_fpath(&csv_tbl_fpath); + let schema = schema_provider.table(&tbl_name).await.unwrap().schema(); + let mut parquet_tbl_fpath = csv_tbl_fpath.clone(); + parquet_tbl_fpath.set_extension("parquet"); + let mut opts = Opts::new(csv_tbl_fpath, parquet_tbl_fpath.clone()); + opts.delimiter = '|'; + opts.schema = Some(schema.as_ref().clone()); + csv2parquet::convert(opts).unwrap(); + } + File::create(done_fpath)?; + log::debug!("[end] making parquet for {}", tpch_kit_config); + } else { + log::debug!("[skip] making parquet for {}", tpch_kit_config); + } Ok(()) } From 7709a8a5a365bbfd16e1e0115aefd4a8fb534d40 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 18:40:26 -0400 Subject: [PATCH 5/9] lock --- Cargo.lock | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad909048..125aee21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -399,7 +399,7 @@ dependencies = [ [[package]] name = "arrow-tools" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#ceafc8e09e9acda077fa5be7c549ec024f005559" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#c04460346d808268e7811b212212c3442428330c" [[package]] name = "assert_approx_eq" @@ -1295,13 +1295,14 @@ dependencies = [ [[package]] name = "csv2parquet" version = "0.18.0" -source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#ceafc8e09e9acda077fa5be7c549ec024f005559" +source = "git+https://github.com/wangpatrick57/arrow-tools.git?branch=main#c04460346d808268e7811b212212c3442428330c" dependencies = [ "arrow", "arrow-schema", "arrow-tools", "clap 4.5.4", "parquet", + "regex", "serde_json", ] From 6a2358588965c1513f1de35f8e9c83b374c0a33e Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 18:42:02 -0400 Subject: [PATCH 6/9] fmt and clip --- 
optd-perftest/src/datafusion_dbms.rs | 24 ++++++++++-------------- optd-perftest/src/job.rs | 6 +++++- optd-perftest/src/main.rs | 3 +-- optd-perftest/src/tpch.rs | 7 +++++-- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index e904aa23..9052c37d 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -490,14 +490,14 @@ impl DatafusionDBMS { .iter() .sorted_by_key(|x| x.0[0]) .for_each(|x| { - let sum_freq: f64 = x.1.mcvs.frequencies().iter().map(|(_, v)| *v).sum(); + let sum_freq: f64 = x.1.mcvs.frequencies().values().copied().sum(); println!( "Col: {} (n_distinct: {}) (n_frac: {}) (mcvs: {} {}) (tdigests: {:?} {:?} {:?} {:?} {:?})", x.0[0], x.1.ndistinct, x.1.null_frac, x.1.mcvs.frequencies().len(), - sum_freq, + sum_freq, x.1.distr.as_ref().map(|d| d.quantile(0.01)), x.1.distr.as_ref().map(|d| d.quantile(0.25)), x.1.distr.as_ref().map(|d| d.quantile(0.50)), @@ -560,15 +560,13 @@ impl DatafusionDBMS { let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_TPCH).await?; let tpch_kit = TpchKit::build(&self.workspace_dpath)?; Self::create_tpch_tables(&ctx, &tpch_kit).await?; - let schema_provider = ctx - .catalog("datafusion") - .unwrap() - .schema("public") - .unwrap(); + let schema_provider = ctx.catalog("datafusion").unwrap().schema("public").unwrap(); // Generate the tables tpch_kit.gen_tables(tpch_kit_config)?; - tpch_kit.make_parquet_files(tpch_kit_config, schema_provider).await?; + tpch_kit + .make_parquet_files(tpch_kit_config, schema_provider) + .await?; // Compute base statistics on Parquet. 
let tbl_paths = tpch_kit.get_tbl_fpath_vec(tpch_kit_config, "parquet")?; assert!(tbl_paths.len() == tpch_kit.get_tbl_fpath_vec(tpch_kit_config, "tbl")?.len()); @@ -583,16 +581,14 @@ impl DatafusionDBMS { let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; let job_kit = JobKit::build(&self.workspace_dpath)?; Self::create_job_tables(&ctx, &job_kit).await?; - let schema_provider = ctx - .catalog("datafusion") - .unwrap() - .schema("public") - .unwrap(); + let schema_provider = ctx.catalog("datafusion").unwrap().schema("public").unwrap(); // Generate the tables. let job_kit = JobKit::build(&self.workspace_dpath)?; job_kit.download_tables(job_kit_config)?; - job_kit.make_parquet_files(job_kit_config, schema_provider).await?; + job_kit + .make_parquet_files(job_kit_config, schema_provider) + .await?; // To get the schema of each table. let ctx = Self::new_session_ctx(None, self.adaptive, WITH_LOGICAL_FOR_JOB).await?; diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index 53c674e0..dd256f0a 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -129,7 +129,11 @@ impl JobKit { Ok(()) } - pub async fn make_parquet_files(&self, job_kit_config: &JobKitConfig, schema_provider: Arc<dyn SchemaProvider>) -> io::Result<()> { + pub async fn make_parquet_files( + &self, + job_kit_config: &JobKitConfig, + schema_provider: Arc<dyn SchemaProvider>, + ) -> io::Result<()> { let done_fpath = self.downloaded_tables_dpath.join("make_parquet_done"); if !done_fpath.exists() { log::debug!("[start] making parquet for {}", job_kit_config); diff --git a/optd-perftest/src/main.rs b/optd-perftest/src/main.rs index 69772f3b..ebf2184e 100644 --- a/optd-perftest/src/main.rs +++ b/optd-perftest/src/main.rs @@ -5,9 +5,8 @@ use optd_perftest::job::JobKitConfig; use optd_perftest::shell; use optd_perftest::tpch::{TpchKitConfig, TPCH_KIT_POSTGRES}; use optd_perftest::{cardtest, job, tpch}; -use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use prettytable::{format,
Table}; -use std::fs::{self, File}; +use std::fs; use std::path::Path; #[derive(Parser)] diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index 3c07303a..95bac5d0 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -1,7 +1,6 @@ /// A wrapper around tpch-kit use csv2parquet::Opts; use datafusion::catalog::schema::SchemaProvider; -use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use serde::{Deserialize, Serialize}; use crate::shell; @@ -150,7 +149,11 @@ impl TpchKit { Ok(()) } - pub async fn make_parquet_files(&self, tpch_kit_config: &TpchKitConfig, schema_provider: Arc<dyn SchemaProvider>) -> io::Result<()> { + pub async fn make_parquet_files( + &self, + tpch_kit_config: &TpchKitConfig, + schema_provider: Arc<dyn SchemaProvider>, + ) -> io::Result<()> { let this_genned_tables_dpath = self.get_this_genned_tables_dpath(tpch_kit_config); let done_fpath = this_genned_tables_dpath.join("make_parquet_done"); From b10de9fbb9beef40982d596986e72cbda1e0e9bb Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 19:00:29 -0400 Subject: [PATCH 7/9] cmt --- optd-perftest/src/datafusion_dbms.rs | 48 ++++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/optd-perftest/src/datafusion_dbms.rs b/optd-perftest/src/datafusion_dbms.rs index 9052c37d..ccae7439 100644 --- a/optd-perftest/src/datafusion_dbms.rs +++ b/optd-perftest/src/datafusion_dbms.rs @@ -24,7 +24,6 @@ use datafusion::{ }; use datafusion_optd_cli::helper::unescape_input; -use itertools::Itertools; use lazy_static::lazy_static; use optd_datafusion_bridge::{DatafusionCatalog, OptdQueryPlanner}; use optd_datafusion_repr::{ @@ -483,29 +482,30 @@ impl DatafusionDBMS { let stats = base_table_stats.into_inner(); let l = stats.unwrap(); - l.iter().for_each(|(table_name, stats)| { - println!("Table: {} (num_rows: {})", table_name, stats.row_cnt); - stats - .column_comb_stats - .iter() - .sorted_by_key(|x| x.0[0]) - .for_each(|x| { - let sum_freq: f64 =
x.1.mcvs.frequencies().values().copied().sum(); - println!( - "Col: {} (n_distinct: {}) (n_frac: {}) (mcvs: {} {}) (tdigests: {:?} {:?} {:?} {:?} {:?})", - x.0[0], - x.1.ndistinct, - x.1.null_frac, - x.1.mcvs.frequencies().len(), - sum_freq, - x.1.distr.as_ref().map(|d| d.quantile(0.01)), - x.1.distr.as_ref().map(|d| d.quantile(0.25)), - x.1.distr.as_ref().map(|d| d.quantile(0.50)), - x.1.distr.as_ref().map(|d| d.quantile(0.75)), - x.1.distr.as_ref().map(|d| d.quantile(0.99)), - ); - }); - }); + // Useful for debugging stats so I kept it + // l.iter().for_each(|(table_name, stats)| { + // println!("Table: {} (num_rows: {})", table_name, stats.row_cnt); + // stats + // .column_comb_stats + // .iter() + // .sorted_by_key(|x| x.0[0]) + // .for_each(|x| { + // let sum_freq: f64 = x.1.mcvs.frequencies().values().copied().sum(); + // println!( + // "Col: {} (n_distinct: {}) (n_frac: {}) (mcvs: {} {}) (tdigests: {:?} {:?} {:?} {:?} {:?})", + // x.0[0], + // x.1.ndistinct, + // x.1.null_frac, + // x.1.mcvs.frequencies().len(), + // sum_freq, + // x.1.distr.as_ref().map(|d| d.quantile(0.01)), + // x.1.distr.as_ref().map(|d| d.quantile(0.25)), + // x.1.distr.as_ref().map(|d| d.quantile(0.50)), + // x.1.distr.as_ref().map(|d| d.quantile(0.75)), + // x.1.distr.as_ref().map(|d| d.quantile(0.99)), + // ); + // }); + // }); // println!("{:#?}", stats); Ok(l) From d3380c4ced8cb55bd1dcfc849a19f07365c1be19 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 21:29:21 -0400 Subject: [PATCH 8/9] updated working queries --- dev_scripts/which_queries_work.sh | 3 +-- .../src/cost/base_cost/filter.rs | 1 + optd-perftest/src/job.rs | 19 ++----------------- optd-perftest/src/tpch.rs | 3 +-- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/dev_scripts/which_queries_work.sh b/dev_scripts/which_queries_work.sh index 5a6796d9..98ff623b 100755 --- a/dev_scripts/which_queries_work.sh +++ b/dev_scripts/which_queries_work.sh @@ -24,8 +24,7 @@ fi successful_ids=() IFS=',' 
for id in $all_ids; do - # make sure to execute with --adaptive so that we actually run the query in datafusion - cargo run --bin optd-perftest cardtest $benchmark_name --query-ids $id --adaptive &>/dev/null + cargo run --release --bin optd-perftest cardtest $benchmark_name --query-ids $id &>/dev/null if [ $? -eq 0 ]; then echo >&2 $id succeeded diff --git a/optd-datafusion-repr/src/cost/base_cost/filter.rs b/optd-datafusion-repr/src/cost/base_cost/filter.rs index 20f7d1bd..75e1722f 100644 --- a/optd-datafusion-repr/src/cost/base_cost/filter.rs +++ b/optd-datafusion-repr/src/cost/base_cost/filter.rs @@ -1,5 +1,6 @@ use std::ops::Bound; +use num_traits::clamp; use optd_core::{ cascades::{CascadesOptimizer, RelNodeContext}, cost::Cost, diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index dd256f0a..cfd91689 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -13,23 +13,8 @@ use std::sync::Arc; const JOB_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/job-kit.git"; const JOB_TABLES_URL: &str = "https://homepages.cwi.nl/~boncz/job/imdb.tgz"; -pub const WORKING_JOB_QUERY_IDS: &[&str] = &[ - "1a", "1b", "1c", "1d", "2a", "2b", "2c", "2d", "3a", "3b", "3c", "4a", "4b", "4c", "5a", "5b", - "5c", "6a", "6b", "6c", "6d", "6e", "6f", "7b", "8a", "8b", "8c", "8d", "9b", "9c", "9d", - "10a", "10b", "10c", "12a", "12b", "12c", "13a", "13b", "13c", "13d", "14a", "14b", "14c", - "15a", "15b", "15c", "15d", "16a", "16b", "16c", "16d", "17a", "17b", "17c", "17d", "17e", - "17f", "18a", "18c", "19b", "19c", "19d", "20a", "20b", "20c", "22a", "22b", "22c", "22d", - "23a", "23b", "23c", "24a", "24b", "25a", "25b", "25c", "26a", "26b", "26c", "28a", "28b", - "28c", "29a", "29b", "29c", "30a", "30b", "30c", "31a", "31b", "31c", "32a", "32b", "33a", - "33b", "33c", -]; -pub const WORKING_JOBLIGHT_QUERY_IDS: &[&str] = &[ - "1a", "1b", "1c", "1d", "2a", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", "6a", "6b", "6c", - "6d", "7a", "7b", "7c", 
"8a", "8b", "8c", "9a", "9b", "10a", "10b", "10c", "11a", "11b", "11c", - "12a", "12b", "12c", "13a", "14a", "14b", "14c", "16a", "17a", "17b", "17c", "18a", "19b", - "20a", "20b", "20c", "21a", "21b", "22b", "23b", "24a", "24b", "25a", "26a", "26b", "27a", - "27b", -]; +pub const WORKING_JOB_QUERY_IDS: &[&str] = &["1a", "1b", "1c", "1d", "2a", "2b", "2d", "3a", "3b", "3c", "4a", "4b", "4c", "5c", "6a", "6b", "6c", "6d", "6e", "6f", "7b", "8a", "8b", "8c", "8d", "9b", "9c", "9d", "10a", "10c", "12a", "12b", "12c", "13a", "13b", "13c", "13d", "14a", "14b", "14c", "15a", "15b", "15c", "15d", "16a", "16b", "16c", "16d", "17a", "17b", "17c", "17d", "17e", "17f", "18a", "18c", "19b", "19c", "19d", "20a", "20b", "20c", "22a", "22b", "22c", "22d", "23a", "23b", "23c", "24a", "24b", "25a", "25b", "25c", "26a", "26b", "26c", "28a", "28b", "28c", "29a", "29b", "29c", "30a", "30b", "30c", "31a", "31b", "31c", "32b", "33a", "33b", "33c"]; +pub const WORKING_JOBLIGHT_QUERY_IDS: &[&str] = &["1a", "1b", "1c", "1d", "2a", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", "6a", "6b", "6c", "6d", "7a", "7b", "7c", "8a", "8b", "8c", "9a", "9b", "10a", "10b", "10c", "11a", "11b", "11c", "12a", "12b", "12c", "13a", "14a", "14b", "14c", "16a", "17a", "17b", "17c", "18a", "19b", "20a", "20b", "20c", "21a", "21b", "22b", "23b", "24a", "24b", "25a", "26a", "26b", "27a", "27b"]; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct JobKitConfig { diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index 95bac5d0..6cacae83 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -16,8 +16,7 @@ use std::sync::Arc; const TPCH_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/tpch-kit.git"; pub const TPCH_KIT_POSTGRES: &str = "POSTGRESQL"; const NUM_TPCH_QUERIES: usize = 22; -pub const WORKING_QUERY_IDS: &[&str] = - &["2", "3", "5", "7", "8", "9", "10", "12", "13", "14", "17"]; +pub const WORKING_QUERY_IDS: &[&str] = &["2", "3", "5", "6", "7", "8", "9", 
"10", "12", "13", "14", "17", "19"]; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct TpchKitConfig { From 1f49bbb09e05cef2e2053655b226e88eabcc2fad Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Wed, 1 May 2024 21:29:45 -0400 Subject: [PATCH 9/9] fmt clip --- .../src/cost/base_cost/filter.rs | 1 - optd-perftest/src/job.rs | 18 ++++++++++++++++-- optd-perftest/src/tpch.rs | 4 +++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/optd-datafusion-repr/src/cost/base_cost/filter.rs b/optd-datafusion-repr/src/cost/base_cost/filter.rs index 75e1722f..20f7d1bd 100644 --- a/optd-datafusion-repr/src/cost/base_cost/filter.rs +++ b/optd-datafusion-repr/src/cost/base_cost/filter.rs @@ -1,6 +1,5 @@ use std::ops::Bound; -use num_traits::clamp; use optd_core::{ cascades::{CascadesOptimizer, RelNodeContext}, cost::Cost, diff --git a/optd-perftest/src/job.rs b/optd-perftest/src/job.rs index cfd91689..da898cfe 100644 --- a/optd-perftest/src/job.rs +++ b/optd-perftest/src/job.rs @@ -13,8 +13,22 @@ use std::sync::Arc; const JOB_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/job-kit.git"; const JOB_TABLES_URL: &str = "https://homepages.cwi.nl/~boncz/job/imdb.tgz"; -pub const WORKING_JOB_QUERY_IDS: &[&str] = &["1a", "1b", "1c", "1d", "2a", "2b", "2d", "3a", "3b", "3c", "4a", "4b", "4c", "5c", "6a", "6b", "6c", "6d", "6e", "6f", "7b", "8a", "8b", "8c", "8d", "9b", "9c", "9d", "10a", "10c", "12a", "12b", "12c", "13a", "13b", "13c", "13d", "14a", "14b", "14c", "15a", "15b", "15c", "15d", "16a", "16b", "16c", "16d", "17a", "17b", "17c", "17d", "17e", "17f", "18a", "18c", "19b", "19c", "19d", "20a", "20b", "20c", "22a", "22b", "22c", "22d", "23a", "23b", "23c", "24a", "24b", "25a", "25b", "25c", "26a", "26b", "26c", "28a", "28b", "28c", "29a", "29b", "29c", "30a", "30b", "30c", "31a", "31b", "31c", "32b", "33a", "33b", "33c"]; -pub const WORKING_JOBLIGHT_QUERY_IDS: &[&str] = &["1a", "1b", "1c", "1d", "2a", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", 
"6a", "6b", "6c", "6d", "7a", "7b", "7c", "8a", "8b", "8c", "9a", "9b", "10a", "10b", "10c", "11a", "11b", "11c", "12a", "12b", "12c", "13a", "14a", "14b", "14c", "16a", "17a", "17b", "17c", "18a", "19b", "20a", "20b", "20c", "21a", "21b", "22b", "23b", "24a", "24b", "25a", "26a", "26b", "27a", "27b"]; +pub const WORKING_JOB_QUERY_IDS: &[&str] = &[ + "1a", "1b", "1c", "1d", "2a", "2b", "2d", "3a", "3b", "3c", "4a", "4b", "4c", "5c", "6a", "6b", + "6c", "6d", "6e", "6f", "7b", "8a", "8b", "8c", "8d", "9b", "9c", "9d", "10a", "10c", "12a", + "12b", "12c", "13a", "13b", "13c", "13d", "14a", "14b", "14c", "15a", "15b", "15c", "15d", + "16a", "16b", "16c", "16d", "17a", "17b", "17c", "17d", "17e", "17f", "18a", "18c", "19b", + "19c", "19d", "20a", "20b", "20c", "22a", "22b", "22c", "22d", "23a", "23b", "23c", "24a", + "24b", "25a", "25b", "25c", "26a", "26b", "26c", "28a", "28b", "28c", "29a", "29b", "29c", + "30a", "30b", "30c", "31a", "31b", "31c", "32b", "33a", "33b", "33c", +]; +pub const WORKING_JOBLIGHT_QUERY_IDS: &[&str] = &[ + "1a", "1b", "1c", "1d", "2a", "3a", "3b", "4a", "4b", "4c", "5a", "5b", "5c", "6a", "6b", "6c", + "6d", "7a", "7b", "7c", "8a", "8b", "8c", "9a", "9b", "10a", "10b", "10c", "11a", "11b", "11c", + "12a", "12b", "12c", "13a", "14a", "14b", "14c", "16a", "17a", "17b", "17c", "18a", "19b", + "20a", "20b", "20c", "21a", "21b", "22b", "23b", "24a", "24b", "25a", "26a", "26b", "27a", + "27b", +]; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct JobKitConfig { diff --git a/optd-perftest/src/tpch.rs b/optd-perftest/src/tpch.rs index 6cacae83..58569200 100644 --- a/optd-perftest/src/tpch.rs +++ b/optd-perftest/src/tpch.rs @@ -16,7 +16,9 @@ use std::sync::Arc; const TPCH_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/tpch-kit.git"; pub const TPCH_KIT_POSTGRES: &str = "POSTGRESQL"; const NUM_TPCH_QUERIES: usize = 22; -pub const WORKING_QUERY_IDS: &[&str] = &["2", "3", "5", "6", "7", "8", "9", "10", "12", "13", "14", "17", "19"]; +pub 
const WORKING_QUERY_IDS: &[&str] = &[ + "2", "3", "5", "6", "7", "8", "9", "10", "12", "13", "14", "17", "19", +]; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct TpchKitConfig {