diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ad31a90eb1..39fc15bd7d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -12,7 +12,6 @@ permissions: jobs: bench: runs-on: ubuntu-latest-large - if: ${{ github.ref_name == 'develop' }} steps: - uses: actions/checkout@v4 with: @@ -28,6 +27,12 @@ jobs: - name: Setup bencher.dev CLI uses: bencherdev/bencher@main + # The compression benchmarks rely on DuckDB being installed to convert CSV to Parquet + - name: Install DuckDB + uses: opt-nc/setup-duckdb-action@v1.0.7 + with: + version: v1.0.0 + - name: Upload Vortex Benchmarks run: | bencher run \ @@ -38,4 +43,10 @@ jobs: "cargo bench -p bench-vortex" env: BENCHER_TOKEN: ${{ secrets.BENCHER_TOKEN }} + # AWS Credentials for R2 storage tests + AWS_BUCKET: vortex-test + AWS_ENDPOINT: ${{ secrets.AWS_ENDPOINT }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + diff --git a/Cargo.lock b/Cargo.lock index d62a57d3f6..e20aa8771a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -431,6 +431,7 @@ dependencies = [ "enum-iterator", "flexbuffers", "futures", + "homedir", "humansize", "indicatif", "itertools 0.13.0", @@ -445,6 +446,7 @@ dependencies = [ "reqwest", "serde", "simplelog", + "tar", "tokio", "uuid", "vortex-alp", @@ -1405,6 +1407,18 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.4.1", + "windows-sys 0.52.0", +] + [[package]] name = "fixedbitset" version = "0.4.2" @@ -1677,6 +1691,18 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "homedir" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bed305c13ce3829a09d627f5d43ff738482a09361ae4eb8039993b55fb10e5e" +dependencies = [ + "cfg-if", + "nix", + "widestring", + "windows", +] + [[package]] name = "http" version = "1.1.0" @@ -1817,7 +1843,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.52.0", ] [[package]] @@ -2538,7 +2564,7 @@ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.3", "smallvec", "windows-targets 0.52.6", ] @@ -3046,6 +3072,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_syscall" version = "0.5.3" @@ -3641,6 +3676,17 @@ dependencies = [ "libc", ] +[[package]] +name = "tar" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "target-lexicon" version = "0.12.15" @@ -4485,6 +4531,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "widestring" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7219d36b6eac893fa81e84ebe06485e7dcbb616177469b142df14f1f4deb1311" + [[package]] name = "winapi" version = "0.3.9" @@ -4516,6 +4568,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.57.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-core" version = "0.52.0" @@ -4525,6 +4587,49 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4757,6 +4862,17 @@ dependencies = [ "web-sys", ] +[[package]] +name = "xattr" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + [[package]] name = "xshell" version = "0.2.6" diff --git a/Cargo.toml b/Cargo.toml index f28eaa85ac..e4d019b361 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ 
futures-util = "0.3.30" getrandom = "0.2.14" half = { version = "^2", features = ["std", "num-traits"] } hashbrown = "0.14.3" +homedir = "0.3.3" humansize = "2.1.3" indicatif = "0.17.8" itertools = "0.13.0" @@ -104,6 +105,7 @@ serde = "1.0.197" serde_json = "1.0.116" serde_test = "1.0.176" simplelog = { version = "0.12.2", features = ["paris"] } +tar = "0.4" thiserror = "1.0.58" tokio = "1.37.0" uninit = "0.6.2" diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index 540f7fe503..33aca5c10e 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -26,6 +26,7 @@ datafusion = { workspace = true } enum-iterator = { workspace = true } flexbuffers = { workspace = true } futures = { workspace = true, features = ["executor"] } +homedir = { workspace = true } humansize = { workspace = true } indicatif = { workspace = true } itertools = { workspace = true } @@ -40,6 +41,7 @@ rayon = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } simplelog = { workspace = true } +tar = { workspace = true } tokio = { workspace = true, features = ["full"] } uuid = { workspace = true, features = ["v4"] } vortex-alp = { path = "../encodings/alp" } diff --git a/bench-vortex/src/tpch/dbgen.rs b/bench-vortex/src/tpch/dbgen.rs index 70684784ee..93de628aab 100644 --- a/bench-vortex/src/tpch/dbgen.rs +++ b/bench-vortex/src/tpch/dbgen.rs @@ -1,7 +1,12 @@ +use std::fmt::{Display, Formatter}; +use std::fs::File; +use std::io::copy; use std::path::{Path, PathBuf}; +use std::process::Command; use itertools::Itertools; -use xshell::{cmd, Shell}; +use tar::Archive; +use xshell::Shell; /// Download TPC-H data via Docker. @@ -15,6 +20,9 @@ pub struct DBGenOptions { /// Location on-disk to store generated files. pub base_dir: PathBuf, + + /// Location of where we may cache the dbgen tool download. 
+ pub cache_dir: PathBuf, } impl Default for DBGenOptions { @@ -27,6 +35,12 @@ impl Default for DBGenOptions { Self { scale_factor: 1, base_dir: std::env::current_dir().unwrap().join("data").join("tpch"), + cache_dir: homedir::my_home() + .unwrap() + .unwrap() + .join(".cache") + .join("vortex") + .join("dbgen"), } } } @@ -36,6 +50,7 @@ impl DBGenOptions { Self { base_dir: dir.as_ref().to_owned(), scale_factor: self.scale_factor, + cache_dir: self.cache_dir, } } } @@ -64,14 +79,27 @@ impl DBGen { return Ok(output_dir); } - let tpch_path = output_dir.canonicalize()?.to_string_lossy().to_string(); - - // Generate the files using Docker container. - cmd!( - sh, - "docker run --rm -v {tpch_path}:/data ghcr.io/scalytics/tpch-docker:main -s {scale_factor} -v -f" - ) - .run()?; + let dbgen_binary = get_cached_dbgen(&self.options.cache_dir)?; + let dists_file = dbgen_binary.parent().unwrap().join("dists.dss"); + + // Generate the files using our DBGen tool + let output = Command::new(dbgen_binary) + .current_dir(&output_dir) + .args(vec![ + "-b", + dists_file.into_os_string().into_string().unwrap().as_str(), + "-s", + scale_factor.as_str(), + "-f", + "-v", + ]) + .output()?; + + if !output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("dbgen failed: stdout=\"{stdout}\", stderr=\"{stderr}\""); + } // Every tpch .tbl file is a pipe-separated values, but for some strange reason, *also* includes // a trailing pipe at the end of every line. @@ -98,3 +126,80 @@ fn clean_trailing_pipes>(sh: &Shell, path: P) -> anyhow::Result<( Ok(()) } + +#[derive(Clone, Copy)] +enum Platform { + MacOS, + Linux, +} + +impl Display for Platform { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let str = match self { + Platform::MacOS => "macos", + Platform::Linux => "linux", + }; + write!(f, "{}", str) + } +} + +// Increment this when we release new tpch-dbgen. 
+const DBGEN_VERSION: &str = "0.1.0";
+
+/// Return the path to a `dbgen` binary for the host platform, fetching it into `cache_dir`
+/// when it is not already there.
+///
+/// # Errors
+///
+/// Returns an error when the host is neither macOS nor Linux, or when downloading or
+/// unpacking the release archive fails.
+fn get_cached_dbgen<P: AsRef<Path>>(cache_dir: P) -> anyhow::Result<PathBuf> {
+    let platform = if cfg!(target_os = "macos") {
+        Platform::MacOS
+    } else if cfg!(target_os = "linux") {
+        Platform::Linux
+    } else {
+        // Surface this through the `Result` the function already returns instead of panicking.
+        anyhow::bail!("unsupported platform, only linux and macos supported");
+    };
+
+    get_or_cache_toolchain(cache_dir.as_ref(), DBGEN_VERSION, platform)
+}
+
+/// Make sure the `dbgen` release for `version` and `platform` is unpacked under `cache_dir`,
+/// and return the path to the binary.
+///
+/// The release tarball is downloaded from the `spiraldb/tpch-dbgen` GitHub releases and
+/// extracted into `<cache_dir>/<version>/<platform>/`. When both the binary and the
+/// `dists.dss` file that `DBGen::generate` passes via `-b` are already present there, the
+/// download is skipped.
+///
+/// # Errors
+///
+/// Returns an error if the directory cannot be created, the HTTP request fails or answers
+/// with a non-success status, the archive cannot be written or read, or an entry is not
+/// extracted.
+fn get_or_cache_toolchain(
+    cache_dir: &Path,
+    version: &str,
+    platform: Platform,
+) -> anyhow::Result<PathBuf> {
+    let download_dir = dbgen_dir(cache_dir, version, platform);
+    let binary = dbgen_binary(cache_dir, version, platform);
+
+    // Reuse an earlier download; without this check every call re-fetches the tarball.
+    if binary.is_file() && download_dir.join("dists.dss").is_file() {
+        return Ok(binary);
+    }
+
+    std::fs::create_dir_all(&download_dir)?;
+
+    let url = format!("https://github.com/spiraldb/tpch-dbgen/releases/download/{version}/dbgen-{platform}-{version}.tar");
+
+    // Turn 4xx/5xx responses into errors so an error page is never saved as the archive.
+    let mut response = reqwest::blocking::get(url)?.error_for_status()?;
+
+    // The archive keeps the file name of the final (post-redirect) URL.
+    let archive_name = response
+        .url()
+        .path_segments()
+        .and_then(|segments| segments.last())
+        .ok_or_else(|| anyhow::anyhow!("dbgen download URL has no file name"))?;
+    let archive_path = download_dir.join(archive_name);
+
+    {
+        // Scope the writer so the file is closed before it is reopened for reading.
+        let mut file = File::create(&archive_path)?;
+        copy(&mut response, &mut file)?;
+    }
+
+    let mut archive = Archive::new(File::open(&archive_path)?);
+    for entry in archive.entries()? {
+        let mut entry = entry?;
+        // Per the `tar` crate docs, `unpack_in` yields `false` when it skips an entry
+        // (e.g. a path containing `..`); treat that as a failed extraction.
+        if !entry.unpack_in(&download_dir)? {
+            anyhow::bail!("failed to extract {:?} in {download_dir:?}", entry.path()?);
+        }
+    }
+
+    Ok(binary)
+}
+
+/// Directory holding the unpacked toolchain: `<cache_dir>/<version>/<platform>`.
+fn dbgen_dir(cache_dir: &Path, version: &str, platform: Platform) -> PathBuf {
+    cache_dir.join(version).join(platform.to_string())
+}
+
+/// Path of the `dbgen` executable inside the directory returned by `dbgen_dir`.
+fn dbgen_binary(cache_dir: &Path, version: &str, platform: Platform) -> PathBuf {
+    dbgen_dir(cache_dir, version, platform).join("dbgen")
+}