spiraldb · gatesn · Mar 27, 2024 · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -39,6 +39,7 @@ arrayref = "0.3.7"
 arrow = { version = "51.0.0", features = ["pyarrow"] }
 arrow-array = "51.0.0"
 arrow-buffer = "51.0.0"
+arrow-select = "51.0.0"
 arrow-schema = "51.0.0"
 bindgen = "0.69.4"
 criterion = { version = "0.5.1", features = ["html_reports"] }

diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml
@@ -16,19 +16,21 @@ workspace = true
 
 [dependencies]
 arrow-array = { workspace = true }
+arrow-select = { workspace = true }
+vortex-alp = { path = "../vortex-alp" }
 vortex-array = { path = "../vortex-array" }
 vortex-datetime = { path = "../vortex-datetime" }
-vortex-alp = { path = "../vortex-alp" }
 vortex-dict = { path = "../vortex-dict" }
+vortex-error = { path = "../vortex-error", features = ["parquet"] }
 vortex-fastlanes = { path = "../vortex-fastlanes" }
 vortex-ree = { path = "../vortex-ree" }
 vortex-roaring = { path = "../vortex-roaring" }
 vortex-schema = { path = "../vortex-schema" }
 vortex-zigzag = { path = "../vortex-zigzag" }
 itertools = { workspace = true }
-reqwest = { workspace = true }
-parquet = { workspace = true }
 log = { workspace = true }
+parquet = { workspace = true }
+reqwest = { workspace = true }
 simplelog = { workspace = true }
 
 [dev-dependencies]

diff --git a/bench-vortex/benches/compress_benchmark.rs b/bench-vortex/benches/compress_benchmark.rs
@@ -1,10 +1,10 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
 use bench_vortex::compress_taxi_data;
-use bench_vortex::taxi_data::download_taxi_data;
+use bench_vortex::taxi_data::taxi_data_parquet;
 
 fn vortex_compress(c: &mut Criterion) {
-    download_taxi_data();
+    taxi_data_parquet();
     let mut group = c.benchmark_group("end to end");
     group.sample_size(10);
     group.bench_function("compress", |b| b.iter(|| black_box(compress_taxi_data())));

diff --git a/bench-vortex/benches/random_access.rs b/bench-vortex/benches/random_access.rs
@@ -1,19 +1,22 @@
-use bench_vortex::taxi_data::{take_taxi_data, write_taxi_data};
+use bench_vortex::reader::take_vortex;
+use bench_vortex::taxi_data::taxi_data_vortex_compressed;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use itertools::Itertools;
-
-use vortex::array::ENCODINGS;
 
 fn random_access(c: &mut Criterion) {
-    let taxi_spiral = write_taxi_data();
+    let mut group = c.benchmark_group("random access");
+    // group.sample_size(10);
+
     let indices = [10, 11, 12, 13, 100_000, 3_000_000];
-    println!(
-        "ENCODINGS {:?}",
-        ENCODINGS.iter().map(|e| e.id()).collect_vec()
-    );
-    c.bench_function("random access", |b| {
-        b.iter(|| black_box(take_taxi_data(&taxi_spiral, &indices)))
+
+    let taxi_vortex = taxi_data_vortex_compressed();
+    group.bench_function("vortex", |b| {
+        b.iter(|| black_box(take_vortex(&taxi_vortex, &indices).unwrap()))
     });
+    //
+    // let taxi_parquet = taxi_data_parquet();
+    // group.bench_function("arrow", |b| {
+    //     b.iter(|| black_box(take_parquet(&taxi_parquet, &indices)?))
+    // });
 }
 
 criterion_group!(benches, random_access);

diff --git a/bench-vortex/src/bin/compress.rs b/bench-vortex/src/bin/compress.rs
@@ -0,0 +1,28 @@
+use bench_vortex::reader::{compress_vortex, open_vortex};
+use bench_vortex::setup_logger;
+use bench_vortex::taxi_data::taxi_data_parquet;
+use log::LevelFilter;
+use std::fs::File;
+use std::os::unix::prelude::MetadataExt;
+use std::path::PathBuf;
+use vortex::array::Array;
+use vortex::formatter::display_tree;
+
+/// Compresses the taxi Parquet file into `taxi_data.vortex`, reads the result back, and prints
+/// the array tree followed by a size comparison against the Parquet source.
+///
+/// # Panics
+///
+/// Panics if the output file cannot be created, if compression or re-opening fails, or if the
+/// Parquet file's metadata cannot be read.
+pub fn main() {
+    setup_logger(LevelFilter::Debug);
+
+    // Resolve the Parquet input once and reuse it for both compression and the size lookup.
+    let parquet_path = taxi_data_parquet();
+
+    let path: PathBuf = "taxi_data.vortex".into();
+    {
+        // Scope the writer so the file handle is dropped before the file is reopened below.
+        let mut write = File::create(&path).unwrap();
+        compress_vortex(&parquet_path, &mut write).unwrap();
+    }
+
+    let taxi_vortex = open_vortex(&path).unwrap();
+
+    let pq_size = parquet_path.metadata().unwrap().size();
+    let vx_size = taxi_vortex.nbytes();
+
+    println!("{}\n\n", display_tree(taxi_vortex.as_ref()));
+    println!("Parquet size: {}, Vortex size: {}", pq_size, vx_size);
+    // The printed ratio is Vortex bytes divided by Parquet bytes.
+    println!("Compression ratio: {}", vx_size as f32 / pq_size as f32);
+}
diff --git a/bench-vortex/src/bin/serde.rs b/bench-vortex/src/bin/serde.rs
@@ -1,10 +1,11 @@
+use bench_vortex::reader::take_vortex;
 use bench_vortex::setup_logger;
-use bench_vortex::taxi_data::{take_taxi_data, write_taxi_data};
+use bench_vortex::taxi_data::taxi_data_vortex;
 use log::LevelFilter;
 
 pub fn main() {
     setup_logger(LevelFilter::Debug);
-    let taxi_spiral = write_taxi_data();
-    let rows = take_taxi_data(&taxi_spiral, &[10, 11, 12, 13]); //, 100_000, 3_000_000]);
+    let taxi_vortex = taxi_data_vortex();
+    let rows = take_vortex(&taxi_vortex, &[10, 11, 12, 13, 100_000, 3_000_000]).unwrap();
     println!("TAKE TAXI DATA: {:?}", rows);
 }
diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs
@@ -9,6 +9,7 @@ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
 use parquet::arrow::ProjectionMask;
 use simplelog::{ColorChoice, Config, TermLogger, TerminalMode};
 
+use crate::taxi_data::taxi_data_parquet;
 use vortex::array::chunked::ChunkedArray;
 use vortex::array::downcast::DowncastArrayBuiltin;
 use vortex::array::IntoArray;
@@ -18,23 +19,35 @@ use vortex::compress::{CompressConfig, CompressCtx};
 use vortex::formatter::display_tree;
 use vortex_alp::ALPEncoding;
 use vortex_datetime::DateTimeEncoding;
-use vortex_fastlanes::{BitPackedEncoding, DeltaEncoding, FoREncoding};
+use vortex_dict::DictEncoding;
+use vortex_fastlanes::{BitPackedEncoding, FoREncoding};
 use vortex_ree::REEEncoding;
 use vortex_roaring::RoaringBoolEncoding;
 use vortex_schema::DType;
 
+pub mod reader;
 pub mod taxi_data;
 
-pub fn idempotent(name: &str, f: impl FnOnce(&mut File)) -> PathBuf {
+/// Runs `f` to populate the data file `name`, unless that file already exists.
+///
+/// The target path is obtained from `data_path`. When the file is missing it is created and
+/// handed to `f`. If `f` fails, the partially written file is removed before the error is
+/// returned, so a later call retries instead of treating the leftover file as complete.
+///
+/// # Errors
+///
+/// Returns the error produced by `f`.
+///
+/// # Panics
+///
+/// Panics if the file cannot be created.
+pub fn idempotent<T, E>(
+    name: &str,
+    f: impl FnOnce(&mut File) -> Result<T, E>,
+) -> Result<PathBuf, E> {
+    let path = data_path(name);
+    if !path.exists() {
+        let mut file = File::create(&path).unwrap();
+        if let Err(err) = f(&mut file) {
+            // Close the handle before unlinking; removal is best-effort cleanup, so its own
+            // failure is deliberately ignored in favour of reporting the original error.
+            drop(file);
+            let _ = std::fs::remove_file(&path);
+            return Err(err);
+        }
+    }
+    Ok(path)
+}
+
+pub fn data_path(name: &str) -> PathBuf {
     let path = Path::new(env!("CARGO_MANIFEST_DIR"))
         .join("data")
         .join(name);
-    if !path.exists() {
+    if !path.parent().unwrap().exists() {
         create_dir_all(path.parent().unwrap()).unwrap();
-        let mut file = File::create(&path).unwrap();
-        f(&mut file);
     }
-    path.to_path_buf()
+    path
 }
 
 pub fn setup_logger(level: LevelFilter) {
@@ -51,11 +64,11 @@ pub fn enumerate_arrays() -> Vec<EncodingRef> {
     println!("FOUND {:?}", ENCODINGS.iter().map(|e| e.id()).collect_vec());
     vec![
         &ALPEncoding,
-        //&DictEncoding,
+        &DictEncoding,
         &BitPackedEncoding,
         &FoREncoding,
         &DateTimeEncoding,
-        &DeltaEncoding,
+        // &DeltaEncoding,  Blows up the search space too much.
         &REEEncoding,
         &RoaringBoolEncoding,
         // RoaringIntEncoding,
@@ -71,7 +84,7 @@ pub fn compress_ctx() -> CompressCtx {
 }
 
 pub fn compress_taxi_data() -> ArrayRef {
-    let file = File::open(taxi_data::download_taxi_data()).unwrap();
+    let file = File::open(taxi_data_parquet()).unwrap();
     let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
     let _mask = ProjectionMask::roots(builder.parquet_schema(), [1]);
     let _no_datetime_mask = ProjectionMask::roots(
@@ -141,7 +154,7 @@ mod test {
     use vortex::encode::FromArrowArray;
     use vortex::serde::{ReadCtx, WriteCtx};
 
-    use crate::taxi_data::download_taxi_data;
+    use crate::taxi_data::taxi_data_parquet;
     use crate::{compress_ctx, compress_taxi_data, setup_logger};
 
     #[ignore]
@@ -154,7 +167,7 @@ mod test {
     #[ignore]
     #[test]
     fn round_trip_serde() {
-        let file = File::open(download_taxi_data()).unwrap();
+        let file = File::open(taxi_data_parquet()).unwrap();
         let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
         let reader = builder.with_limit(1).build().unwrap();
 
@@ -176,7 +189,7 @@ mod test {
     #[ignore]
     #[test]
     fn round_trip_arrow() {
-        let file = File::open(download_taxi_data()).unwrap();
+        let file = File::open(taxi_data_parquet()).unwrap();
         let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
         let reader = builder.with_limit(1).build().unwrap();
 
@@ -194,7 +207,7 @@ mod test {
     #[ignore]
     #[test]
     fn round_trip_arrow_compressed() {
-        let file = File::open(download_taxi_data()).unwrap();
+        let file = File::open(taxi_data_parquet()).unwrap();
         let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
         let reader = builder.with_limit(1).build().unwrap();
 

diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs
@@ -0,0 +1,124 @@
+use crate::compress_ctx;
+use arrow_array::types::Int64Type;
+use arrow_array::{
+    ArrayRef as ArrowArrayRef, PrimitiveArray as ArrowPrimitiveArray, RecordBatch,
+    RecordBatchReader,
+};
+use arrow_select::concat::concat_batches;
+use arrow_select::take::take_record_batch;
+use itertools::Itertools;
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+use std::sync::Arc;
+use vortex::array::chunked::ChunkedArray;
+use vortex::array::primitive::PrimitiveArray;
+use vortex::array::{ArrayRef, IntoArray};
+use vortex::arrow::FromArrowType;
+use vortex::compute::flatten::flatten;
+use vortex::compute::take::take;
+use vortex::ptype::PType;
+use vortex::serde::{ReadCtx, WriteCtx};
+use vortex_error::VortexResult;
+use vortex_schema::DType;
+
+/// Opens the file at `path` and deserializes the Vortex array stored in it.
+///
+/// The dtype is read from the file first and then supplied as the schema for reading the array.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, or if reading the dtype or the array fails.
+pub fn open_vortex(path: &Path) -> VortexResult<ArrayRef> {
+    let mut source = File::open(path)?;
+    // `ReadCtx::new` needs a dtype up front; this placeholder is only used to read the real
+    // dtype, which is then passed to `with_schema`.
+    let placeholder: DType = PType::U8.into();
+    let mut ctx = ReadCtx::new(&placeholder, &mut source);
+    let stored_dtype = ctx.dtype()?;
+    ctx.with_schema(&stored_dtype).read()
+}
+
+/// Reads the Parquet file at `parquet_path`, compresses every record batch with the context
+/// from `compress_ctx`, and writes the dtype followed by the resulting chunked array to `write`.
+///
+/// # Errors
+///
+/// Returns an error if the Parquet file cannot be opened or decoded, if compressing a batch
+/// fails, or if serializing to `write` fails.
+pub fn compress_vortex<W: Write>(parquet_path: &Path, write: &mut W) -> VortexResult<()> {
+    let taxi_pq = File::open(parquet_path)?;
+    let builder = ParquetRecordBatchReaderBuilder::try_new(taxi_pq)?;
+
+    // FIXME(ngates): #157 the compressor should handle batch size.
+    let reader = builder.with_batch_size(65_536).build()?;
+
+    let dtype = DType::from_arrow(reader.schema());
+    let ctx = compress_ctx();
+
+    // Each record batch becomes one compressed chunk; the first failure aborts the collection
+    // and is propagated to the caller instead of panicking.
+    let chunks = reader
+        .map(|batch_result| -> VortexResult<ArrayRef> {
+            let vortex_array = batch_result?.into_array();
+            ctx.compress(&vortex_array, None)
+        })
+        .collect::<VortexResult<Vec<_>>>()?;
+    let chunked = ChunkedArray::new(chunks, dtype.clone());
+
+    let mut write_ctx = WriteCtx::new(write);
+    write_ctx.dtype(&dtype)?;
+    write_ctx.write(&chunked)?;
+    Ok(())
+}
+
+/// Opens the Vortex file at `path`, takes the rows at `indices`, and returns the flattened
+/// result.
+///
+/// # Errors
+///
+/// Returns an error if opening the file, taking the rows, or flattening fails.
+pub fn take_vortex(path: &Path, indices: &[u64]) -> VortexResult<ArrayRef> {
+    let array = open_vortex(path)?;
+    let index_array = PrimitiveArray::from(indices.to_vec());
+    let taken = take(&array, &index_array)?;
+    // For equivalence with the other readers, flatten so we're not cheating too much.
+    let flattened = flatten(&taken)?;
+    Ok(flattened.into_array())
+}
+
+/// Reads the rows at the global row positions `indices` from the Parquet file at `path`.
+///
+/// Only the row groups that contain a requested row are read. The requested rows are taken
+/// from each of those groups and the per-group results are concatenated in ascending row-group
+/// order, so the output follows row-group order rather than the order of `indices`.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, if the Parquet reader cannot be built, or if
+/// decoding, taking from, or concatenating the batches fails.
+///
+/// # Panics
+///
+/// Panics if the reader yields more batches than row groups were requested.
+/// NOTE(review): indices at or beyond the file's row count are not validated here and map to a
+/// row group that does not exist; behaviour then depends on the Parquet reader. TODO confirm.
+pub fn take_parquet(path: &Path, indices: &[u64]) -> VortexResult<RecordBatch> {
+    let file = File::open(path)?;
+
+    // TODO(ngates): enable read_page_index
+    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+
+    // Cumulative row offsets: entry `i` is the global position of the first row of row group
+    // `i`, and the last entry is the total number of rows.
+    let mut row_group_offsets = vec![0];
+    row_group_offsets.extend(
+        builder
+            .metadata()
+            .row_groups()
+            .iter()
+            .map(|rg| rg.num_rows())
+            .scan(0i64, |acc, x| {
+                *acc += x;
+                Some(*acc)
+            }),
+    );
+
+    // Bucket the requested rows by row group, rebasing each one to be relative to its group.
+    let mut row_groups: HashMap<usize, Vec<i64>> = HashMap::new();
+    for idx in indices {
+        let idx = *idx as i64;
+        // An exact hit is the first row of that group; otherwise the row belongs to the group
+        // just before the insertion point.
+        let row_group_idx = row_group_offsets
+            .binary_search(&idx)
+            .unwrap_or_else(|e| e - 1);
+        row_groups
+            .entry(row_group_idx)
+            .or_default()
+            .push(idx - row_group_offsets[row_group_idx]);
+    }
+
+    // `HashMap` iteration order is unspecified, so derive the row-group list and the matching
+    // per-group indices from one sorted pass. That keeps batch `i` from the reader paired with
+    // the indices of the `i`-th requested row group.
+    let (selected_row_groups, row_group_indices): (Vec<usize>, Vec<Vec<i64>>) = row_groups
+        .into_iter()
+        .sorted_by_key(|(row_group_idx, _)| *row_group_idx)
+        .unzip();
+
+    let reader = builder
+        .with_row_groups(selected_row_groups)
+        // FIXME(ngates): our indices code assumes the batch size == the row group sizes
+        .with_batch_size(10_000_000)
+        .build()?;
+
+    let schema = reader.schema();
+
+    let batches = reader
+        .enumerate()
+        .map(|(idx, batch)| -> VortexResult<RecordBatch> {
+            let batch = batch?;
+            let indices = ArrowPrimitiveArray::<Int64Type>::from(row_group_indices[idx].clone());
+            let indices_array: ArrowArrayRef = Arc::new(indices);
+            Ok(take_record_batch(&batch, &indices_array)?)
+        })
+        .collect::<VortexResult<Vec<_>>>()?;
+
+    Ok(concat_batches(&schema, &batches)?)
+}