-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add PBI medicare dataset & vortex/parquet comparison (#187)
- Adds one of the medicare datasets* from the ALP paper to our compress and random_access benchmarks - Adds mechanism for handling csv data - Adds mechanism for handling bzipped data - Refactors to enable translating general parquet to lance - General refactors to make data download/translation ops more modular *https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/Medicare1 -- contains two bzipped files, this downloads just the first
- Loading branch information
Showing 13 changed files with 346 additions and 48 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,25 @@ | ||
use bench_vortex::compress_taxi_data; | ||
use bench_vortex::medicare_data::medicare_data_csv; | ||
use bench_vortex::taxi_data::taxi_data_parquet; | ||
use bench_vortex::{compress_medicare_data, compress_taxi_data}; | ||
use criterion::{black_box, criterion_group, criterion_main, Criterion}; | ||
|
||
fn vortex_compress(c: &mut Criterion) { | ||
fn vortex_compress_taxi(c: &mut Criterion) { | ||
taxi_data_parquet(); | ||
let mut group = c.benchmark_group("end to end"); | ||
group.sample_size(10); | ||
group.bench_function("compress", |b| b.iter(|| black_box(compress_taxi_data()))); | ||
group.finish() | ||
} | ||
|
||
criterion_group!(benches, vortex_compress); | ||
fn vortex_compress_medicare(c: &mut Criterion) { | ||
medicare_data_csv(); | ||
let mut group = c.benchmark_group("end to end"); | ||
group.sample_size(10); | ||
group.bench_function("compress", |b| { | ||
b.iter(|| black_box(compress_medicare_data())) | ||
}); | ||
group.finish() | ||
} | ||
|
||
criterion_group!(benches, vortex_compress_taxi, vortex_compress_medicare); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
use std::path::PathBuf; | ||
use std::sync::Arc; | ||
|
||
use arrow::datatypes::SchemaRef; | ||
use arrow_csv::reader::Format; | ||
use itertools::Itertools; | ||
use vortex::array::chunked::ChunkedArray; | ||
use vortex::array::IntoArray; | ||
use vortex::arrow::FromArrowType; | ||
use vortex::serde::WriteCtx; | ||
use vortex_error::VortexError; | ||
use vortex_schema::DType; | ||
|
||
use crate::data_downloads::{decompress_bz2, download_data, parquet_to_lance}; | ||
use crate::idempotent; | ||
use crate::reader::{compress_csv_to_vortex, default_csv_format, write_csv_as_parquet}; | ||
|
||
pub fn medicare_data_csv() -> PathBuf { | ||
let fname = "Medicare1_1.csv.bz2"; | ||
download_data( | ||
fname, | ||
"http://www.cwi.nl/~boncz/PublicBIbenchmark/Medicare1/Medicare1_1.csv.bz2", | ||
); | ||
decompress_bz2(fname, "Medicare1_1.csv") | ||
} | ||
|
||
pub fn medicare_data_lance() -> PathBuf { | ||
let taxi_data = File::open(medicare_data_parquet()).unwrap(); | ||
idempotent("medicare.lance", |path| { | ||
Ok::<PathBuf, VortexError>(parquet_to_lance(path, taxi_data)) | ||
}) | ||
.unwrap() | ||
} | ||
|
||
pub fn medicare_data_vortex_uncompressed() -> PathBuf { | ||
idempotent("medicare-uncompressed.vortex", |path| { | ||
let csv_file = File::open(medicare_data_csv()).unwrap(); | ||
let reader = BufReader::new(csv_file.try_clone().unwrap()); | ||
|
||
let (schema, _) = Format::default() | ||
.infer_schema(&mut csv_file.try_clone().unwrap(), None) | ||
.unwrap(); | ||
|
||
let csv_reader = arrow::csv::ReaderBuilder::new(Arc::new(schema.clone())) | ||
.with_batch_size(crate::reader::BATCH_SIZE) | ||
.build(reader)?; | ||
|
||
let dtype = DType::from_arrow(SchemaRef::new(schema.clone())); | ||
|
||
let chunks = csv_reader | ||
.map(|batch_result| batch_result.unwrap()) | ||
.map(|record_batch| record_batch.into_array()) | ||
.collect_vec(); | ||
let chunked = ChunkedArray::new(chunks, dtype.clone()); | ||
|
||
let mut write = File::create(path).unwrap(); | ||
let mut write_ctx = WriteCtx::new(&mut write); | ||
write_ctx.dtype(&dtype)?; | ||
write_ctx.write(&chunked) | ||
}) | ||
.unwrap() | ||
} | ||
|
||
pub fn medicare_data_vortex() -> PathBuf { | ||
idempotent("medicare.vortex", |path| { | ||
let mut write = File::create(path).unwrap(); | ||
let delimiter = u8::try_from('|').unwrap(); | ||
compress_csv_to_vortex( | ||
medicare_data_csv(), | ||
default_csv_format().with_delimiter(delimiter), | ||
&mut write, | ||
) | ||
}) | ||
.unwrap() | ||
} | ||
|
||
pub fn medicare_data_parquet() -> PathBuf { | ||
idempotent("medicare.parquet", |path| { | ||
let delimiter = u8::try_from('|').unwrap(); | ||
let format = default_csv_format().with_delimiter(delimiter); | ||
let file = File::create(path).unwrap(); | ||
write_csv_as_parquet(medicare_data_csv(), format, file) | ||
}) | ||
.unwrap() | ||
} |
Oops, something went wrong.