From 69949a3c4f46f8af2bb4d24607b8d9c18108c35a Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 15 Jul 2024 18:11:24 +0100 Subject: [PATCH 1/4] Rest of TPCH --- bench-vortex/benches/tpch_benchmark.rs | 79 +++++++++++++++++--------- bench-vortex/src/bin/tpch_benchmark.rs | 12 ++-- bench-vortex/src/tpch/mod.rs | 9 ++- bench-vortex/src/tpch/query.rs | 24 -------- bench-vortex/tpch/q1.sql | 21 +++++++ bench-vortex/tpch/q10.sql | 31 ++++++++++ bench-vortex/tpch/q11.sql | 27 +++++++++ bench-vortex/tpch/q12.sql | 30 ++++++++++ bench-vortex/tpch/q13.sql | 20 +++++++ bench-vortex/tpch/q14.sql | 13 +++++ bench-vortex/tpch/q15.sql | 34 +++++++++++ bench-vortex/tpch/q16.sql | 30 ++++++++++ bench-vortex/tpch/q17.sql | 17 ++++++ bench-vortex/tpch/q18.sql | 32 +++++++++++ bench-vortex/tpch/q19.sql | 35 ++++++++++++ bench-vortex/tpch/q2.sql | 43 ++++++++++++++ bench-vortex/tpch/q20.sql | 37 ++++++++++++ bench-vortex/tpch/q21.sql | 39 +++++++++++++ bench-vortex/tpch/q22.sql | 37 ++++++++++++ bench-vortex/tpch/q3.sql | 22 +++++++ bench-vortex/tpch/q4.sql | 21 +++++++ bench-vortex/tpch/q5.sql | 24 ++++++++ bench-vortex/tpch/q6.sql | 9 +++ bench-vortex/tpch/q7.sql | 39 +++++++++++++ bench-vortex/tpch/q8.sql | 37 ++++++++++++ bench-vortex/tpch/q9.sql | 32 +++++++++++ 26 files changed, 697 insertions(+), 57 deletions(-) delete mode 100644 bench-vortex/src/tpch/query.rs create mode 100644 bench-vortex/tpch/q1.sql create mode 100644 bench-vortex/tpch/q10.sql create mode 100644 bench-vortex/tpch/q11.sql create mode 100644 bench-vortex/tpch/q12.sql create mode 100644 bench-vortex/tpch/q13.sql create mode 100644 bench-vortex/tpch/q14.sql create mode 100644 bench-vortex/tpch/q15.sql create mode 100644 bench-vortex/tpch/q16.sql create mode 100644 bench-vortex/tpch/q17.sql create mode 100644 bench-vortex/tpch/q18.sql create mode 100644 bench-vortex/tpch/q19.sql create mode 100644 bench-vortex/tpch/q2.sql create mode 100644 bench-vortex/tpch/q20.sql create mode 100644 bench-vortex/tpch/q21.sql create mode 100644 bench-vortex/tpch/q22.sql create mode 100644 bench-vortex/tpch/q3.sql create mode 100644 bench-vortex/tpch/q4.sql create mode 100644 bench-vortex/tpch/q5.sql create mode 100644 bench-vortex/tpch/q6.sql create mode 100644 bench-vortex/tpch/q7.sql create mode 100644 bench-vortex/tpch/q8.sql create mode 100644 bench-vortex/tpch/q9.sql diff --git a/bench-vortex/benches/tpch_benchmark.rs b/bench-vortex/benches/tpch_benchmark.rs index fc98eb6470..2f3f5d451c 100644 --- a/bench-vortex/benches/tpch_benchmark.rs +++ b/bench-vortex/benches/tpch_benchmark.rs @@ -1,5 +1,4 @@ use bench_vortex::tpch::dbgen::{DBGen, DBGenOptions}; -use bench_vortex::tpch::query::Q1; use bench_vortex::tpch::{load_datasets, Format}; use criterion::{criterion_group, criterion_main, Criterion}; use tokio::runtime::Builder; @@ -10,10 +9,7 @@ fn benchmark(c: &mut Criterion) { // Run TPC-H data gen. let data_dir = DBGen::new(DBGenOptions::default()).generate().unwrap(); - let mut group = c.benchmark_group("tpch q1"); - group.sample_size(10); - - let ctx = runtime + let vortex_no_pushdown_ctx = runtime .block_on(load_datasets( &data_dir, Format::Vortex { @@ -21,12 +17,7 @@ fn benchmark(c: &mut Criterion) { }, )) .unwrap(); - group.bench_function("vortex-pushdown", |b| { - b.to_async(&runtime) - .iter(|| async { ctx.sql(Q1).await.unwrap().collect().await.unwrap() }) - }); - - let ctx = runtime + let vortex_ctx = runtime .block_on(load_datasets( &data_dir, Format::Vortex { @@ -34,26 +25,60 @@ fn benchmark(c: &mut Criterion) { }, )) .unwrap(); - group.bench_function("vortex-nopushdown", |b| { - b.to_async(&runtime) - .iter(|| async { ctx.sql(Q1).await.unwrap().collect().await.unwrap() }) - }); - - let ctx = runtime + let csv_ctx = runtime .block_on(load_datasets(&data_dir, Format::Csv)) .unwrap(); - group.bench_function("csv", |b| { - b.to_async(&runtime) - .iter(|| async { ctx.sql(Q1).await.unwrap().collect().await.unwrap() }) - }); - - let ctx = runtime + let arrow_ctx = runtime .block_on(load_datasets(&data_dir, Format::Arrow)) .unwrap(); - group.bench_function("arrow", |b| { - b.to_async(&runtime) - .iter(|| async { ctx.sql(Q1).await.unwrap().collect().await.unwrap() }) - }); + + for q in 1..=22 { + let query = bench_vortex::tpch::tpch_query(q); + + let mut group = c.benchmark_group(format!("tpch_q{q}")); + group.sample_size(10); + + group.bench_function("vortex-pushdown", |b| { + b.to_async(&runtime).iter(|| async { + vortex_ctx + .sql(&query) + .await + .unwrap() + .collect() + .await + .unwrap() + }) + }); + + group.bench_function("vortex-nopushdown", |b| { + b.to_async(&runtime).iter(|| async { + vortex_no_pushdown_ctx + .sql(&query) + .await + .unwrap() + .collect() + .await + .unwrap() + }) + }); + + group.bench_function("csv", |b| { + b.to_async(&runtime) + .iter(|| async { csv_ctx.sql(&query).await.unwrap().collect().await.unwrap() }) + }); + + group.bench_function("arrow", |b| { + b.to_async(&runtime).iter(|| async { + arrow_ctx + .sql(&query) + .await + .unwrap() + .collect() + .await + .unwrap() + }) + }); + } } criterion_group!(benches, benchmark); diff --git a/bench-vortex/src/bin/tpch_benchmark.rs b/bench-vortex/src/bin/tpch_benchmark.rs index d9c46232ac..c80c65c1fb 100644 --- a/bench-vortex/src/bin/tpch_benchmark.rs +++ b/bench-vortex/src/bin/tpch_benchmark.rs @@ -2,17 +2,17 @@ use std::path::PathBuf; use std::time::SystemTime; -use bench_vortex::tpch; use bench_vortex::tpch::dbgen::{DBGen, DBGenOptions}; -use bench_vortex::tpch::{load_datasets, Format}; +use bench_vortex::tpch::{load_datasets, tpch_query, Format}; async fn q1_csv(base_dir: &PathBuf) -> anyhow::Result<()> { let ctx = load_datasets(base_dir, Format::Csv).await?; + let q1 = tpch_query(1); println!("BEGIN: Q1(CSV)"); let start = SystemTime::now(); - ctx.sql(tpch::query::Q1).await?.show().await?; + ctx.sql(&q1).await?.show().await?; let elapsed = start.elapsed()?.as_millis(); println!("END CSV: {elapsed}ms"); @@ -21,11 +21,12 @@ async fn q1_csv(base_dir: &PathBuf) -> anyhow::Result<()> { async fn q1_arrow(base_dir: &PathBuf) -> anyhow::Result<()> { let ctx = load_datasets(base_dir, Format::Arrow).await?; + let q1 = tpch_query(1); println!("BEGIN: Q1(ARROW)"); let start = SystemTime::now(); - ctx.sql(tpch::query::Q1).await?.show().await?; + ctx.sql(&q1).await?.show().await?; let elapsed = start.elapsed()?.as_millis(); println!("END ARROW: {elapsed}ms"); @@ -41,11 +42,12 @@ async fn q1_vortex(base_dir: &PathBuf) -> anyhow::Result<()> { }, ) .await?; + let q1 = tpch_query(1); println!("BEGIN: Q1(VORTEX)"); let start = SystemTime::now(); - ctx.sql(tpch::query::Q1).await?.show().await?; + ctx.sql(&q1).await?.show().await?; let elapsed = start.elapsed()?.as_millis(); println!("END VORTEX: {elapsed}ms"); diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs index 473905c433..0d5fe3d93d 100644 --- a/bench-vortex/src/tpch/mod.rs +++ b/bench-vortex/src/tpch/mod.rs @@ -1,3 +1,4 @@ +use std::fs; use std::path::Path; use std::sync::Arc; @@ -11,7 +12,6 @@ use vortex::{Array, ArrayDType, ArrayData, IntoArray}; use vortex_datafusion::{SessionContextExt, VortexMemTableOptions}; pub mod dbgen; -pub mod query; pub mod schema; pub enum Format { @@ -156,3 +156,10 @@ async fn register_vortex( Ok(()) } + +pub fn tpch_query(query_idx: usize) -> String { + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tpch") + .join(format!("q{}.sql", query_idx)); + fs::read_to_string(manifest_dir).unwrap() +} diff --git a/bench-vortex/src/tpch/query.rs b/bench-vortex/src/tpch/query.rs deleted file mode 100644 index 19a20aadf6..0000000000 --- a/bench-vortex/src/tpch/query.rs +++ /dev/null @@ -1,24 +0,0 @@ -pub const Q1: &str = r#" - select - l_returnflag, - l_linestatus, - sum(l_quantity) as sum_qty, - sum(l_extendedprice) as sum_base_price, - sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, - sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, - avg(l_quantity) as avg_qty, - avg(l_extendedprice) as avg_price, - avg(l_discount) as avg_disc, - count(*) as count_order - from - lineitem - where - l_shipdate <= '1998-11-30' - group by - l_returnflag, - l_linestatus - order by - l_returnflag, - l_linestatus - LIMIT 1; - "#; diff --git a/bench-vortex/tpch/q1.sql b/bench-vortex/tpch/q1.sql new file mode 100644 index 0000000000..a0fcf159e2 --- /dev/null +++ b/bench-vortex/tpch/q1.sql @@ -0,0 +1,21 @@ +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-09-02' +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; \ No newline at end of file diff --git a/bench-vortex/tpch/q10.sql b/bench-vortex/tpch/q10.sql new file mode 100644 index 0000000000..cf45e43485 --- /dev/null +++ b/bench-vortex/tpch/q10.sql @@ -0,0 +1,31 @@ +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1994-01-01' + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc; \ No newline at end of file diff --git a/bench-vortex/tpch/q11.sql b/bench-vortex/tpch/q11.sql new file mode 100644 index 0000000000..c23ed1c71b --- /dev/null +++ b/bench-vortex/tpch/q11.sql @@ -0,0 +1,27 @@ +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) +order by + value desc; \ No newline at end of file diff --git a/bench-vortex/tpch/q12.sql b/bench-vortex/tpch/q12.sql new file mode 100644 index 0000000000..f8e6d960c8 --- /dev/null +++ b/bench-vortex/tpch/q12.sql @@ -0,0 +1,30 @@ +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + lineitem + join + orders + on + l_orderkey = o_orderkey +where + l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1995-01-01' +group by + l_shipmode +order by + l_shipmode; \ No newline at end of file diff --git a/bench-vortex/tpch/q13.sql b/bench-vortex/tpch/q13.sql new file mode 100644 index 0000000000..4bfe8c3555 --- /dev/null +++ b/bench-vortex/tpch/q13.sql @@ -0,0 +1,20 @@ +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; \ No newline at end of file diff --git a/bench-vortex/tpch/q14.sql b/bench-vortex/tpch/q14.sql new file mode 100644 index 0000000000..d8ef6afaca --- /dev/null +++ b/bench-vortex/tpch/q14.sql @@ -0,0 +1,13 @@ +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-10-01'; \ No newline at end of file diff --git a/bench-vortex/tpch/q15.sql b/bench-vortex/tpch/q15.sql new file mode 100644 index 0000000000..b5cb49e5a5 --- /dev/null +++ b/bench-vortex/tpch/q15.sql @@ -0,0 +1,34 @@ +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey; + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; + +drop view revenue0; \ No newline at end of file diff --git a/bench-vortex/tpch/q16.sql b/bench-vortex/tpch/q16.sql new file mode 100644 index 0000000000..36b7c07c16 --- /dev/null +++ b/bench-vortex/tpch/q16.sql @@ -0,0 +1,30 @@ +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' +) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; \ No newline at end of file diff --git a/bench-vortex/tpch/q17.sql b/bench-vortex/tpch/q17.sql new file mode 100644 index 0000000000..1e65550634 --- /dev/null +++ b/bench-vortex/tpch/q17.sql @@ -0,0 +1,17 @@ +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey +); \ No newline at end of file diff --git a/bench-vortex/tpch/q18.sql b/bench-vortex/tpch/q18.sql new file mode 100644 index 0000000000..835de28a57 --- /dev/null +++ b/bench-vortex/tpch/q18.sql @@ -0,0 +1,32 @@ +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate; \ No newline at end of file diff --git a/bench-vortex/tpch/q19.sql b/bench-vortex/tpch/q19.sql new file mode 100644 index 0000000000..56668e73f8 --- /dev/null +++ b/bench-vortex/tpch/q19.sql @@ -0,0 +1,35 @@ +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); \ No newline at end of file diff --git a/bench-vortex/tpch/q2.sql b/bench-vortex/tpch/q2.sql new file mode 100644 index 0000000000..f66af21020 --- /dev/null +++ b/bench-vortex/tpch/q2.sql @@ -0,0 +1,43 @@ +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' +) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey; \ No newline at end of file diff --git a/bench-vortex/tpch/q20.sql b/bench-vortex/tpch/q20.sql new file mode 100644 index 0000000000..dd61a7d8e6 --- /dev/null +++ b/bench-vortex/tpch/q20.sql @@ -0,0 +1,37 @@ +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name; \ No newline at end of file diff --git a/bench-vortex/tpch/q21.sql b/bench-vortex/tpch/q21.sql new file mode 100644 index 0000000000..9d2fe32cee --- /dev/null +++ b/bench-vortex/tpch/q21.sql @@ -0,0 +1,39 @@ +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name; \ No newline at end of file diff --git a/bench-vortex/tpch/q22.sql b/bench-vortex/tpch/q22.sql new file mode 100644 index 0000000000..90aea6fd74 --- /dev/null +++ b/bench-vortex/tpch/q22.sql @@ -0,0 +1,37 @@ +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; \ No newline at end of file diff --git a/bench-vortex/tpch/q3.sql b/bench-vortex/tpch/q3.sql new file mode 100644 index 0000000000..7dbc6d9ef6 --- /dev/null +++ b/bench-vortex/tpch/q3.sql @@ -0,0 +1,22 @@ +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate; \ No newline at end of file diff --git a/bench-vortex/tpch/q4.sql b/bench-vortex/tpch/q4.sql new file mode 100644 index 0000000000..74a620dbc8 --- /dev/null +++ b/bench-vortex/tpch/q4.sql @@ -0,0 +1,21 @@ +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; \ No newline at end of file diff --git a/bench-vortex/tpch/q5.sql b/bench-vortex/tpch/q5.sql new file mode 100644 index 0000000000..5a336b2311 --- /dev/null +++ b/bench-vortex/tpch/q5.sql @@ -0,0 +1,24 @@ +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1995-01-01' +group by + n_name +order by + revenue desc; \ No newline at end of file diff --git a/bench-vortex/tpch/q6.sql b/bench-vortex/tpch/q6.sql new file mode 100644 index 0000000000..5806f980f8 --- /dev/null +++ b/bench-vortex/tpch/q6.sql @@ -0,0 +1,9 @@ +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1995-01-01' + and l_discount between 0.06 - 0.01 and 0.06 + 0.01 + and l_quantity < 24; \ No newline at end of file diff --git a/bench-vortex/tpch/q7.sql b/bench-vortex/tpch/q7.sql new file mode 100644 index 0000000000..512e5be55a --- /dev/null +++ b/bench-vortex/tpch/q7.sql @@ -0,0 +1,39 @@ +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; diff --git a/bench-vortex/tpch/q8.sql b/bench-vortex/tpch/q8.sql new file mode 100644 index 0000000000..6ddb2a6747 --- /dev/null +++ b/bench-vortex/tpch/q8.sql @@ -0,0 +1,37 @@ +select + o_year, + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year; \ No newline at end of file diff --git a/bench-vortex/tpch/q9.sql b/bench-vortex/tpch/q9.sql new file mode 100644 index 0000000000..587bbc8a20 --- /dev/null +++ b/bench-vortex/tpch/q9.sql @@ -0,0 +1,32 @@ +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; \ No newline at end of file From 8bbd2a0cd2803a6a096d95be4d1815f041421daa Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 16 Jul 2024 08:28:21 +0100 Subject: [PATCH 2/4] More benchmarks --- Cargo.lock | 142 ++++++++++++++++++++++++- Cargo.toml | 3 + bench-vortex/Cargo.toml | 3 + bench-vortex/benches/tpch_benchmark.rs | 4 + bench-vortex/src/bin/tpch_benchmark.rs | 107 +++++++++---------- bench-vortex/src/tpch/mod.rs | 1 + 6 files changed, 201 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0c7e91c02..398aa99954 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -433,13 +433,16 @@ dependencies = [ "flexbuffers", "futures", "humansize", + "indicatif", "itertools 0.13.0", "lazy_static", "log", "mimalloc", "object_store", "parquet", + "prettytable-rs", "rand", + "rayon", "reqwest", "serde", "simplelog", @@ -735,6 +738,19 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af" +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode 0.3.6", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.52.0", +] + [[package]] name = "const-random" version = "0.1.18" @@ -1249,6 +1265,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "divan" version = "0.1.14" @@ -1286,6 +1323,18 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "encoding_rs" version = "0.8.34" @@ -1787,6 +1836,19 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + [[package]] name = "indoc" version = "2.0.5" @@ -1981,6 +2043,16 @@ dependencies = [ "libc", ] +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.5.0", + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -2307,6 +2379,12 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "object" version = "0.36.0" @@ -2639,6 +2717,20 @@ dependencies = [ "syn 2.0.68", ] +[[package]] +name = "prettytable-rs" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eea25e07510aa6ab6547308ebe3c036016d162b8da920dbb079e3ba8acf3d95a" +dependencies = [ + "csv", + "encode_unicode 1.0.0", + "is-terminal", + "lazy_static", + "term", + "unicode-width", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -2684,8 +2776,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bb182580f71dd070f88d01ce3de9f4da5021db7115d2e1c3605a754153b77c1" dependencies = [ "bytes", - "heck 0.4.1", - "itertools 0.12.1", + "heck 0.5.0", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -2705,7 +2797,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.68", @@ -2939,6 +3031,17 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "redox_users" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + [[package]] name = "regex" version = "1.10.5" @@ -3495,6 +3598,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + [[package]] name = "termcolor" version = "1.4.1" @@ -4288,6 +4402,22 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.8" @@ -4297,6 +4427,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index a98d39e694..6abcb7a48b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ getrandom = "0.2.14" half = { version = "^2", features = ["std", "num-traits"] } hashbrown = "0.14.3" humansize = "2.1.3" +indicatif = "0.17.8" itertools = "0.13.0" lazy_static = "1.4.0" leb128 = "0.2.5" @@ -85,12 +86,14 @@ num_enum = "0.7.2" parquet = "52.0.0" paste = "1.0.14" pin-project = "1.1.5" +prettytable-rs = "0.10.0" prost = "0.13.0" prost-build = "0.13.0" prost-types = "0.13.0" pyo3 = { version = "0.21.2", features = ["extension-module", "abi3-py311"] } pyo3-log = "0.11.0" rand = "0.8.5" +rayon = "1.10.0" reqwest = { version = "0.12.0", features = ["blocking"] } seq-macro = "0.3.5" serde = "1.0.197" diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index 89565d3172..ea3baf45bb 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -27,13 +27,16 @@ enum-iterator = { workspace = true } flexbuffers = { workspace = true } futures = { workspace = true, features = ["executor"] } humansize = { workspace = true } +indicatif = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } log = { workspace = true } mimalloc = { workspace = true } object_store = { workspace = true, features = ["aws"] } parquet = { workspace = true, features = [] } +prettytable-rs = { workspace = true } rand = { workspace = true } +rayon = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } simplelog = { workspace = true } diff --git a/bench-vortex/benches/tpch_benchmark.rs b/bench-vortex/benches/tpch_benchmark.rs index 2f3f5d451c..d49e76504a 100644 --- a/bench-vortex/benches/tpch_benchmark.rs +++ b/bench-vortex/benches/tpch_benchmark.rs @@ -33,6 +33,10 @@ fn benchmark(c: &mut Criterion) { .unwrap(); for q in 1..=22 { + if q == 15 { + // DataFusion does not support query 15 since it has multiple SQL statements. + } + let query = bench_vortex::tpch::tpch_query(q); let mut group = c.benchmark_group(format!("tpch_q{q}")); diff --git a/bench-vortex/src/bin/tpch_benchmark.rs b/bench-vortex/src/bin/tpch_benchmark.rs index c80c65c1fb..65ce2f41de 100644 --- a/bench-vortex/src/bin/tpch_benchmark.rs +++ b/bench-vortex/src/bin/tpch_benchmark.rs @@ -1,61 +1,11 @@ -#![allow(dead_code)] -use std::path::PathBuf; use std::time::SystemTime; use bench_vortex::tpch::dbgen::{DBGen, DBGenOptions}; use bench_vortex::tpch::{load_datasets, tpch_query, Format}; +use indicatif::ProgressBar; +use prettytable::{Cell, Row, Table}; -async fn q1_csv(base_dir: &PathBuf) -> anyhow::Result<()> { - let ctx = load_datasets(base_dir, Format::Csv).await?; - let q1 = tpch_query(1); - - println!("BEGIN: Q1(CSV)"); - - let start = SystemTime::now(); - ctx.sql(&q1).await?.show().await?; - let elapsed = start.elapsed()?.as_millis(); - println!("END CSV: {elapsed}ms"); - - Ok(()) -} - -async fn q1_arrow(base_dir: &PathBuf) -> anyhow::Result<()> { - let ctx = load_datasets(base_dir, Format::Arrow).await?; - let q1 = tpch_query(1); - - println!("BEGIN: Q1(ARROW)"); - let start = SystemTime::now(); - - ctx.sql(&q1).await?.show().await?; - let elapsed = start.elapsed()?.as_millis(); - - println!("END ARROW: {elapsed}ms"); - - Ok(()) -} - -async fn q1_vortex(base_dir: &PathBuf) -> anyhow::Result<()> { - let ctx = load_datasets( - base_dir, - Format::Vortex { - disable_pushdown: true, - }, - ) - .await?; - let q1 = tpch_query(1); - - println!("BEGIN: Q1(VORTEX)"); - let start = SystemTime::now(); - - ctx.sql(&q1).await?.show().await?; - - let elapsed = start.elapsed()?.as_millis(); - println!("END VORTEX: {elapsed}ms"); - - Ok(()) -} - -#[tokio::main(flavor = "current_thread")] +#[tokio::main(flavor = "multi_thread", worker_threads = 8)] async fn main() { // uncomment the below to enable trace logging of datafusion execution // setup_logger(LevelFilter::Trace); @@ -63,7 +13,52 @@ async fn main() { // Run TPC-H data gen. let data_dir = DBGen::new(DBGenOptions::default()).generate().unwrap(); - q1_csv(&data_dir).await.unwrap(); - q1_arrow(&data_dir).await.unwrap(); - q1_vortex(&data_dir).await.unwrap(); + let formats = [ + Format::Csv, + Format::Arrow, + Format::Vortex { + disable_pushdown: false, + }, + Format::Vortex { + disable_pushdown: true, + }, + ]; + + // Set up a results table + let mut table = Table::new(); + let mut cells = vec![Cell::new("Query")]; + cells.extend(formats.iter().map(|f| Cell::new(&format!("{:?}", f)))); + table.add_row(Row::new(cells)); + + // Setup a progress bar + let progress = ProgressBar::new(22 * formats.len() as u64); + + for i in 1..=22 { + // Skip query 15 as it is not supported by DataFusion + if i == 15 { + continue; + } + + let query = tpch_query(i); + let mut cells = Vec::with_capacity(formats.len()); + cells.push(Cell::new(&format!("Q{}", i))); + for format in formats.iter() { + let ctx = load_datasets(&data_dir, *format).await.unwrap(); + let start = SystemTime::now(); + ctx.sql(&query) + .await + .map_err(|e| println!("Failed to run {} {:?}: {}", i, format, e)) + .unwrap() + .collect() + .await + .map_err(|e| println!("Failed to collect {} {:?}: {}", i, format, e)) + .unwrap(); + let elapsed = start.elapsed().unwrap(); + progress.inc(1); + cells.push(Cell::new(&format!("{} us", elapsed.as_micros()))); + } + table.add_row(Row::new(cells)); + } + progress.clone().finish(); + table.printstd(); } diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs index 0d5fe3d93d..2ad1ba3691 100644 --- a/bench-vortex/src/tpch/mod.rs +++ b/bench-vortex/src/tpch/mod.rs @@ -14,6 +14,7 @@ use vortex_datafusion::{SessionContextExt, VortexMemTableOptions}; pub mod dbgen; pub mod schema; +#[derive(Clone, Copy, Debug)] pub enum Format { Csv, Arrow, From 449207ccf759a4116f98a1b759a3ec5600ca4212 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 16 Jul 2024 09:15:16 +0100 Subject: [PATCH 3/4] More benchmarks --- bench-vortex/src/bin/tpch_benchmark.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bench-vortex/src/bin/tpch_benchmark.rs b/bench-vortex/src/bin/tpch_benchmark.rs index 65ce2f41de..1e2198d026 100644 --- a/bench-vortex/src/bin/tpch_benchmark.rs +++ b/bench-vortex/src/bin/tpch_benchmark.rs @@ -2,7 +2,9 @@ use std::time::SystemTime; use bench_vortex::tpch::dbgen::{DBGen, DBGenOptions}; use bench_vortex::tpch::{load_datasets, tpch_query, Format}; +use futures::future::join_all; use indicatif::ProgressBar; +use itertools::Itertools; use prettytable::{Cell, Row, Table}; #[tokio::main(flavor = "multi_thread", worker_threads = 8)] @@ -13,6 +15,7 @@ async fn main() { // Run TPC-H data gen. let data_dir = DBGen::new(DBGenOptions::default()).generate().unwrap(); + // The formats to run against (vs the baseline) let formats = [ Format::Csv, Format::Arrow, @@ -24,6 +27,17 @@ async fn main() { }, ]; + // Load datasets + let ctxs = join_all( + formats + .iter() + .map(|format| load_datasets(&data_dir, *format)), + ) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect_vec(); + // Set up a results table let mut table = Table::new(); let mut cells = vec![Cell::new("Query")]; @@ -42,8 +56,7 @@ async fn main() { let query = tpch_query(i); let mut cells = Vec::with_capacity(formats.len()); cells.push(Cell::new(&format!("Q{}", i))); - for format in formats.iter() { - let ctx = load_datasets(&data_dir, *format).await.unwrap(); + for (ctx, format) in ctxs.iter().zip(formats.iter()) { let start = SystemTime::now(); ctx.sql(&query) .await From 9c1f7b1a2c2601e7c4a1e05bbba8cd7803fe9390 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 16 Jul 2024 09:47:29 +0100 Subject: [PATCH 4/4] color benchmark output based on time compared to arrow --- bench-vortex/src/bin/tpch_benchmark.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/bench-vortex/src/bin/tpch_benchmark.rs b/bench-vortex/src/bin/tpch_benchmark.rs index 1e2198d026..e250a100cc 100644 --- a/bench-vortex/src/bin/tpch_benchmark.rs +++ b/bench-vortex/src/bin/tpch_benchmark.rs @@ -17,8 +17,8 @@ async fn main() { // The formats to run against (vs the baseline) let formats = [ - Format::Csv, Format::Arrow, + Format::Csv, Format::Vortex { disable_pushdown: false, }, @@ -56,6 +56,8 @@ async fn main() { let query = tpch_query(i); let mut cells = Vec::with_capacity(formats.len()); cells.push(Cell::new(&format!("Q{}", i))); + + let mut elapsed_us = Vec::new(); for (ctx, format) in ctxs.iter().zip(formats.iter()) { let start = SystemTime::now(); ctx.sql(&query) @@ -67,8 +69,25 @@ async fn main() { .map_err(|e| println!("Failed to collect {} {:?}: {}", i, format, e)) .unwrap(); let elapsed = start.elapsed().unwrap(); + elapsed_us.push(elapsed); progress.inc(1); - cells.push(Cell::new(&format!("{} us", elapsed.as_micros()))); + } + + let baseline = elapsed_us.first().unwrap(); + // yellow: 10% slower than baseline + let yellow = baseline.as_micros() + (baseline.as_micros() / 10); + // red: 50% slower than baseline + let red = baseline.as_micros() + (baseline.as_micros() / 50); + cells.push(Cell::new(&format!("{} us", baseline.as_micros())).style_spec("b")); + for measure in elapsed_us.iter().skip(1) { + let style_spec = if measure.as_micros() > red { + "bBr" + } else if measure.as_micros() > yellow { + "bFdBy" + } else { + "bFdBG" + }; + cells.push(Cell::new(&format!("{} us", measure.as_micros())).style_spec(style_spec)); } table.add_row(Row::new(cells)); }