diff --git a/docs/ppl-lang/README.md b/docs/ppl-lang/README.md index d78f4c030..ef186e5f2 100644 --- a/docs/ppl-lang/README.md +++ b/docs/ppl-lang/README.md @@ -104,6 +104,10 @@ For additional examples see the next [documentation](PPL-Example-Commands.md). ### Example PPL Queries See samples of [PPL queries](PPL-Example-Commands.md) +--- +### TPC-H PPL Query Rewriting +See samples of [TPC-H PPL query rewriting](ppl-tpch.md) + --- ### Planned PPL Commands diff --git a/docs/ppl-lang/ppl-tpch.md b/docs/ppl-lang/ppl-tpch.md new file mode 100644 index 000000000..ef5846ce0 --- /dev/null +++ b/docs/ppl-lang/ppl-tpch.md @@ -0,0 +1,102 @@ +## TPC-H Benchmark + +TPC-H is a decision support benchmark designed to evaluate the performance of database systems in handling complex business-oriented queries and concurrent data modifications. The benchmark utilizes a dataset that is broadly representative of various industries, making it widely applicable. TPC-H simulates a decision support environment where large volumes of data are analyzed, intricate queries are executed, and critical business questions are answered. + +### Test PPL Queries + +TPC-H 22 test query statements: [TPCH-Query-PPL](https://github.com/opensearch-project/opensearch-spark/blob/main/integ-test/src/integration/resources/tpch) + +### Data Preparation + +#### Option 1 - from PyPi + +``` +# Create the virtual environment +python3 -m venv .venv + +# Activate the virtual environment +. .venv/bin/activate + +pip install tpch-datagen +``` + +#### Option 2 - from source + +``` +git clone https://github.com/gizmodata/tpch-datagen + +cd tpch-datagen + +# Create the virtual environment +python3 -m venv .venv + +# Activate the virtual environment +. .venv/bin/activate + +# Upgrade pip, setuptools, and wheel +pip install --upgrade pip setuptools wheel + +# Install TPC-H Datagen - in editable mode with client and dev dependencies +pip install --editable .[dev] +``` + +#### Usage + +Here are the options for the tpch-datagen command: +``` +tpch-datagen --help +Usage: tpch-datagen [OPTIONS] + +Options: + --version / --no-version Prints the TPC-H Datagen package version and + exits. [required] + --scale-factor INTEGER The TPC-H Scale Factor to use for data + generation. + --data-directory TEXT The target output data directory to put the + files into [default: data; required] + --work-directory TEXT The work directory to use for data + generation. [default: /tmp; required] + --overwrite / --no-overwrite Can we overwrite the target directory if it + already exists... [default: no-overwrite; + required] + --num-chunks INTEGER The number of chunks that will be generated + - more chunks equals smaller memory + requirements, but more files generated. + [default: 10; required] + --num-processes INTEGER The maximum number of processes for the + multi-processing pool to use for data + generation. [default: 10; required] + --duckdb-threads INTEGER The number of DuckDB threads to use for data + generation (within each job process). + [default: 1; required] + --per-thread-output / --no-per-thread-output + Controls whether to write the output to a + single file or multiple files (for each + process). [default: per-thread-output; + required] + --compression-method [none|snappy|gzip|zstd] + The compression method to use for the + parquet files generated. [default: zstd; + required] + --file-size-bytes TEXT The target file size for the parquet files + generated. [default: 100m; required] + --help Show this message and exit. +``` + +### Generate 1 GB data with zstd (by default) compression + +``` +tpch-datagen --scale-factor 1 +``` + +### Generate 10 GB data with snappy compression + +``` +tpch-datagen --scale-factor 10 --compression-method snappy +``` + +### Query Test + +All TPC-H PPL Queries located in `integ-test/src/integration/resources/tpch` folder. + +To test all queries, run `org.opensearch.flint.spark.ppl.tpch.TPCHQueryITSuite`. \ No newline at end of file diff --git a/integ-test/src/integration/resources/tpch/q10.ppl b/integ-test/src/integration/resources/tpch/q10.ppl index a07618727..10a050785 100644 --- a/integ-test/src/integration/resources/tpch/q10.ppl +++ b/integ-test/src/integration/resources/tpch/q10.ppl @@ -34,9 +34,9 @@ limit 20 */ source = customer -| join left = l right = r ON c_custkey = o_custkey orders -| join left = l right = r ON l_orderkey = o_orderkey lineitem -| join left = l right = r ON c_nationkey = n_nationkey nation +| join ON c_custkey = o_custkey orders +| join ON l_orderkey = o_orderkey lineitem +| join ON c_nationkey = n_nationkey nation | where o_orderdate >= date('1993-10-01') AND o_orderdate < date_add(date('1993-10-01'), interval 3 month) AND l_returnflag = 'R' diff --git a/integ-test/src/integration/resources/tpch/q11.ppl b/integ-test/src/integration/resources/tpch/q11.ppl index db31f137c..3a55d986e 100644 --- a/integ-test/src/integration/resources/tpch/q11.ppl +++ b/integ-test/src/integration/resources/tpch/q11.ppl @@ -29,14 +29,14 @@ order by */ source = partsupp -| join left = l right = r ON ps_suppkey = s_suppkey supplier -| join left = l right = r ON s_nationkey = n_nationkey nation +| join ON ps_suppkey = s_suppkey supplier +| join ON s_nationkey = n_nationkey nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as value by ps_partkey | where value > [ source = partsupp - | join left = l right = r ON ps_suppkey = s_suppkey supplier - | join left = l right = r ON s_nationkey = n_nationkey nation + | join ON ps_suppkey = s_suppkey supplier + | join ON s_nationkey = n_nationkey nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as check | eval threshold = check * 0.0001000000 diff --git a/integ-test/src/integration/resources/tpch/q12.ppl b/integ-test/src/integration/resources/tpch/q12.ppl index cdb56c210..79672d844 100644 --- a/integ-test/src/integration/resources/tpch/q12.ppl +++ b/integ-test/src/integration/resources/tpch/q12.ppl @@ -30,7 +30,7 @@ order by */ source = orders -| join left = l right = r ON o_orderkey = l_orderkey lineitem +| join ON o_orderkey = l_orderkey lineitem | where l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_shipmode in ('MAIL', 'SHIP') diff --git a/integ-test/src/integration/resources/tpch/q13.ppl b/integ-test/src/integration/resources/tpch/q13.ppl index 59439e843..6e77c9b0a 100644 --- a/integ-test/src/integration/resources/tpch/q13.ppl +++ b/integ-test/src/integration/resources/tpch/q13.ppl @@ -23,8 +23,7 @@ order by source = [ source = customer - | left outer join left = l right = r - ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') + | left outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') orders | stats count(o_orderkey) as c_count by c_custkey ] as c_orders diff --git a/integ-test/src/integration/resources/tpch/q14.ppl b/integ-test/src/integration/resources/tpch/q14.ppl index e2b37e818..553f1e549 100644 --- a/integ-test/src/integration/resources/tpch/q14.ppl +++ b/integ-test/src/integration/resources/tpch/q14.ppl @@ -15,7 +15,7 @@ where */ source = lineitem -| join left = l right = r ON l_partkey = p_partkey +| join ON l_partkey = p_partkey AND l_shipdate >= date('1995-09-01') AND l_shipdate < date_add(date('1995-09-01'), interval 1 month) part diff --git a/integ-test/src/integration/resources/tpch/q15.ppl b/integ-test/src/integration/resources/tpch/q15.ppl index 201199095..96f5ecea2 100644 --- a/integ-test/src/integration/resources/tpch/q15.ppl +++ b/integ-test/src/integration/resources/tpch/q15.ppl @@ -33,7 +33,7 @@ order by // CTE is unsupported in PPL source = supplier -| join left = l right = revenue0 ON s_suppkey = supplier_no [ +| join right = revenue0 ON s_suppkey = supplier_no [ source = lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey diff --git a/integ-test/src/integration/resources/tpch/q16.ppl b/integ-test/src/integration/resources/tpch/q16.ppl index 0607bb79f..4c5765f04 100644 --- a/integ-test/src/integration/resources/tpch/q16.ppl +++ b/integ-test/src/integration/resources/tpch/q16.ppl @@ -32,7 +32,7 @@ order by */ source = partsupp -| join left = l right = r ON p_partkey = ps_partkey part +| join ON p_partkey = ps_partkey part | where p_brand != 'Brand#45' and not like(p_type, 'MEDIUM POLISHED%') and p_size in (49, 14, 23, 45, 19, 3, 36, 9) diff --git a/integ-test/src/integration/resources/tpch/q17.ppl b/integ-test/src/integration/resources/tpch/q17.ppl index fa7d63ca0..994b7ee18 100644 --- a/integ-test/src/integration/resources/tpch/q17.ppl +++ b/integ-test/src/integration/resources/tpch/q17.ppl @@ -19,7 +19,7 @@ where */ source = lineitem -| join left = l right = r ON p_partkey = l_partkey part +| join ON p_partkey = l_partkey part | where p_brand = 'Brand#23' and p_container = 'MED BOX' and l_quantity < [ diff --git a/integ-test/src/integration/resources/tpch/q18.ppl b/integ-test/src/integration/resources/tpch/q18.ppl index 9d540ee85..1dab3d473 100644 --- a/integ-test/src/integration/resources/tpch/q18.ppl +++ b/integ-test/src/integration/resources/tpch/q18.ppl @@ -35,8 +35,8 @@ limit 100 */ source = customer -| join left = l right = r ON c_custkey = o_custkey orders -| join left = l right = r ON o_orderkey = l_orderkey lineitem +| join ON c_custkey = o_custkey orders +| join ON o_orderkey = l_orderkey lineitem | where o_orderkey in [ source = lineitem | stats sum(l_quantity) as sum by l_orderkey diff --git a/integ-test/src/integration/resources/tpch/q19.ppl b/integ-test/src/integration/resources/tpch/q19.ppl index 97c1804cb..630d63bcc 100644 --- a/integ-test/src/integration/resources/tpch/q19.ppl +++ b/integ-test/src/integration/resources/tpch/q19.ppl @@ -37,8 +37,7 @@ where */ source = lineitem -| join left = l right = r - ON p_partkey = l_partkey +| join ON p_partkey = l_partkey and p_brand = 'Brand#12' and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') and l_quantity >= 1 and l_quantity <= 1 + 10 diff --git a/integ-test/src/integration/resources/tpch/q2.ppl b/integ-test/src/integration/resources/tpch/q2.ppl index 51c5f8e43..aa95d9d14 100644 --- a/integ-test/src/integration/resources/tpch/q2.ppl +++ b/integ-test/src/integration/resources/tpch/q2.ppl @@ -46,15 +46,15 @@ limit 100 */ source = part -| join left = l right = r ON p_partkey = ps_partkey partsupp -| join left = l right = r ON s_suppkey = ps_suppkey supplier -| join left = l right = r ON s_nationkey = n_nationkey nation -| join left = l right = r ON n_regionkey = r_regionkey region +| join ON p_partkey = ps_partkey partsupp +| join ON s_suppkey = ps_suppkey supplier +| join ON s_nationkey = n_nationkey nation +| join ON n_regionkey = r_regionkey region | where p_size = 15 AND like(p_type, '%BRASS') AND r_name = 'EUROPE' AND ps_supplycost = [ source = partsupp - | join left = l right = r ON s_suppkey = ps_suppkey supplier - | join left = l right = r ON s_nationkey = n_nationkey nation - | join left = l right = r ON n_regionkey = r_regionkey region + | join ON s_suppkey = ps_suppkey supplier + | join ON s_nationkey = n_nationkey nation + | join ON n_regionkey = r_regionkey region | where r_name = 'EUROPE' | stats MIN(ps_supplycost) ] diff --git a/integ-test/src/integration/resources/tpch/q20.ppl b/integ-test/src/integration/resources/tpch/q20.ppl index 6456d44e0..08bd21277 100644 --- a/integ-test/src/integration/resources/tpch/q20.ppl +++ b/integ-test/src/integration/resources/tpch/q20.ppl @@ -39,7 +39,7 @@ order by */ source = supplier -| join left = l right = r ON s_nationkey = n_nationkey nation +| join ON s_nationkey = n_nationkey nation | where n_name = 'CANADA' and s_suppkey in [ source = partsupp diff --git a/integ-test/src/integration/resources/tpch/q21.ppl b/integ-test/src/integration/resources/tpch/q21.ppl index 439903a37..0eb7149f6 100644 --- a/integ-test/src/integration/resources/tpch/q21.ppl +++ b/integ-test/src/integration/resources/tpch/q21.ppl @@ -42,14 +42,14 @@ limit 100 */ source = supplier -| join left = l right = l1 ON s_suppkey = l_suppkey lineitem -| join left = l right = r ON o_orderkey = l_orderkey orders -| join left = l right = r ON s_nationkey = n_nationkey nation +| join ON s_suppkey = l1.l_suppkey lineitem as l1 +| join ON o_orderkey = l1.l_orderkey orders +| join ON s_nationkey = n_nationkey nation | where o_orderstatus = 'F' - and l_receiptdate > l_commitdate + and l1.l_receiptdate > l1.l_commitdate and exists [ source = lineitem as l2 - | where l2.l_orderkey = l1.l_orderkey // Bug! `l2`.`l_orderkey` cannot be resolved + | where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey != l1.l_suppkey ] and not exists [ diff --git a/integ-test/src/integration/resources/tpch/q3.ppl b/integ-test/src/integration/resources/tpch/q3.ppl index 18e5902ea..0ece358ab 100644 --- a/integ-test/src/integration/resources/tpch/q3.ppl +++ b/integ-test/src/integration/resources/tpch/q3.ppl @@ -25,8 +25,8 @@ limit 10 */ source = customer -| join left = l right = r ON c_custkey = o_custkey orders -| join left = l right = r ON l_orderkey = o_orderkey lineitem +| join ON c_custkey = o_custkey orders +| join ON l_orderkey = o_orderkey lineitem | where c_mktsegment = 'BUILDING' AND o_orderdate < date('1995-03-15') AND l_shipdate > date('1995-03-15') | stats sum(l_extendedprice * (1 - l_discount)) as revenue by l_orderkey, o_orderdate, o_shippriority | sort - revenue, o_orderdate diff --git a/integ-test/src/integration/resources/tpch/q5.ppl b/integ-test/src/integration/resources/tpch/q5.ppl index 110012613..4761b0365 100644 --- a/integ-test/src/integration/resources/tpch/q5.ppl +++ b/integ-test/src/integration/resources/tpch/q5.ppl @@ -26,11 +26,11 @@ order by */ source = customer -| join left = l right = r ON c_custkey = o_custkey orders -| join left = l right = r ON l_orderkey = o_orderkey lineitem -| join left = l right = r ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey supplier -| join left = l right = r ON s_nationkey = n_nationkey nation -| join left = l right = r ON n_regionkey = r_regionkey region +| join ON c_custkey = o_custkey orders +| join ON l_orderkey = o_orderkey lineitem +| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey supplier +| join ON s_nationkey = n_nationkey nation +| join ON n_regionkey = r_regionkey region | where r_name = 'ASIA' AND o_orderdate >= date('1994-01-01') AND o_orderdate < date_add(date('1994-01-01'), interval 1 year) | stats sum(l_extendedprice * (1 - l_discount)) as revenue by n_name | sort - revenue \ No newline at end of file diff --git a/integ-test/src/integration/resources/tpch/q7.ppl b/integ-test/src/integration/resources/tpch/q7.ppl index 8d92d5145..ceda602b3 100644 --- a/integ-test/src/integration/resources/tpch/q7.ppl +++ b/integ-test/src/integration/resources/tpch/q7.ppl @@ -42,13 +42,13 @@ order by source = [ source = supplier - | join left = l right = r ON s_suppkey = l_suppkey lineitem - | join left = l right = r ON o_orderkey = l_orderkey orders - | join left = l right = r ON c_custkey = o_custkey customer - | join left = l right = n1 ON s_nationkey = n1.n_nationkey nation - | join left = n1 right = n2 ON c_nationkey = n2.n_nationkey nation + | join ON s_suppkey = l_suppkey lineitem + | join ON o_orderkey = l_orderkey orders + | join ON c_custkey = o_custkey customer + | join ON s_nationkey = n1.n_nationkey nation as n1 + | join ON c_nationkey = n2.n_nationkey nation as n2 | where l_shipdate between date('1995-01-01') and date('1996-12-31') - /* | where n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' NPE found */ + and n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' | eval supp_nation = n1.n_name, cust_nation = n2.n_name, l_year = year(l_shipdate), volume = l_extendedprice * (1 - l_discount) | fields supp_nation, cust_nation, l_year, volume ] as shipping diff --git a/integ-test/src/integration/resources/tpch/q8.ppl b/integ-test/src/integration/resources/tpch/q8.ppl index d2cd84ece..a73c7f7c3 100644 --- a/integ-test/src/integration/resources/tpch/q8.ppl +++ b/integ-test/src/integration/resources/tpch/q8.ppl @@ -40,13 +40,13 @@ order by source = [ source = part - | join left = l right = r ON p_partkey = l_partkey lineitem - | join left = l right = r ON s_suppkey = l_suppkey supplier - | join left = l right = r ON l_orderkey = o_orderkey orders - | join left = l right = r ON o_custkey = c_custkey customer - | join left = l right = r ON c_nationkey = n_nationkey nation - | join left = l right = r ON n_regionkey = r_regionkey region - | join left = l right = n2 ON s_nationkey = n2.n_nationkey nation + | join ON p_partkey = l_partkey lineitem + | join ON s_suppkey = l_suppkey supplier + | join ON l_orderkey = o_orderkey orders + | join ON o_custkey = c_custkey customer + | join ON c_nationkey = n1.n_nationkey nation as n1 + | join ON s_nationkey = n2.n_nationkey nation as n2 + | join ON n1.n_regionkey = r_regionkey region | where r_name = 'AMERICA' AND p_type = 'ECONOMY ANODIZED STEEL' and o_orderdate between date('1995-01-01') and date('1996-12-31') | eval o_year = year(o_orderdate) diff --git a/integ-test/src/integration/resources/tpch/q9.ppl b/integ-test/src/integration/resources/tpch/q9.ppl index e3be2e3c3..7692afd74 100644 --- a/integ-test/src/integration/resources/tpch/q9.ppl +++ b/integ-test/src/integration/resources/tpch/q9.ppl @@ -35,11 +35,11 @@ order by source = [ source = part - | join left = l right = r ON p_partkey = l_partkey lineitem - | join left = l right = r ON s_suppkey = l_suppkey supplier - | join left = l right = r ON ps_partkey = l_partkey and ps_suppkey = l_suppkey partsupp - | join left = l right = r ON o_orderkey = l_orderkey orders - | join left = l right = r ON s_nationkey = n_nationkey nation + | join ON p_partkey = l_partkey lineitem + | join ON s_suppkey = l_suppkey supplier + | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey partsupp + | join ON o_orderkey = l_orderkey orders + | join ON s_nationkey = n_nationkey nation | where like(p_name, '%green%') | eval nation = n_name | eval o_year = year(o_orderdate) diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/tpch/TPCHQueryBase.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/tpch/TPCHQueryBase.scala index ff708a13f..fb14210e9 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/tpch/TPCHQueryBase.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/tpch/TPCHQueryBase.scala @@ -172,6 +172,6 @@ trait TPCHQueryBase extends FlintPPLSuite { "q18", "q19", "q20", -// "q21", TODO exits subquery seems has an alias related bug. + "q21", "q22") }