Skip to content

Commit

Permalink
add a hefty local benchmark/test
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed Apr 18, 2024
1 parent 0c2ab25 commit e0b5216
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 29 deletions.
76 changes: 57 additions & 19 deletions scripts/generate_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,29 @@

BASE_PATH = "./data/generated/"

# Query to deal with our currently not-implemented types.
#
# Projection over the `lineitem` table that rewrites some columns:
#   * l_quantity, l_extendedprice, l_discount and l_tax are multiplied by 100
#     and cast to INTEGER;
#   * l_shipdate, l_commitdate and l_receiptdate are cast to VARCHAR;
#   * every other column is passed through unchanged.
# NOTE(review): presumably the unsupported types are DECIMAL and DATE - confirm.
#
# The statement deliberately has no trailing semicolon: this script embeds it
# as a subquery via `from ({modified_lineitem_query})`.
modified_lineitem_query = """
SELECT
l_orderkey,
l_partkey,
l_suppkey,
l_linenumber,
(l_quantity*100)::INTEGER as l_quantity,
(l_extendedprice*100)::INTEGER as l_extendedprice,
(l_discount*100)::INTEGER as l_discount,
(l_tax*100)::INTEGER as l_tax,
l_returnflag,
l_linestatus,
l_shipdate::VARCHAR as l_shipdate,
l_commitdate::VARCHAR as l_commitdate,
l_receiptdate::VARCHAR as l_receiptdate,
l_shipinstruct,
l_shipmode,
l_comment
FROM
lineitem
"""

def delete_old_files():
if (os.path.isdir(BASE_PATH)):
shutil.rmtree(BASE_PATH)
Expand Down Expand Up @@ -38,23 +61,38 @@ def generate_test_data(path, query, part_column=False):
else:
con.sql(f"COPY test_table to '{generated_path}/duckdb/data.parquet' (FORMAT parquet)")

delete_old_files()

### Simple partitioned table
query = "CREATE table test_table AS SELECT i, i%2 as part from range(0,10) tbl(i);"
generate_test_data("simple_partitioned", query, "part")

### Lineitem SF0.01 No partitions
query = "call dbgen(sf=0.01);"
query += "CREATE table test_table AS SELECT * as part from lineitem;"
generate_test_data("lineitem_sf0_01", query)

### Lineitem SF0.01 10 Partitions
query = "call dbgen(sf=0.01);"
query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
generate_test_data("lineitem_sf0_01_10part", query, "part")
# delete_old_files()
#
# ### Simple partitioned table
# query = "CREATE table test_table AS SELECT i, i%2 as part from range(0,10) tbl(i);"
# generate_test_data("simple_partitioned", query, "part")
#
# ### Lineitem SF0.01 No partitions
# query = "call dbgen(sf=0.01);"
# query += "CREATE table test_table AS SELECT * as part from lineitem;"
# generate_test_data("lineitem_sf0_01", query)
#
# ### Lineitem SF0.01 10 Partitions
# query = "call dbgen(sf=0.01);"
# query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
# generate_test_data("lineitem_sf0_01_10part", query, "part")
#
# ### Lineitem SF1 10 Partitions
# query = "call dbgen(sf=1);"
# query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
# generate_test_data("lineitem_sf1_10part", query, "part")
#
# ### Lineitem_modified SF0.01
# query = "call dbgen(sf=0.01);"
# query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
# generate_test_data("lineitem_modified_sf0.01", query, "part")
#
# ### Lineitem_modified SF1
# query = "call dbgen(sf=1);"
# query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
# generate_test_data("lineitem_modified_sf1", query, "part")

### Lineitem SF1 10 Partitions
query = "call dbgen(sf=1);"
query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
generate_test_data("lineitem_sf1_10part", query, "part")
### Lineitem_modified SF10
# Calls dbgen with sf=10, then creates test_table from the
# modified_lineitem_query projection plus a `part` column computed as
# l_orderkey % 10.
# "part" is passed as the third argument (part_column) of generate_test_data;
# the dataset is written under the name "lineitem_modified_sf10".
query = "call dbgen(sf=10);"
query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
generate_test_data("lineitem_modified_sf10", query, "part")
5 changes: 3 additions & 2 deletions src/functions/deltatable_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,9 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio
// Add any constants from the Delta metadata to the reader partition map
auto file_metadata = custom_bind_data.current_snapshot.metadata.find(filename);
if (file_metadata != custom_bind_data.current_snapshot.metadata.end() && !file_metadata->second.partition_map.empty()) {
for (idx_t i = 0; i < global_names.size(); i++) {
auto col_partition_entry = file_metadata->second.partition_map.find(global_names[i]);
for (idx_t i = 0; i < global_column_ids.size(); i++) {
column_t col_id = global_column_ids[i];
auto col_partition_entry = file_metadata->second.partition_map.find(global_names[col_id]);
if (col_partition_entry != file_metadata->second.partition_map.end()) {
// Todo: use https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization
auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[i]);
Expand Down
32 changes: 24 additions & 8 deletions test/sql/delta_scan_generated.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,28 @@ require deltatable

require-env GENERATED_DATA_AVAILABLE

mode output_result
query I rowsort q1
SELECT
part, sum(l_extendedprice * l_discount) AS revenue
FROM
delta_scan('data/generated/lineitem_modified_sf10/delta_lake')
WHERE
l_shipdate::date >= CAST('1994-01-01' AS date)
AND l_shipdate::date < CAST('1995-01-01' AS date)
AND l_discount BETWEEN 5 AND 7
AND l_quantity < 2400
GROUP BY part;
----

# This one is ok
#statement ok
#from delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small');

# But this one is not; likely due to some unsupported type in metadata?
#statement ok
#from delta_scan('data/generated/lineitem_sf0_01/delta_lake') limit 10
query I rowsort q1
SELECT
part, sum(l_extendedprice * l_discount) AS revenue
FROM
parquet_scan('data/generated/lineitem_modified_sf10/duckdb/**/*.parquet')
WHERE
l_shipdate::date >= CAST('1994-01-01' AS date)
AND l_shipdate::date < CAST('1995-01-01' AS date)
AND l_discount BETWEEN 5 AND 7
AND l_quantity < 2400
GROUP BY part;
----

0 comments on commit e0b5216

Please sign in to comment.