diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
index 5814258..956d7d0 100644
--- a/scripts/generate_test_data.py
+++ b/scripts/generate_test_data.py
@@ -6,6 +6,29 @@ BASE_PATH = "./data/generated/"
 
 
 
+# Query to deal with our currently not-implemented types
+modified_lineitem_query = """
+SELECT
+    l_orderkey,
+    l_partkey,
+    l_suppkey,
+    l_linenumber,
+    (l_quantity*100)::INTEGER as l_quantity,
+    (l_extendedprice*100)::INTEGER as l_extendedprice,
+    (l_discount*100)::INTEGER as l_discount,
+    (l_tax*100)::INTEGER as l_tax,
+    l_returnflag,
+    l_linestatus,
+    l_shipdate::VARCHAR as l_shipdate,
+    l_commitdate::VARCHAR as l_commitdate,
+    l_receiptdate::VARCHAR as l_receiptdate,
+    l_shipinstruct,
+    l_shipmode,
+    l_comment
+FROM
+    lineitem
+"""
+
 def delete_old_files():
     if (os.path.isdir(BASE_PATH)):
         shutil.rmtree(BASE_PATH)
@@ -38,23 +61,38 @@ def generate_test_data(path, query, part_column=False):
     else:
         con.sql(f"COPY test_table to '{generated_path}/duckdb/data.parquet' (FORMAT parquet)")
 
-delete_old_files()
-
-### Simple partitioned table
-query = "CREATE table test_table AS SELECT i, i%2 as part from range(0,10) tbl(i);"
-generate_test_data("simple_partitioned", query, "part")
-
-### Lineitem SF0.01 No partitions
-query = "call dbgen(sf=0.01);"
-query += "CREATE table test_table AS SELECT * as part from lineitem;"
-generate_test_data("lineitem_sf0_01", query)
-
-### Lineitem SF0.01 10 Partitions
-query = "call dbgen(sf=0.01);"
-query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
-generate_test_data("lineitem_sf0_01_10part", query, "part")
+# delete_old_files()
+#
+# ### Simple partitioned table
+# query = "CREATE table test_table AS SELECT i, i%2 as part from range(0,10) tbl(i);"
+# generate_test_data("simple_partitioned", query, "part")
+#
+# ### Lineitem SF0.01 No partitions
+# query = "call dbgen(sf=0.01);"
+# query += "CREATE table test_table AS SELECT * as part from lineitem;"
+# generate_test_data("lineitem_sf0_01", query)
+#
+# ### Lineitem SF0.01 10 Partitions
+# query = "call dbgen(sf=0.01);"
+# query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
+# generate_test_data("lineitem_sf0_01_10part", query, "part")
+#
+# ### Lineitem SF1 10 Partitions
+# query = "call dbgen(sf=1);"
+# query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
+# generate_test_data("lineitem_sf1_10part", query, "part")
+#
+# ### Lineitem_modified SF0.01
+# query = "call dbgen(sf=0.01);"
+# query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
+# generate_test_data("lineitem_modified_sf0.01", query, "part")
+#
+# ### Lineitem_modified SF1
+# query = "call dbgen(sf=1);"
+# query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
+# generate_test_data("lineitem_modified_sf1", query, "part")
 
-### Lineitem SF1 10 Partitions
-query = "call dbgen(sf=1);"
-query += "CREATE table test_table AS SELECT *, l_orderkey%10 as part from lineitem;"
-generate_test_data("lineitem_sf1_10part", query, "part")
\ No newline at end of file
+### Lineitem_modified SF10
+query = "call dbgen(sf=10);"
+query += f"CREATE table test_table AS SELECT *, l_orderkey%10 as part from ({modified_lineitem_query});"
+generate_test_data("lineitem_modified_sf10", query, "part")
\ No newline at end of file
diff --git a/src/functions/deltatable_scan.cpp b/src/functions/deltatable_scan.cpp
index 61cf66f..7c606b4 100644
--- a/src/functions/deltatable_scan.cpp
+++ b/src/functions/deltatable_scan.cpp
@@ -202,8 +202,10 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio
     // Add any constants from the Delta metadata to the reader partition map
     auto file_metadata = custom_bind_data.current_snapshot.metadata.find(filename);
     if (file_metadata != custom_bind_data.current_snapshot.metadata.end() && !file_metadata->second.partition_map.empty()) {
-        for (idx_t i = 0; i < global_names.size(); i++) {
-            auto col_partition_entry = file_metadata->second.partition_map.find(global_names[i]);
+        for (idx_t i = 0; i < global_column_ids.size(); i++) {
+            // i is the position within the projection; col_id indexes the global schema (global_names/global_types)
+            column_t col_id = global_column_ids[i];
+            auto col_partition_entry = file_metadata->second.partition_map.find(global_names[col_id]);
             if (col_partition_entry != file_metadata->second.partition_map.end()) {
                 // Todo: use https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization
-                auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[i]);
+                auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[col_id]);
diff --git a/test/sql/delta_scan_generated.test b/test/sql/delta_scan_generated.test
index eb0be43..4f4217d 100644
--- a/test/sql/delta_scan_generated.test
+++ b/test/sql/delta_scan_generated.test
@@ -8,12 +8,28 @@ require deltatable
 
 require-env GENERATED_DATA_AVAILABLE
 
-mode output_result
+query I rowsort q1
+SELECT
+    part, sum(l_extendedprice * l_discount) AS revenue
+FROM
+    delta_scan('data/generated/lineitem_modified_sf10/delta_lake')
+WHERE
+    l_shipdate::date >= CAST('1994-01-01' AS date)
+    AND l_shipdate::date < CAST('1995-01-01' AS date)
+    AND l_discount BETWEEN 5 AND 7
+    AND l_quantity < 2400
+GROUP BY part;
+----
 
-# This one is ok
-#statement ok
-#from delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small');
-
-# But this one is not; likely due to some unsupported type in metadata?
-#statement ok
-#from delta_scan('data/generated/lineitem_sf0_01/delta_lake') limit 10
+query I rowsort q1
+SELECT
+    part, sum(l_extendedprice * l_discount) AS revenue
+FROM
+    parquet_scan('data/generated/lineitem_modified_sf10/duckdb/**/*.parquet')
+WHERE
+    l_shipdate::date >= CAST('1994-01-01' AS date)
+    AND l_shipdate::date < CAST('1995-01-01' AS date)
+    AND l_discount BETWEEN 5 AND 7
+    AND l_quantity < 2400
+GROUP BY part;
+----