From 854fa95967221f0a53f76d4b730d46ab39b3b171 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Sun, 5 May 2024 17:41:31 +0200 Subject: [PATCH] use row_ids for deletion vectors --- CMakeLists.txt | 2 +- duckdb | 2 +- extension_config.cmake | 12 +- src/functions/deltatable_scan.cpp | 275 +++++++++++------- src/include/functions/deltatable_scan.hpp | 20 +- test/sql/dat/basic_append.test | 17 +- .../sql/dat/test_custom_delta_scan_param.test | 14 +- test/sql/deltatable_with_dv.test | 17 +- 8 files changed, 221 insertions(+), 138 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5ae87e..daccecc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(EXTENSION_SOURCES ### Custom config # TODO: figure out if we really need this? if(APPLE) - set(PLATFORM_LIBS m c System resolv "-framework Corefoundation -framework SystemConfiguration") + set(PLATFORM_LIBS m c System resolv "-framework Corefoundation -framework SystemConfiguration -framework Security") elseif(UNIX) set(PLATFORM_LIBS m c resolv) elseif(WIN32) diff --git a/duckdb b/duckdb index 8e02018..9981777 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 8e02018889cd9f4563b05770c9a830701153140f +Subproject commit 99817776f058c8e35cbf9671ebb82657d5425e73 diff --git a/extension_config.cmake b/extension_config.cmake index 63977c1..7ef7cd6 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -8,9 +8,9 @@ duckdb_extension_load(deltatable # Any extra extensions that should be built duckdb_extension_load(httpfs) -duckdb_extension_load(aws - LOAD_TESTS - GIT_URL https://github.com/duckdb/duckdb_aws - GIT_TAG f7b8729f1cce5ada5d4add70e1486de50763fb97 - APPLY_PATCHES - ) \ No newline at end of file +#duckdb_extension_load(aws +# LOAD_TESTS +# GIT_URL https://github.com/duckdb/duckdb_aws +# GIT_TAG f7b8729f1cce5ada5d4add70e1486de50763fb97 +# APPLY_PATCHES +# ) \ No newline at end of file diff --git a/src/functions/deltatable_scan.cpp b/src/functions/deltatable_scan.cpp index 95ec34a..939cd66 100644 --- a/src/functions/deltatable_scan.cpp +++ b/src/functions/deltatable_scan.cpp @@ -32,12 +32,12 @@ static void* allocate_string(const struct ffi::KernelStringSlice slice) { static void visit_callback(ffi::NullableCvoid engine_context, const struct ffi::KernelStringSlice path, int64_t size, const ffi::DvInfo *dv_info, struct ffi::CStringMap *partition_values) { auto context = (DeltaTableSnapshot *) engine_context; - auto path_string = context->path + "/" + from_delta_string_slice(path); + auto path_string = context->GetPath() + "/" + from_delta_string_slice(path); -// printf("Fetch metadata for %s\n", path_string.c_str()); + printf("Fetch metadata for %s\n", path_string.c_str()); // First we append the file to our resolved files - context->resolved_files.push_back(path_string); + context->resolved_files.push_back(DeltaTableSnapshot::CleanPath(path_string)); context->metadata.push_back({}); D_ASSERT(context->resolved_files.size() == context->metadata.size()); @@ -48,7 +48,7 @@ static void visit_callback(ffi::NullableCvoid engine_context, const struct ffi:: // Fetch the deletion vector auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->table_client, context->global_state); - auto selection_vector = unpack_result_or_throw(selection_vector_res, "selection_vector_from_dv for path " + context->path); + auto selection_vector = unpack_result_or_throw(selection_vector_res, "selection_vector_from_dv for path " + context->GetPath()); if (selection_vector) { context->metadata.back().selection_vector = {selection_vector, ffi::drop_bool_slice}; } @@ -121,31 +121,24 @@ static ffi::EngineInterfaceBuilder* CreateBuilder(ClientContext &context, const return builder; } -DeltaTableSnapshot::DeltaTableSnapshot(ClientContext &context, const string &path) : MultiFileList({path}, FileGlobOptions::ALLOW_EMPTY) { - auto path_slice = to_delta_string_slice(path); - - auto interface_builder = CreateBuilder(context, path); - auto engine_interface_res = ffi::builder_build(interface_builder); - table_client = unpack_result_or_throw(engine_interface_res, "get_default_client in DeltaScanScanBind"); - - // Alternatively we can do the default client like so: -// auto table_client_res = ffi::get_default_client(path_slice, error_allocator); -// table_client = unpack_result_or_throw(table_client_res, "get_default_client in DeltaScanScanBind"); - - // Initialize Snapshot - auto snapshot_res = ffi::snapshot(path_slice, table_client); - snapshot = unpack_result_or_throw(snapshot_res, "snapshot in DeltaScanScanBind"); - - auto scan_res = ffi::scan(snapshot, table_client, nullptr); - scan = unpack_result_or_throw(scan_res, "scan in DeltaScanScanBind"); +DeltaTableSnapshot::DeltaTableSnapshot(ClientContext &context_p, const string &path) : MultiFileList({path}, FileGlobOptions::ALLOW_EMPTY), context(context_p) { +} - global_state = ffi::get_global_scan_state(scan); +string DeltaTableSnapshot::GetPath() { + return GetPaths()[0]; +} - // Set version - this->version = ffi::version(snapshot); +string DeltaTableSnapshot::CleanPath(const string &raw_path) { + if (StringUtil::StartsWith(raw_path, "file://")) { + return raw_path.substr(7); + } + return raw_path; } void DeltaTableSnapshot::Bind(vector &return_types, vector &names) { + if (!initialized) { + InitializeFiles(); + } auto schema = SchemaVisitor::VisitSnapshotSchema(snapshot); for (const auto &field: *schema) { names.push_back(field.first); @@ -169,11 +162,22 @@ string DeltaTableSnapshot::GetFile(idx_t i) { } while(i >= resolved_files.size()) { + auto size_before = resolved_files.size(); + auto have_scan_data_res = ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visit_data); + + // TODO: weird workaround required to not get "Json error: Encountered unexpected 'c' whilst parsing value" + if (have_scan_data_res.tag == ffi::ExternResult::Tag::Err) { + if (have_scan_data_res.err._0) { + files_exhausted = true; + return ""; + } + } + auto have_scan_data = unpack_result_or_throw(have_scan_data_res, "kernel_scan_data_next in DeltaTableSnapshot GetFile"); // TODO: shouldn't the kernel always return false here? - if (!have_scan_data || resolved_files.size() == i) { + if (!have_scan_data || resolved_files.size() == size_before) { files_exhausted = true; return ""; } @@ -188,13 +192,36 @@ string DeltaTableSnapshot::GetFile(idx_t i) { } void DeltaTableSnapshot::InitializeFiles() { + auto path_slice = to_delta_string_slice(paths[0]); + + auto interface_builder = CreateBuilder(context, paths[0]); + auto engine_interface_res = ffi::builder_build(interface_builder); + table_client = unpack_result_or_throw(engine_interface_res, "get_default_client in DeltaScanScanBind"); + + // Alternatively we can do the default client like so: +// auto table_client_res = ffi::get_default_client(path_slice, error_allocator); +// table_client = unpack_result_or_throw(table_client_res, "get_default_client in DeltaScanScanBind"); + + // Initialize Snapshot + auto snapshot_res = ffi::snapshot(path_slice, table_client); + snapshot = unpack_result_or_throw(snapshot_res, "snapshot in DeltaScanScanBind"); + PredicateVisitor visitor(names, &table_filters); + auto scan_res = ffi::scan(snapshot, table_client, &visitor); + scan = unpack_result_or_throw(scan_res, "scan in DeltaScanScanBind"); + + global_state = ffi::get_global_scan_state(scan); + + // Set version + this->version = ffi::version(snapshot); + auto scan_iterator_res = ffi::kernel_scan_data_init(table_client, scan); scan_data_iterator = { unpack_result_or_throw(scan_iterator_res, "kernel_scan_data_init in InitFiles"), ffi::kernel_scan_data_free }; + initialized = true; } @@ -256,24 +283,33 @@ bool DeltaMultiFileReader::Bind(MultiFileReaderOptions &options, MultiFileList & delta_table_snapshot.Bind(return_types, names); + // If deletion vector present, we need to force the parquet readers to emit row-ids and pass the snapshot through + // the custom bind data + bind_data.file_row_number_idx = names.size(); + bind_data.multi_file_reader_needs_file_row_number = true; + return true; }; void DeltaMultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, vector &names, MultiFileReaderBindData& bind_data) { + + // Disable all other multifilereader options + options.auto_detect_hive_partitioning = false; + options.hive_partitioning = false; + options.union_by_name = false; + MultiFileReader::BindOptions(options, files, return_types, names, bind_data); - //! TODO figure out state passing -// auto custom_bind_data = make_uniq(dynamic_cast(files)); -// -// auto demo_gen_col_opt = options.custom_options.find("delta_file_number"); -// if (demo_gen_col_opt != options.custom_options.end()) { -// custom_bind_data->file_number_column_idx = names.size(); -// names.push_back("delta_file_number"); -// return_types.push_back(LogicalType::UBIGINT); -// } -// -// bind_data.custom_data = std::move(custom_bind_data); + auto demo_gen_col_opt = options.custom_options.find("delta_file_number"); + if (demo_gen_col_opt != options.custom_options.end()) { + if (demo_gen_col_opt->second.GetValue()) { + D_ASSERT(bind_data.custom_data.find("file_number_column_idx") == bind_data.custom_data.end()); + bind_data.custom_data["file_number_column_idx"] = Value::UBIGINT(names.size()); + names.push_back("delta_file_number"); + return_types.push_back(LogicalType::UBIGINT); + } + } } void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, @@ -283,29 +319,48 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio ClientContext &context) { MultiFileReader::FinalizeBind(file_options, options, filename, local_names, global_types, global_names, global_column_ids, reader_data, context); + // Handle custom delta option set in MultiFileReaderOptions::custom_options with data passed through in MultiFileReaderBindData::custom_data + auto file_number_opt = file_options.custom_options.find("delta_file_number"); + if (file_number_opt != file_options.custom_options.end()) { + if (file_number_opt->second.GetValue()) { + auto maybe_file_number_column = options.custom_data.find("file_number_column_idx"); + D_ASSERT(maybe_file_number_column != options.custom_data.end()); + auto file_number_column_idx = maybe_file_number_column->second.GetValue(); + D_ASSERT(file_number_column_idx != DConstants::INVALID_INDEX); + + // TODO: we have the metadata for the file available here already, the reason we handle this in FinalizeChunk + // is purely for demonstration purposes + reader_data.constant_map.emplace_back(file_number_column_idx, Value::UBIGINT(0)); + } + } - // TODO figure out state passing -// if (options.custom_data) { -// auto &custom_bind_data = dynamic_cast(*options.custom_data); -// if (custom_bind_data.file_number_column_idx != DConstants::INVALID_INDEX) { -// // TODO: remove the need for a placeholder here? -// reader_data.constant_map.emplace_back(custom_bind_data.file_number_column_idx, Value::UBIGINT(0)); -// } -// -// // Add any constants from the Delta metadata to the reader partition map -// auto file_metadata = custom_bind_data.current_snapshot.metadata.find(filename); -// if (file_metadata != custom_bind_data.current_snapshot.metadata.end() && !file_metadata->second.partition_map.empty()) { -// for (idx_t i = 0; i < global_column_ids.size(); i++) { -// column_t col_id = global_column_ids[i]; -// auto col_partition_entry = file_metadata->second.partition_map.find(global_names[col_id]); -// if (col_partition_entry != file_metadata->second.partition_map.end()) { -// // Todo: use https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization -// auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[i]); -// reader_data.constant_map.emplace_back(i, maybe_value); -// } -// } -// } -// } + // Get the metadata for this file + D_ASSERT(reader_data.file_metadata.file_list); + const auto &snapshot = dynamic_cast(*reader_data.file_metadata.file_list); + auto &file_metadata = snapshot.metadata[reader_data.file_metadata.file_list_idx]; + + if (!file_metadata.partition_map.empty()) { + for (idx_t i = 0; i < global_column_ids.size(); i++) { + column_t col_id = global_column_ids[i]; + auto col_partition_entry = file_metadata.partition_map.find(global_names[col_id]); + if (col_partition_entry != file_metadata.partition_map.end()) { + // Todo: use https://github.com/delta-io/delta/blob/master/PROTOCOL.md#partition-value-serialization + auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(global_types[i]); + reader_data.constant_map.emplace_back(i, maybe_value); + } + } + } +} + +void DeltaMultiFileReader::CreateMapping(const string &file_name, const vector &local_types, + const vector &local_names, const vector &global_types, + const vector &global_names, const vector &global_column_ids, + optional_ptr filters, MultiFileReaderData &reader_data, + const string &initial_file) { + + MultiFileReader::CreateNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, + initial_file); + MultiFileReader::CreateFilterMap(global_types, filters, reader_data); } unique_ptr DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector& paths, FileGlobOptions options) { @@ -316,23 +371,26 @@ unique_ptr DeltaMultiFileReader::CreateFileList(ClientContext &co return make_uniq(context, paths[0]); } -static SelectionVector DuckSVFromDeltaSV(ffi::KernelBoolSlice *dv, idx_t offset, idx_t count, idx_t &select_count, idx_t &skip_count) { - auto max_count = MinValue(count, dv->len - offset); - SelectionVector result {max_count}; +// Generate the correct Selection Vector Based on the Raw delta KernelBoolSlice dv and the row_id_column +// TODO: benchmark this? +static SelectionVector DuckSVFromDeltaSV(ffi::KernelBoolSlice *dv, Vector row_id_column, idx_t count, idx_t &select_count) { + D_ASSERT(row_id_column.GetType() == LogicalType::BIGINT); -// print_selection_vector(" cur: ", dv); + UnifiedVectorFormat data; + row_id_column.ToUnifiedFormat(count, data); + auto row_ids = UnifiedVectorFormat::GetData(data); + SelectionVector result {count}; idx_t current_select = 0; - for (idx_t i = 0; i < max_count; i++) { - if (dv->ptr[i + offset]) { + for (idx_t i = 0; i < count; i++) { + auto row_id = row_ids[data.sel->get_index(i)]; + if (dv->ptr[row_id]) { result.data()[current_select] = i; current_select++; } } - select_count = current_select; - skip_count = max_count - select_count; -// result.Print(select_count); + select_count = current_select; return result; } @@ -341,52 +399,55 @@ void DeltaMultiFileReader::FinalizeChunk(ClientContext &context, const MultiFile const MultiFileReaderData &reader_data, DataChunk &chunk) { // Base class finalization first MultiFileReader::FinalizeChunk(context, bind_data, reader_data, chunk); + chunk.Print(); - // TODO: Fix by passing MultiFileList? -// if (bind_data.custom_data) { -// auto &custom_bind_data = dynamic_cast(*bind_data.custom_data); -// auto &metadata = custom_bind_data.current_snapshot.metadata[bind_data.filename_idx]; -// -// if (metadata.selection_vector.get() && chunk.size() != 0) { -// // Handle deletion vector -// idx_t select_count, skip_count; -// auto sv = DuckSVFromDeltaSV(metadata.selection_vector.get(), metadata.current_selection_vector_offset, STANDARD_VECTOR_SIZE, select_count, skip_count); -// metadata.current_selection_vector_offset += select_count + skip_count; -// chunk.Slice(sv, select_count); -// } -// -// // Note: this demo function shows how we can use DuckDB's Binder create expression-based generated columns -// if (custom_bind_data.file_number_column_idx != DConstants::INVALID_INDEX) { -// -// //! Create Dummy expression (0 + file_number) -// vector> child_expr; -// child_expr.push_back(make_uniq(Value::UBIGINT(0))); -// child_expr.push_back(make_uniq(Value::UBIGINT(metadata.file_number))); -// unique_ptr expr = make_uniq("+", std::move(child_expr), nullptr, nullptr, false, true); -// -// //! s dummy expression -// auto binder = Binder::CreateBinder(context); -// ExpressionBinder expr_binder(*binder, context); -// auto bound_expr = expr_binder.Bind(expr, nullptr); -// -// //! Execute dummy expression into result column -// ExpressionExecutor expr_executor(context); -// expr_executor.AddExpression(*bound_expr); -// -// //! Execute the expression directly into the output Chunk -// expr_executor.ExecuteExpression(chunk.data[custom_bind_data.file_number_column_idx]); -// } -// } + D_ASSERT(reader_data.file_metadata.file_list); + + // Get the metadata for this file + const auto &snapshot = dynamic_cast(*reader_data.file_metadata.file_list); + auto &metadata = snapshot.metadata[reader_data.file_metadata.file_list_idx]; + + if (metadata.selection_vector.get() && chunk.size() != 0) { + idx_t select_count; + + // Construct the selection vector using the file_row_number column and the raw selection vector from delta + auto sv = DuckSVFromDeltaSV(metadata.selection_vector.get(), chunk.data[bind_data.file_row_number_idx], chunk.size(), select_count); + + // Slice the result + chunk.Slice(sv, select_count); + } + + // Note: this demo function shows how we can use DuckDB's Binder create expression-based generated columns + auto maybe_file_number_column_idx = bind_data.custom_data.find("file_number_column_idx"); + if (maybe_file_number_column_idx != bind_data.custom_data.end()) { + auto file_number_column_idx = maybe_file_number_column_idx->second.GetValue(); + //! Create Dummy expression (0 + file_number) + vector> child_expr; + child_expr.push_back(make_uniq(Value::UBIGINT(0))); + child_expr.push_back(make_uniq(Value::UBIGINT(metadata.file_number))); + unique_ptr expr = make_uniq("+", std::move(child_expr), nullptr, nullptr, false, true); + + //! s dummy expression + auto binder = Binder::CreateBinder(context); + ExpressionBinder expr_binder(*binder, context); + auto bound_expr = expr_binder.Bind(expr, nullptr); + + //! Execute dummy expression into result column + ExpressionExecutor expr_executor(context); + expr_executor.AddExpression(*bound_expr); + + //! Execute the expression directly into the output Chunk + expr_executor.ExecuteExpression(chunk.data[file_number_column_idx]); + } }; bool DeltaMultiFileReader::ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, ClientContext &context) { auto loption = StringUtil::Lower(key); - // TODO: Fix by adding custom options? -// if (loption == "delta_file_number") { -// options.custom_options[loption] = val; -// return true; -// } + if (loption == "delta_file_number") { + options.custom_options[loption] = val; + return true; + } return MultiFileReader::ParseOption(key, val, options, context); } diff --git a/src/include/functions/deltatable_scan.hpp b/src/include/functions/deltatable_scan.hpp index f7d8e62..0f7d0b8 100644 --- a/src/include/functions/deltatable_scan.hpp +++ b/src/include/functions/deltatable_scan.hpp @@ -26,6 +26,8 @@ struct DeltaFileMetaData { //! The DeltaTableSnapshot implements the MultiFileList API to allow injecting it into the regular DuckDB parquet scan struct DeltaTableSnapshot : public MultiFileList { DeltaTableSnapshot(ClientContext &context, const string &path); + string GetPath(); + static string CleanPath(const string &raw_path); //! MultiFileList API public: @@ -47,8 +49,6 @@ struct DeltaTableSnapshot : public MultiFileList { // TODO: change back to protected public: - //! Table Info - string path; idx_t version; //! Delta Kernel Structures @@ -69,13 +69,15 @@ struct DeltaTableSnapshot : public MultiFileList { bool files_exhausted = false; vector resolved_files; TableFilterSet table_filters; + + ClientContext &context; }; struct DeltaMultiFileReader : public MultiFileReader { static unique_ptr CreateInstance(); //! Return a DeltaTableSnapshot unique_ptr CreateFileList(ClientContext &context, const vector &paths, - FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY) override; + FileGlobOptions options) override; //! Override the regular parquet bind using the MultiFileReader Bind. The bind from these are what DuckDB's file //! readers will try read @@ -91,6 +93,7 @@ struct DeltaMultiFileReader : public MultiFileReader { const vector &global_types, const vector &global_names, const vector &global_column_ids, MultiFileReaderData &reader_data, ClientContext &context) override; + //! Override the FinalizeChunk method void FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data, const MultiFileReaderData &reader_data, DataChunk &chunk) override; @@ -100,15 +103,4 @@ struct DeltaMultiFileReader : public MultiFileReader { ClientContext &context) override; }; -//struct DeltaMultiFileReaderBindData { -// -// DeltaMultiFileReaderBindData(DeltaTableSnapshot& delta_table_snapshot); -// -// //! The current MultiFileList -// DeltaTableSnapshot& current_snapshot; -// -// //! Bind data for demo generated column option -// idx_t file_number_column_idx = DConstants::INVALID_INDEX; -//}; - } // namespace duckdb diff --git a/test/sql/dat/basic_append.test b/test/sql/dat/basic_append.test index eb2d06a..a69891f 100644 --- a/test/sql/dat/basic_append.test +++ b/test/sql/dat/basic_append.test @@ -14,11 +14,13 @@ require-env DAT_AVAILABLE # - part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet # - contains letters d,e +mode skip + # TODO Missing types: double # Query the whole table query II SELECT letter, number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') ---- d 4 e 5 @@ -28,7 +30,7 @@ c 3 query I SELECT letter -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') ---- d e @@ -38,7 +40,7 @@ c query I SELECT number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') ---- 4 5 @@ -51,23 +53,26 @@ FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated # Now we add a filter that filters out one of the files query II SELECT letter, number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') WHERE number < 2 ---- a 1 +mode unskip # Now we add a filter that filters out the other file query II SELECT letter, number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') WHERE number > 4 ---- e 5 +mode skip + # Now we add a filter that filters out all columns query II SELECT letter, number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') WHERE number > 6 ---- diff --git a/test/sql/dat/test_custom_delta_scan_param.test b/test/sql/dat/test_custom_delta_scan_param.test index c804d61..ca484dd 100644 --- a/test/sql/dat/test_custom_delta_scan_param.test +++ b/test/sql/dat/test_custom_delta_scan_param.test @@ -14,10 +14,22 @@ require-env DAT_AVAILABLE # - part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet # - contains letters d,e +query II +SELECT letter, number +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +---- +d 4 +e 5 +a 1 +b 2 +c 3 + +mode skip + # Demo delta_file_number parameter (i.e. Delta extension provided) query III SELECT letter, number, delta_file_number -FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1) +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1) ---- d 4 0 e 5 0 diff --git a/test/sql/deltatable_with_dv.test b/test/sql/deltatable_with_dv.test index e2faf4d..e8df89a 100644 --- a/test/sql/deltatable_with_dv.test +++ b/test/sql/deltatable_with_dv.test @@ -8,7 +8,7 @@ require deltatable # Simplest example query I -SELECT * FROM delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small') +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/table-with-dv-small/') ---- 1 2 @@ -19,4 +19,17 @@ SELECT * FROM delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small' 7 8 -# TODO: test with laaarge data with dv's \ No newline at end of file +# With filter: ensures the deletion vector is applied properly on top of pushed down filters +query I +FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/table-with-dv-small/') +WHERE value > 3 +---- +4 +5 +6 +7 +8 + +# TODO: test with laaarge data with dv's +# TODO: test with delta_file_number option +# TODO: test with file_row_number option: ensure we don't enable the extra DataChunk reference step in the parquet scanner \ No newline at end of file