Skip to content

Commit

Permalink
use row_ids for deletion vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed May 5, 2024
1 parent 073b643 commit 854fa95
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 138 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ set(EXTENSION_SOURCES
### Custom config
# TODO: figure out if we really need this?
if(APPLE)
set(PLATFORM_LIBS m c System resolv "-framework Corefoundation -framework SystemConfiguration")
set(PLATFORM_LIBS m c System resolv "-framework Corefoundation -framework SystemConfiguration -framework Security")
elseif(UNIX)
set(PLATFORM_LIBS m c resolv)
elseif(WIN32)
Expand Down
12 changes: 6 additions & 6 deletions extension_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ duckdb_extension_load(deltatable

# Any extra extensions that should be built
duckdb_extension_load(httpfs)
duckdb_extension_load(aws
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_aws
GIT_TAG f7b8729f1cce5ada5d4add70e1486de50763fb97
APPLY_PATCHES
)
#duckdb_extension_load(aws
# LOAD_TESTS
# GIT_URL https://github.com/duckdb/duckdb_aws
# GIT_TAG f7b8729f1cce5ada5d4add70e1486de50763fb97
# APPLY_PATCHES
# )
275 changes: 168 additions & 107 deletions src/functions/deltatable_scan.cpp

Large diffs are not rendered by default.

20 changes: 6 additions & 14 deletions src/include/functions/deltatable_scan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ struct DeltaFileMetaData {
//! The DeltaTableSnapshot implements the MultiFileList API to allow injecting it into the regular DuckDB parquet scan
struct DeltaTableSnapshot : public MultiFileList {
DeltaTableSnapshot(ClientContext &context, const string &path);
string GetPath();
static string CleanPath(const string &raw_path);

//! MultiFileList API
public:
Expand All @@ -47,8 +49,6 @@ struct DeltaTableSnapshot : public MultiFileList {

// TODO: change back to protected
public:
//! Table Info
string path;
idx_t version;

//! Delta Kernel Structures
Expand All @@ -69,13 +69,15 @@ struct DeltaTableSnapshot : public MultiFileList {
bool files_exhausted = false;
vector<string> resolved_files;
TableFilterSet table_filters;

ClientContext &context;
};

struct DeltaMultiFileReader : public MultiFileReader {
static unique_ptr<MultiFileReader> CreateInstance();
//! Return a DeltaTableSnapshot
unique_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
FileGlobOptions options = FileGlobOptions::DISALLOW_EMPTY) override;
FileGlobOptions options) override;

//! Override the regular parquet bind using the MultiFileReader Bind. The binds from these are what DuckDB's file
//! readers will try to read
Expand All @@ -91,6 +93,7 @@ struct DeltaMultiFileReader : public MultiFileReader {
const vector<LogicalType> &global_types, const vector<string> &global_names,
const vector<column_t> &global_column_ids, MultiFileReaderData &reader_data,
ClientContext &context) override;

//! Override the FinalizeChunk method
void FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data,
const MultiFileReaderData &reader_data, DataChunk &chunk) override;
Expand All @@ -100,15 +103,4 @@ struct DeltaMultiFileReader : public MultiFileReader {
ClientContext &context) override;
};

//struct DeltaMultiFileReaderBindData {
//
// DeltaMultiFileReaderBindData(DeltaTableSnapshot& delta_table_snapshot);
//
// //! The current MultiFileList
// DeltaTableSnapshot& current_snapshot;
//
// //! Bind data for demo generated column option
// idx_t file_number_column_idx = DConstants::INVALID_INDEX;
//};

} // namespace duckdb
17 changes: 11 additions & 6 deletions test/sql/dat/basic_append.test
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ require-env DAT_AVAILABLE
# - part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet
# - contains letters d,e

mode skip

# TODO Missing types: double
# Query the whole table
query II
SELECT letter, number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
----
d 4
e 5
Expand All @@ -28,7 +30,7 @@ c 3

query I
SELECT letter
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
----
d
e
Expand All @@ -38,7 +40,7 @@ c

query I
SELECT number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
----
4
5
Expand All @@ -51,23 +53,26 @@ FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated
# Now we add a filter that filters out one of the files
query II
SELECT letter, number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
WHERE number < 2
----
a 1

mode unskip

# Now we add a filter that filters out the other file
query II
SELECT letter, number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
WHERE number > 4
----
e 5

mode skip

# Now we add a filter that filters out all rows
query II
SELECT letter, number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----
14 changes: 13 additions & 1 deletion test/sql/dat/test_custom_delta_scan_param.test
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,22 @@ require-env DAT_AVAILABLE
# - part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet
# - contains letters d,e

query II
SELECT letter, number
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
----
d 4
e 5
a 1
b 2
c 3

mode skip

# Demo delta_file_number parameter (i.e. Delta extension provided)
query III
SELECT letter, number, delta_file_number
FROM delta_scan('delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1)
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1)
----
d 4 0
e 5 0
Expand Down
17 changes: 15 additions & 2 deletions test/sql/deltatable_with_dv.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ require deltatable

# Simplest example
query I
SELECT * FROM delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small')
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/table-with-dv-small/')
----
1
2
Expand All @@ -19,4 +19,17 @@ SELECT * FROM delta_scan('delta-kernel-rs/kernel/tests/data/table-with-dv-small'
7
8

# TODO: test with laaarge data with dv's
# With filter: ensures the deletion vector is applied properly on top of pushed down filters
query I
FROM delta_scan('file:///Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/table-with-dv-small/')
WHERE value > 3
----
4
5
6
7
8

# TODO: test with large data with deletion vectors
# TODO: test with delta_file_number option
# TODO: test with file_row_number option: ensure we don't enable the extra DataChunk reference step in the parquet scanner

0 comments on commit 854fa95

Please sign in to comment.