From 9922008b64514cda481dc41e76585551bc816c32 Mon Sep 17 00:00:00 2001 From: Luc Rancourt Date: Fri, 27 Oct 2023 13:25:11 +0200 Subject: [PATCH] Fragment metadata: adding get_tile_metadata. This adds the ability to get the tile metadata to later be used by the aggregates pipeline. --- TYPE: IMPROVEMENT DESC: Fragment metadata: adding get_tile_metadata. --- test/src/unit-tile-metadata.cc | 309 ++++++++++++++++++++++-- tiledb/sm/fragment/fragment_metadata.cc | 69 +++++- tiledb/sm/fragment/fragment_metadata.h | 14 ++ 3 files changed, 375 insertions(+), 17 deletions(-) diff --git a/test/src/unit-tile-metadata.cc b/test/src/unit-tile-metadata.cc index 5464cffb1896..ce1c8f99d97c 100644 --- a/test/src/unit-tile-metadata.cc +++ b/test/src/unit-tile-metadata.cc @@ -35,6 +35,7 @@ #include "test/support/src/helpers.h" #include "tiledb/sm/c_api/tiledb_struct_def.h" #include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/query/readers/aggregators/tile_metadata.h" #include "tiledb/sm/tile/tile_metadata_generator.h" using namespace tiledb; @@ -296,6 +297,7 @@ struct CPPFixedTileMetadataFx { bool has_coords = layout != TILEDB_ROW_MAJOR; if (has_coords) { std::vector names{"d"}; + frag_meta[f]->load_rtree(enc_key); frag_meta[f]->load_tile_min_values(enc_key, names); frag_meta[f]->load_tile_max_values(enc_key, names); frag_meta[f]->load_tile_sum_values(enc_key, names); @@ -347,6 +349,12 @@ struct CPPFixedTileMetadataFx { // Validate sum. auto sum = frag_meta[f]->get_tile_sum("d", tile_idx); CHECK(*(int64_t*)sum == correct_sum); + + // Validate the full tile data structure. + auto full_tile_data = frag_meta[f]->get_tile_metadata("d", tile_idx); + CHECK(correct_min == full_tile_data.min_as()); + CHECK(correct_max == full_tile_data.max_as()); + CHECK(correct_sum == full_tile_data.sum_as()); } } } @@ -483,11 +491,12 @@ struct CPPFixedTileMetadataFx { // For strings, the index is stored in a signed value, switch to // the index to unsigned. 
- int64_t idx = (int64_t)correct_tile_mins_[f][tile_idx] - - (int64_t)std::numeric_limits::min(); + int64_t min_idx = (int64_t)correct_tile_mins_[f][tile_idx] - + (int64_t)std::numeric_limits::min(); CHECK( 0 == - strncmp(min.data(), string_ascii_[idx].c_str(), cell_val_num)); + strncmp( + min.data(), string_ascii_[min_idx].c_str(), cell_val_num)); // Validate max. const auto max = @@ -496,11 +505,12 @@ struct CPPFixedTileMetadataFx { // For strings, the index is stored in a signed value, switch to // the index to unsigned. - idx = (int64_t)correct_tile_maxs_[f][tile_idx] - - (int64_t)std::numeric_limits::min(); + int64_t max_idx = (int64_t)correct_tile_maxs_[f][tile_idx] - + (int64_t)std::numeric_limits::min(); CHECK( 0 == - strncmp(max.data(), string_ascii_[idx].c_str(), cell_val_num)); + strncmp( + max.data(), string_ascii_[max_idx].c_str(), cell_val_num)); // Validate no sum. CHECK_THROWS_WITH( @@ -508,6 +518,16 @@ struct CPPFixedTileMetadataFx { "FragmentMetadata: Trying to access tile sum metadata that's " "not " "present"); + + // Validate the full tile data structure. + auto full_tile_data = + frag_meta[f]->get_tile_metadata("a", tile_idx); + CHECK( + string_ascii_[min_idx] == + full_tile_data.min_as()); + CHECK( + string_ascii_[max_idx] == + full_tile_data.max_as()); } } else { (void)cell_val_num; @@ -528,13 +548,29 @@ struct CPPFixedTileMetadataFx { memcmp( &max, &correct_tile_maxs_[f][tile_idx], sizeof(TestType))); + // Validate the full tile data structure. + auto full_tile_data = + frag_meta[f]->get_tile_metadata("a", tile_idx); + CHECK( + correct_tile_mins_[f][tile_idx] == + full_tile_data.min_as()); + CHECK( + correct_tile_maxs_[f][tile_idx] == + full_tile_data.max_as()); + if constexpr (!std::is_same::value) { // Validate sum. 
auto sum = frag_meta[f]->get_tile_sum("a", tile_idx); if constexpr (std::is_integral_v) { CHECK(*(int64_t*)sum == correct_tile_sums_int_[f][tile_idx]); + CHECK( + correct_tile_sums_int_[f][tile_idx] == + full_tile_data.sum_as()); } else { CHECK(*(double*)sum == correct_tile_sums_double_[f][tile_idx]); + CHECK( + correct_tile_sums_double_[f][tile_idx] == + full_tile_data.sum_as()); } } } @@ -555,6 +591,20 @@ struct CPPFixedTileMetadataFx { } } + if constexpr (!std::is_same::value) { + // Validate the full tile data structure for null count + for (uint64_t tile_idx = 0; tile_idx < num_tiles_; tile_idx++) { + auto full_tile_data = frag_meta[f]->get_tile_metadata("a", tile_idx); + if (nullable) { + CHECK( + full_tile_data.null_count() == + correct_tile_null_counts_[f][tile_idx]); + } else { + CHECK(full_tile_data.null_count() == 0); + } + } + } + // Close array. rc = tiledb_array_close(ctx, array); CHECK(rc == TILEDB_OK); @@ -819,6 +869,7 @@ struct CPPVarTileMetadataFx { bool has_coords = layout != TILEDB_ROW_MAJOR; if (has_coords) { std::vector names{"d"}; + frag_meta[f]->load_rtree(enc_key); frag_meta[f]->load_tile_min_values(enc_key, names); frag_meta[f]->load_tile_max_values(enc_key, names); frag_meta[f]->load_tile_sum_values(enc_key, names); @@ -870,6 +921,12 @@ struct CPPVarTileMetadataFx { // Validate sum. auto sum = frag_meta[f]->get_tile_sum("d", tile_idx); CHECK(*(int64_t*)sum == correct_sum); + + // Validate the full tile data structure. + auto full_tile_data = frag_meta[f]->get_tile_metadata("d", tile_idx); + CHECK(correct_min == full_tile_data.min_as()); + CHECK(correct_max == full_tile_data.max_as()); + CHECK(correct_sum == full_tile_data.sum_as()); } } } @@ -929,26 +986,35 @@ struct CPPVarTileMetadataFx { // Validate min. 
const auto min = frag_meta[f]->get_tile_min_as("a", tile_idx); - int idx = correct_tile_mins_[f][tile_idx]; - CHECK(min.size() == strings_[idx].size()); + int min_idx = correct_tile_mins_[f][tile_idx]; + CHECK(min.size() == strings_[min_idx].size()); CHECK( - 0 == - strncmp(min.data(), strings_[idx].c_str(), strings_[idx].size())); + 0 == strncmp( + min.data(), + strings_[min_idx].c_str(), + strings_[min_idx].size())); // Validate max. const auto max = frag_meta[f]->get_tile_max_as("a", tile_idx); - idx = correct_tile_maxs_[f][tile_idx]; - CHECK(max.size() == strings_[idx].size()); + int max_idx = correct_tile_maxs_[f][tile_idx]; + CHECK(max.size() == strings_[max_idx].size()); CHECK( - 0 == - strncmp(max.data(), strings_[idx].c_str(), strings_[idx].size())); + 0 == strncmp( + max.data(), + strings_[max_idx].c_str(), + strings_[max_idx].size())); // Validate no sum. CHECK_THROWS_WITH( frag_meta[f]->get_tile_sum("a", tile_idx), "FragmentMetadata: Trying to access tile sum metadata that's not " "present"); + + // Validate the full tile data structure. + auto full_tile_data = frag_meta[f]->get_tile_metadata("a", tile_idx); + CHECK(strings_[min_idx] == full_tile_data.min_as()); + CHECK(strings_[max_idx] == full_tile_data.max_as()); } } @@ -1185,6 +1251,15 @@ struct CPPFixedTileMetadataPartialFx { // Validate sum. auto sum = frag_meta[0]->get_tile_sum("a", tile_idx); CHECK(*(double*)sum - correct_tile_sums[tile_idx] < 0.0001); + + // Validate the full tile data structure. + auto full_tile_data = frag_meta[0]->get_tile_metadata("a", tile_idx); + CHECK(correct_tile_mins[tile_idx] == full_tile_data.min_as()); + CHECK(correct_tile_maxs[tile_idx] == full_tile_data.max_as()); + CHECK( + std::abs( + correct_tile_sums[tile_idx] - full_tile_data.sum_as()) < + 0.0001); } // Close array. 
@@ -1244,7 +1319,7 @@ struct CPPVarTileMetadataPartialFx { tiledb_domain_add_dimension(ctx, domain, d1); tiledb_domain_add_dimension(ctx, domain, d2); - // Create a single attribute "a" so each (i,j) cell can store an integer + // Create a single attribute "a" so each (i,j) cell can store a string tiledb_attribute_t* a; tiledb_attribute_alloc(ctx, "a", TILEDB_STRING_ASCII, &a); tiledb_attribute_set_cell_val_num(ctx, a, TILEDB_VAR_NUM); @@ -1350,6 +1425,15 @@ struct CPPVarTileMetadataPartialFx { CHECK( 0 == memcmp(max.data(), correct_tile_maxs[tile_idx].data(), max.size())); + + // Validate the full tile data structure. + auto full_tile_data = frag_meta[0]->get_tile_metadata("a", tile_idx); + CHECK( + correct_tile_mins[tile_idx] == + full_tile_data.min_as()); + CHECK( + correct_tile_maxs[tile_idx] == + full_tile_data.max_as()); } // Close array. @@ -1375,4 +1459,199 @@ TEST_CASE_METHOD( create_array(); write_fragment(); check_metadata(); +} + +struct CPPTileMetadataStringDimFx { + CPPTileMetadataStringDimFx() + : vfs_(ctx_) { + if (vfs_.is_dir(ARRAY_NAME)) + vfs_.remove_dir(ARRAY_NAME); + } + + ~CPPTileMetadataStringDimFx() { + if (vfs_.is_dir(ARRAY_NAME)) + vfs_.remove_dir(ARRAY_NAME); + } + + void create_array() { + // Create TileDB context + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + // The array will have two string dimensions, "d1" and "d2".
+ tiledb_dimension_t* d1; + tiledb_dimension_alloc(ctx, "d1", TILEDB_STRING_ASCII, 0, 0, &d1); + tiledb_dimension_t* d2; + tiledb_dimension_alloc(ctx, "d2", TILEDB_STRING_ASCII, 0, 0, &d2); + + // Create domain + tiledb_domain_t* domain; + tiledb_domain_alloc(ctx, &domain); + tiledb_domain_add_dimension(ctx, domain, d1); + tiledb_domain_add_dimension(ctx, domain, d2); + + // Create a single attribute "a" so each (i,j) cell can store a double + tiledb_attribute_t* a; + tiledb_attribute_alloc(ctx, "a", TILEDB_FLOAT64, &a); + + // Create array schema + tiledb_array_schema_t* array_schema; + tiledb_array_schema_alloc(ctx, TILEDB_SPARSE, &array_schema); + tiledb_array_schema_set_cell_order(ctx, array_schema, TILEDB_ROW_MAJOR); + tiledb_array_schema_set_tile_order(ctx, array_schema, TILEDB_ROW_MAJOR); + tiledb_array_schema_set_domain(ctx, array_schema, domain); + tiledb_array_schema_add_attribute(ctx, array_schema, a); + + // Create array + tiledb_array_create(ctx, ARRAY_NAME, array_schema); + + // Clean up + tiledb_attribute_free(&a); + tiledb_dimension_free(&d1); + tiledb_dimension_free(&d2); + tiledb_domain_free(&domain); + tiledb_array_schema_free(&array_schema); + tiledb_ctx_free(&ctx); + } + + void write_fragment() { + // Write to the array. + auto array = tiledb::Array(ctx_, ARRAY_NAME, TILEDB_WRITE); + auto query = tiledb::Query(ctx_, array, TILEDB_WRITE); + + std::string d1 = "abbcccdddd"; + std::vector d1_offsets{0, 1, 3, 6}; + std::string d2 = "abcd"; + std::vector d2_offsets{0, 1, 2, 3}; + std::vector a{4, 5, 6, 7}; + query.set_layout(TILEDB_UNORDERED); + query.set_data_buffer("d1", d1).set_offsets_buffer("d1", d1_offsets); + query.set_data_buffer("d2", d2).set_offsets_buffer("d2", d2_offsets); + query.set_data_buffer("a", a); + + query.submit(); + query.finalize(); + array.close(); + } + + void check_metadata() { + // Open array. 
+ tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + tiledb_array_t* array; + int rc = tiledb_array_alloc(ctx, ARRAY_NAME, &array); + CHECK(rc == TILEDB_OK); + rc = tiledb_array_open(ctx, array, TILEDB_READ); + CHECK(rc == TILEDB_OK); + + // Load fragment metadata. + auto frag_meta = array->array_->fragment_metadata(); + auto& enc_key = array->array_->get_encryption_key(); + frag_meta[0]->load_fragment_min_max_sum_null_count(enc_key); + + // Do fragment metadata first. + { + // Validate mins. + auto& min = frag_meta[0]->get_min("a"); + double correct_min = 4; + CHECK(min.size() == sizeof(double)); + CHECK(0 == memcmp(min.data(), &correct_min, min.size())); + + CHECK_THROWS_WITH( + frag_meta[0]->get_min("d1"), + "FragmentMetadata: Trying to access fragment min metadata that's " + "not present"); + + CHECK_THROWS_WITH( + frag_meta[0]->get_min("d2"), + "FragmentMetadata: Trying to access fragment min metadata that's " + "not present"); + + // Validate maxs. + auto& max = frag_meta[0]->get_max("a"); + double correct_max = 7; + CHECK(max.size() == sizeof(double)); + CHECK(0 == memcmp(max.data(), &correct_max, max.size())); + + CHECK_THROWS_WITH( + frag_meta[0]->get_max("d1"), + "FragmentMetadata: Trying to access fragment max metadata that's " + "not present"); + + CHECK_THROWS_WITH( + frag_meta[0]->get_max("d2"), + "FragmentMetadata: Trying to access fragment max metadata that's " + "not present"); + } + + // Load metadata. + std::vector names{"a", "d1", "d2"}; + frag_meta[0]->load_rtree(enc_key); + frag_meta[0]->load_tile_min_values(enc_key, names); + frag_meta[0]->load_tile_max_values(enc_key, names); + frag_meta[0]->load_tile_sum_values(enc_key, names); + frag_meta[0]->load_tile_null_count_values(enc_key, names); + + // Validate min. 
+ CHECK(frag_meta[0]->get_tile_min_as("a", 0) == 4); + CHECK_THROWS_WITH( + frag_meta[0]->get_tile_min_as("d1", 0), + "FragmentMetadata: Trying to access tile min metadata that's not " + "present"); + CHECK_THROWS_WITH( + frag_meta[0]->get_tile_min_as("d2", 0), + "FragmentMetadata: Trying to access tile min metadata that's not " + "present"); + + // Validate max. + CHECK(frag_meta[0]->get_tile_max_as("a", 0) == 7); + CHECK_THROWS_WITH( + frag_meta[0]->get_tile_max_as("d1", 0), + "FragmentMetadata: Trying to access tile max metadata that's not " + "present"); + CHECK_THROWS_WITH( + frag_meta[0]->get_tile_max_as("d2", 0), + "FragmentMetadata: Trying to access tile max metadata that's not " + "present"); + + // Validate sum. + CHECK(*(double*)frag_meta[0]->get_tile_sum("a", 0) == 22); + + // Validate the full tile data structure. + auto full_tile_data_a = frag_meta[0]->get_tile_metadata("a", 0); + CHECK(4 == full_tile_data_a.min_as()); + CHECK(7 == full_tile_data_a.max_as()); + CHECK(22 == full_tile_data_a.sum_as()); + + auto full_tile_data_d1 = frag_meta[0]->get_tile_metadata("d1", 0); + CHECK("a" == full_tile_data_d1.min_as()); + CHECK("dddd" == full_tile_data_d1.max_as()); + + auto full_tile_data_d2 = frag_meta[0]->get_tile_metadata("d2", 0); + CHECK("a" == full_tile_data_d2.min_as()); + CHECK("d" == full_tile_data_d2.max_as()); + + // Close array. + rc = tiledb_array_close(ctx, array); + CHECK(rc == TILEDB_OK); + + // Clean up. + tiledb_array_free(&array); + tiledb_ctx_free(&ctx); + } + + const char* ARRAY_NAME = "tile_metadata_unit_array"; + const uint64_t tile_extent_ = 4; + tiledb::Context ctx_; + tiledb::VFS vfs_; +}; + +TEST_CASE_METHOD( + CPPTileMetadataStringDimFx, + "TileMetadata: string dims", + "[tile-metadata][string-dims]") { + // Create the array. 
+ create_array(); + write_fragment(); + check_metadata(); } \ No newline at end of file diff --git a/tiledb/sm/fragment/fragment_metadata.cc b/tiledb/sm/fragment/fragment_metadata.cc index 85980985d45c..8ca2427b9bec 100644 --- a/tiledb/sm/fragment/fragment_metadata.cc +++ b/tiledb/sm/fragment/fragment_metadata.cc @@ -46,6 +46,7 @@ #include "tiledb/sm/misc/constants.h" #include "tiledb/sm/misc/parallel_functions.h" #include "tiledb/sm/misc/utils.h" +#include "tiledb/sm/query/readers/aggregators/tile_metadata.h" #include "tiledb/sm/stats/global_stats.h" #include "tiledb/sm/storage_manager/storage_manager.h" #include "tiledb/sm/tile/generic_tile_io.h" @@ -1707,7 +1708,11 @@ T FragmentMetadata::get_tile_min_as( auto size = array_schema_->cell_size(name); const void* min = &tile_min_buffer_[idx][tile_idx * size]; - return *static_cast(min); + if constexpr (std::is_same::value) { + return min; + } else { + return *static_cast(min); + } } template <> @@ -1787,7 +1792,11 @@ T FragmentMetadata::get_tile_max_as( auto size = array_schema_->cell_size(name); const void* max = &tile_max_buffer_[idx][tile_idx * size]; - return *static_cast(max); + if constexpr (std::is_same::value) { + return max; + } else { + return *static_cast(max); + } } template <> @@ -1960,6 +1969,62 @@ uint64_t FragmentMetadata::get_null_count(const std::string& name) { return fragment_null_counts_[idx]; } +TileMetadata FragmentMetadata::get_tile_metadata( + const std::string& name, const uint64_t tile_idx) const { + auto var_size = array_schema_->var_size(name); + auto is_dim = array_schema_->is_dim(name); + auto count = cell_num(tile_idx); + + if (name == constants::count_of_rows) { + return {count, 0, nullptr, 0, nullptr, 0, nullptr}; + } + + uint64_t null_count = 0; + if (array_schema_->is_nullable(name)) { + null_count = get_tile_null_count(name, tile_idx); + } + + unsigned dim_idx = 0; + const NDRange* mbr = nullptr; + if (is_dim) { + throw_if_not_ok( + 
array_schema_->domain().get_dimension_index(name, &dim_idx)); + mbr = &rtree_.leaf(tile_idx); + } + + if (var_size) { + std::string_view min = + is_dim ? mbr->at(dim_idx).start_str() : + get_tile_min_as(name, tile_idx); + std::string_view max = + is_dim ? mbr->at(dim_idx).end_str() : + get_tile_max_as(name, tile_idx); + return { + count, + null_count, + min.data(), + min.size(), + max.data(), + max.size(), + nullptr}; + } else { + auto cell_size = array_schema_->cell_size(name); + const void* min = is_dim ? mbr->at(dim_idx).start_fixed() : + get_tile_min_as(name, tile_idx); + const void* max = is_dim ? mbr->at(dim_idx).end_fixed() : + get_tile_max_as(name, tile_idx); + + const auto type = array_schema_->type(name); + const auto cell_val_num = array_schema_->cell_val_num(name); + void* sum = nullptr; + if (TileMetadataGenerator::has_sum_metadata(type, false, cell_val_num)) { + sum = const_cast(get_tile_sum(name, tile_idx)); + } + + return {count, null_count, min, cell_size, max, cell_size, sum}; + } +} + void FragmentMetadata::set_processed_conditions( std::vector& processed_conditions) { processed_conditions_ = processed_conditions; diff --git a/tiledb/sm/fragment/fragment_metadata.h b/tiledb/sm/fragment/fragment_metadata.h index 504914c16065..3b4e822b888f 100644 --- a/tiledb/sm/fragment/fragment_metadata.h +++ b/tiledb/sm/fragment/fragment_metadata.h @@ -59,6 +59,7 @@ namespace sm { class ArraySchema; class Buffer; class EncryptionKey; +class TileMetadata; class MemoryTracker; /** Stores the metadata structures of a fragment. */ @@ -289,6 +290,10 @@ class FragmentMetadata { return has_delete_meta_; } + inline bool has_tile_metadata() { + return version_ >= constants::tile_metadata_min_version; + } + /** Returns the sizes of each attribute file. */ inline const std::vector& file_sizes() const { return file_sizes_; @@ -899,6 +904,15 @@ class FragmentMetadata { */ uint64_t get_null_count(const std::string& name); + /** + * Returns the tile metadata for a tile. 
+ * + * @param name Name of the attribute or dimension to get the data for. + * @param tile_idx Tile index. + */ + TileMetadata get_tile_metadata( + const std::string& name, const uint64_t tile_idx) const; + /** * Set the processed conditions. The processed conditions is the list * of delete/update conditions that have already been applied for this